FeedsXPathParserHTML.inc 1.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. <?php
  2. /**
  3. * @file
  4. * Provides the FeedsXPathParserHTML class.
  5. */
  class FeedsXPathParserHTML extends FeedsXPathParserBase {

    /**
     * Implements FeedsXPathParserBase::setup().
     *
     * Builds a DOMDocument from the fetched HTML, optionally repairing the
     * markup with the tidy extension before parsing.
     *
     * @param array $source_config
     *   The source configuration. This method reads
     *   $source_config['exp']['tidy'], $source_config['exp']['tidy_encoding']
     *   and $source_config['exp']['errors'].
     * @param FeedsFetcherResult $fetcher_result
     *   The fetcher result whose raw content is parsed.
     *
     * @return DOMDocument
     *   The parsed document.
     *
     * @throws Exception
     *   If DOMDocument::loadHTML() reports failure.
     */
    protected function setup($source_config, FeedsFetcherResult $fetcher_result) {
      // Only run tidy when it was requested AND the extension is available;
      // calling tidy_repair_string() without ext-tidy is a fatal error.
      if (!empty($source_config['exp']['tidy']) && extension_loaded('tidy')) {
        $config = array(
          'merge-divs' => FALSE,
          'merge-spans' => FALSE,
          'join-styles' => FALSE,
          'drop-empty-paras' => FALSE,
          'wrap' => 0,
          'tidy-mark' => FALSE,
          'escape-cdata' => TRUE,
          'word-2000' => TRUE,
        );
        // Default tidy encoding is UTF8; apply it when none is configured.
        $encoding = 'UTF8';
        if (!empty($source_config['exp']['tidy_encoding'])) {
          $encoding = $source_config['exp']['tidy_encoding'];
        }
        $raw = tidy_repair_string(trim($fetcher_result->getRaw()), $config, $encoding);
      }
      else {
        $raw = $fetcher_result->getRaw();
      }

      $doc = new DOMDocument();
      // Use our own error handling. errorStart()/errorStop() are provided by
      // the parent class; the value returned by errorStart() is handed back to
      // errorStop() unchanged.
      $use = $this->errorStart();
      $success = $doc->loadHTML($raw);
      // The raw string is no longer needed once it has been parsed.
      unset($raw);
      $this->errorStop($use, $source_config['exp']['errors']);

      if (!$success) {
        throw new Exception(t('There was an error parsing the HTML document.'));
      }
      return $doc;
    }

    /**
     * Implements FeedsXPathParserBase::getRaw().
     *
     * Serializes a single node of the parsed document to a string.
     *
     * @param DOMNode $node
     *   The node to serialize.
     *
     * @return string
     *   The markup for $node: HTML on PHP 5.3.6 and later, XML on older
     *   versions.
     */
    protected function getRaw(DOMNode $node) {
      // DOMDocument::saveHTML() cannot take $node as an argument prior to 5.3.6.
      if (version_compare(phpversion(), '5.3.6', '>=')) {
        return $this->doc->saveHTML($node);
      }
      // Older PHP: fall back to XML serialization, which does accept a node.
      return $this->doc->saveXML($node);
    }

  }