RuleParser.php 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. <?php
  2. namespace PicoFeed\Scraper;
  3. use DOMXPath;
  4. use PicoFeed\Parser\XmlParser;
  5. /**
  6. * Rule Parser.
  7. *
  8. * @author Frederic Guillot
  9. */
  10. class RuleParser implements ParserInterface
  11. {
  12. private $dom;
  13. private $xpath;
  14. private $rules = array();
  15. /**
  16. * Constructor.
  17. *
  18. * @param string $html
  19. * @param array $rules
  20. */
  21. public function __construct($html, array $rules)
  22. {
  23. $this->rules = $rules;
  24. $this->dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);
  25. $this->xpath = new DOMXPath($this->dom);
  26. }
  27. /**
  28. * Get the relevant content with predefined rules.
  29. *
  30. * @return string
  31. */
  32. public function execute()
  33. {
  34. $this->stripTags();
  35. return $this->findContent();
  36. }
  37. /**
  38. * Remove HTML tags.
  39. */
  40. public function stripTags()
  41. {
  42. if (isset($this->rules['strip']) && is_array($this->rules['strip'])) {
  43. foreach ($this->rules['strip'] as $pattern) {
  44. $nodes = $this->xpath->query($pattern);
  45. if ($nodes !== false && $nodes->length > 0) {
  46. foreach ($nodes as $node) {
  47. $node->parentNode->removeChild($node);
  48. }
  49. }
  50. }
  51. }
  52. }
  53. /**
  54. * Fetch content based on Xpath rules.
  55. */
  56. public function findContent()
  57. {
  58. $content = '';
  59. if (isset($this->rules['body']) && is_array($this->rules['body'])) {
  60. foreach ($this->rules['body'] as $pattern) {
  61. $nodes = $this->xpath->query($pattern);
  62. if ($nodes !== false && $nodes->length > 0) {
  63. foreach ($nodes as $node) {
  64. $content .= $this->dom->saveXML($node);
  65. }
  66. }
  67. }
  68. }
  69. return $content;
  70. }
  71. /**
  72. * Fetch next link based on Xpath rules.
  73. *
  74. * @return string
  75. */
  76. public function findNextLink()
  77. {
  78. if (isset($this->rules['next_page']) && is_array($this->rules['next_page'])) {
  79. foreach ($this->rules['next_page'] as $pattern) {
  80. $nodes = $this->xpath->query($pattern);
  81. if ($nodes !== false && $nodes->length > 0) {
  82. foreach ($nodes as $node) {
  83. return $node->getAttribute('href');
  84. }
  85. }
  86. }
  87. }
  88. return null;
  89. }
  90. }