CandidateParser.php 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
  1. <?php
  2. namespace PicoFeed\Scraper;
  3. use DomDocument;
  4. use DOMXPath;
  5. use PicoFeed\Logging\Logger;
  6. use PicoFeed\Parser\XmlParser;
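/**
 * Candidate Parser.
 *
 * Looks for the main content of an HTML page by trying, in order: a list of
 * well-known class/id names, the first <article/> element, and finally the
 * <body/> element. Blacklisted tags and blacklisted class/id matches are then
 * stripped from the selected fragment.
 *
 * @author Frederic Guillot
 */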
  12. class CandidateParser implements ParserInterface
  13. {
  14. private $dom;
  15. private $xpath;
  16. /**
  17. * List of attributes to try to get the content, order is important, generic terms at the end.
  18. *
  19. * @var array
  20. */
  21. private $candidatesAttributes = array(
  22. 'articleBody',
  23. 'articlebody',
  24. 'article-body',
  25. 'articleContent',
  26. 'articlecontent',
  27. 'article-content',
  28. 'articlePage',
  29. 'post-content',
  30. 'post_content',
  31. 'entry-content',
  32. 'entry-body',
  33. 'main-content',
  34. 'story_content',
  35. 'storycontent',
  36. 'entryBox',
  37. 'entrytext',
  38. 'comic',
  39. 'post',
  40. 'article',
  41. 'content',
  42. 'main',
  43. );
  44. /**
  45. * List of attributes to strip.
  46. *
  47. * @var array
  48. */
  49. private $stripAttributes = array(
  50. 'comment',
  51. 'share',
  52. 'links',
  53. 'toolbar',
  54. 'fb',
  55. 'footer',
  56. 'credit',
  57. 'bottom',
  58. 'nav',
  59. 'header',
  60. 'social',
  61. 'tag',
  62. 'metadata',
  63. 'entry-utility',
  64. 'related-posts',
  65. 'tweet',
  66. 'categories',
  67. 'post_title',
  68. 'by_line',
  69. 'byline',
  70. 'sponsors',
  71. );
  72. /**
  73. * Tags to remove.
  74. *
  75. * @var array
  76. */
  77. private $stripTags = array(
  78. 'nav',
  79. 'header',
  80. 'footer',
  81. 'aside',
  82. 'form',
  83. );
  84. /**
  85. * Constructor.
  86. *
  87. * @param string $html
  88. */
  89. public function __construct($html)
  90. {
  91. $this->dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);
  92. $this->xpath = new DOMXPath($this->dom);
  93. }
  94. /**
  95. * Get the relevant content with the list of potential attributes.
  96. *
  97. * @return string
  98. */
  99. public function execute()
  100. {
  101. $content = $this->findContentWithCandidates();
  102. if (strlen($content) < 200) {
  103. $content = $this->findContentWithArticle();
  104. }
  105. if (strlen($content) < 50) {
  106. $content = $this->findContentWithBody();
  107. }
  108. return $this->stripGarbage($content);
  109. }
  110. /**
  111. * Find content based on the list of tag candidates.
  112. *
  113. * @return string
  114. */
  115. public function findContentWithCandidates()
  116. {
  117. foreach ($this->candidatesAttributes as $candidate) {
  118. Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');
  119. $nodes = $this->xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');
  120. if ($nodes !== false && $nodes->length > 0) {
  121. Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'"');
  122. return $this->dom->saveXML($nodes->item(0));
  123. }
  124. }
  125. return '';
  126. }
  127. /**
  128. * Find <article/> tag.
  129. *
  130. * @return string
  131. */
  132. public function findContentWithArticle()
  133. {
  134. $nodes = $this->xpath->query('//article');
  135. if ($nodes !== false && $nodes->length > 0) {
  136. Logger::setMessage(get_called_class().': Find <article/> tag');
  137. return $this->dom->saveXML($nodes->item(0));
  138. }
  139. return '';
  140. }
  141. /**
  142. * Find <body/> tag.
  143. *
  144. * @return string
  145. */
  146. public function findContentWithBody()
  147. {
  148. $nodes = $this->xpath->query('//body');
  149. if ($nodes !== false && $nodes->length > 0) {
  150. Logger::setMessage(get_called_class().' Find <body/>');
  151. return $this->dom->saveXML($nodes->item(0));
  152. }
  153. return '';
  154. }
  155. /**
  156. * Strip useless tags.
  157. *
  158. * @param string $content
  159. * @return string
  160. */
  161. public function stripGarbage($content)
  162. {
  163. $dom = XmlParser::getDomDocument($content);
  164. if ($dom !== false) {
  165. $xpath = new DOMXPath($dom);
  166. $this->stripTags($xpath);
  167. $this->stripAttributes($dom, $xpath);
  168. $content = $dom->saveXML($dom->documentElement);
  169. }
  170. return $content;
  171. }
  172. /**
  173. * Remove blacklisted tags.
  174. *
  175. * @param DOMXPath $xpath
  176. */
  177. public function stripTags(DOMXPath $xpath)
  178. {
  179. foreach ($this->stripTags as $tag) {
  180. $nodes = $xpath->query('//'.$tag);
  181. if ($nodes !== false && $nodes->length > 0) {
  182. Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');
  183. foreach ($nodes as $node) {
  184. $node->parentNode->removeChild($node);
  185. }
  186. }
  187. }
  188. }
  189. /**
  190. * Remove blacklisted attributes.
  191. *
  192. * @param DomDocument $dom
  193. * @param DOMXPath $xpath
  194. */
  195. public function stripAttributes(DomDocument $dom, DOMXPath $xpath)
  196. {
  197. foreach ($this->stripAttributes as $attribute) {
  198. $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');
  199. if ($nodes !== false && $nodes->length > 0) {
  200. Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');
  201. foreach ($nodes as $node) {
  202. if ($this->shouldRemove($dom, $node)) {
  203. $node->parentNode->removeChild($node);
  204. }
  205. }
  206. }
  207. }
  208. }
  209. /**
  210. * Find link for next page of the article.
  211. *
  212. * @return string
  213. */
  214. public function findNextLink()
  215. {
  216. return null;
  217. }
  218. /**
  219. * Return false if the node should not be removed.
  220. *
  221. * @param DomDocument $dom
  222. * @param \DomNode $node
  223. * @return bool
  224. */
  225. public function shouldRemove(DomDocument $dom, $node)
  226. {
  227. $document_length = strlen($dom->textContent);
  228. $node_length = strlen($node->textContent);
  229. if ($document_length === 0) {
  230. return true;
  231. }
  232. $ratio = $node_length * 100 / $document_length;
  233. if ($ratio >= 90) {
  234. Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');
  235. return false;
  236. }
  237. return true;
  238. }
  239. }