123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281 |
- <?php
- namespace PicoFeed\Scraper;
- use DomDocument;
- use DOMXPath;
- use PicoFeed\Logging\Logger;
- use PicoFeed\Parser\XmlParser;
/**
 * Candidate Parser.
 *
 * Extracts the most relevant fragment of an HTML page by probing a fixed list
 * of well-known class/id names, then falling back to the <article/> and
 * <body/> tags, and finally strips blacklisted tags and blocks from the
 * selected fragment.
 *
 * @author Frederic Guillot
 */
class CandidateParser implements ParserInterface
{
    /**
     * Document built from the HTML given to the constructor.
     *
     * @var DomDocument
     */
    private $dom;

    /**
     * XPath evaluator bound to $dom.
     *
     * @var DOMXPath
     */
    private $xpath;

    /**
     * List of attributes to try to get the content, order is important, generic terms at the end.
     *
     * @var array
     */
    private $candidatesAttributes = array(
        'articleBody',
        'articlebody',
        'article-body',
        'articleContent',
        'articlecontent',
        'article-content',
        'articlePage',
        'post-content',
        'post_content',
        'entry-content',
        'entry-body',
        'main-content',
        'story_content',
        'storycontent',
        'entryBox',
        'entrytext',
        'comic',
        'post',
        'article',
        'content',
        'main',
    );

    /**
     * List of attributes to strip.
     *
     * An element is a removal candidate when its class or id contains one of
     * these values as a substring.
     *
     * @var array
     */
    private $stripAttributes = array(
        'comment',
        'share',
        'links',
        'toolbar',
        'fb',
        'footer',
        'credit',
        'bottom',
        'nav',
        'header',
        'social',
        'tag',
        'metadata',
        'entry-utility',
        'related-posts',
        'tweet',
        'categories',
        'post_title',
        'by_line',
        'byline',
        'sponsors',
    );

    /**
     * Tags to remove.
     *
     * @var array
     */
    private $stripTags = array(
        'nav',
        'header',
        'footer',
        'aside',
        'form',
    );

    /**
     * Constructor.
     *
     * @param string $html HTML source of the page to analyse
     */
    public function __construct($html)
    {
        // NOTE(review): the XML-declaration-like prefix is presumably there to make
        // the underlying parser treat the markup as UTF-8; confirm in XmlParser.
        $this->dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);
        $this->xpath = new DOMXPath($this->dom);
    }

    /**
     * Get the relevant content with the list of potential attributes.
     *
     * Strategies are tried in order: candidate attributes first, then the
     * <article/> tag when the result is shorter than 200 bytes, then the
     * <body/> tag when the result is shorter than 50 bytes. The selected
     * fragment is cleaned with stripGarbage() before being returned.
     *
     * @return string
     */
    public function execute()
    {
        $content = $this->findContentWithCandidates();

        if (strlen($content) < 200) {
            $content = $this->findContentWithArticle();
        }

        if (strlen($content) < 50) {
            $content = $this->findContentWithBody();
        }

        return $this->stripGarbage($content);
    }

    /**
     * Find content based on the list of tag candidates.
     *
     * For each candidate, in order, looks for an element whose class contains
     * the candidate or whose id equals it, ignoring elements whose class
     * contains "nav" or "page". The first element of the first successful
     * query wins.
     *
     * @return string Serialized element, or an empty string when nothing matches
     */
    public function findContentWithCandidates()
    {
        foreach ($this->candidatesAttributes as $candidate) {
            Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');

            $nodes = $this->xpath->query('//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]');

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class().': Find candidate "'.$candidate.'"');

                return $this->dom->saveXML($nodes->item(0));
            }
        }

        return '';
    }

    /**
     * Find <article/> tag.
     *
     * @return string Serialized first <article/> element, or an empty string
     */
    public function findContentWithArticle()
    {
        $nodes = $this->xpath->query('//article');

        if ($nodes !== false && $nodes->length > 0) {
            Logger::setMessage(get_called_class().': Find <article/> tag');

            return $this->dom->saveXML($nodes->item(0));
        }

        return '';
    }

    /**
     * Find <body/> tag.
     *
     * @return string Serialized first <body/> element, or an empty string
     */
    public function findContentWithBody()
    {
        $nodes = $this->xpath->query('//body');

        if ($nodes !== false && $nodes->length > 0) {
            Logger::setMessage(get_called_class().' Find <body/>');

            return $this->dom->saveXML($nodes->item(0));
        }

        return '';
    }

    /**
     * Strip useless tags.
     *
     * Re-parses the fragment, removes blacklisted tags and blacklisted
     * class/id blocks, and serializes what is left. When the fragment cannot
     * be parsed, it is returned untouched.
     *
     * @param string $content
     * @return string Cleaned fragment, or an empty string when nothing survives the stripping
     */
    public function stripGarbage($content)
    {
        $dom = XmlParser::getDomDocument($content);

        if ($dom !== false) {
            $xpath = new DOMXPath($dom);

            $this->stripTags($xpath);
            $this->stripAttributes($dom, $xpath);

            // The root of the fragment can itself be removed (e.g. the fragment
            // is a <form/> or <header/>). saveXML(null) would then dump the whole
            // empty document, i.e. a bare XML declaration, instead of content.
            if ($dom->documentElement === null) {
                return '';
            }

            $content = $dom->saveXML($dom->documentElement);
        }

        return $content;
    }

    /**
     * Remove blacklisted tags.
     *
     * @param DOMXPath $xpath XPath evaluator bound to the document to clean
     */
    public function stripTags(DOMXPath $xpath)
    {
        foreach ($this->stripTags as $tag) {
            $nodes = $xpath->query('//'.$tag);

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');

                foreach ($nodes as $node) {
                    $node->parentNode->removeChild($node);
                }
            }
        }
    }

    /**
     * Remove blacklisted attributes.
     *
     * Removes every element whose class or id contains a blacklisted value,
     * unless shouldRemove() decides the element must be kept.
     *
     * @param DomDocument $dom   Document being cleaned
     * @param DOMXPath    $xpath XPath evaluator bound to $dom
     */
    public function stripAttributes(DomDocument $dom, DOMXPath $xpath)
    {
        foreach ($this->stripAttributes as $attribute) {
            $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');

                foreach ($nodes as $node) {
                    if ($this->shouldRemove($dom, $node)) {
                        $node->parentNode->removeChild($node);
                    }
                }
            }
        }
    }

    /**
     * Find link for next page of the article.
     *
     * This parser does not look for a next page link and always returns null.
     *
     * @return string|null
     */
    public function findNextLink()
    {
        return null;
    }

    /**
     * Return false if the node should not be removed.
     *
     * A node is kept when its text accounts for 90% or more of the text of
     * the whole document. When the document has no text at all, the node is
     * always considered removable.
     *
     * @param DomDocument $dom
     * @param \DomNode    $node
     * @return bool
     */
    public function shouldRemove(DomDocument $dom, $node)
    {
        $document_length = strlen($dom->textContent);
        $node_length = strlen($node->textContent);

        // Avoid a division by zero below.
        if ($document_length === 0) {
            return true;
        }

        $ratio = $node_length * 100 / $document_length;

        if ($ratio >= 90) {
            Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');

            return false;
        }

        return true;
    }
}
|