valentin_le_moign
/
figureslibres.cc
geforkt von bachir/figureslibres.cc


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281
							<?php

namespace PicoFeed\Scraper;

use DomDocument;
use DOMXPath;
use PicoFeed\Logging\Logger;
use PicoFeed\Parser\XmlParser;

/**
 * Candidate Parser.
 *
 * Extracts the most relevant part of an HTML page by probing a list of
 * well-known class/id names, then falling back to the first <article/> and
 * finally to <body/>. The selected fragment is cleaned from blacklisted tags
 * and from elements whose class/id contains a blacklisted term.
 *
 * @author  Frederic Guillot
 */
class CandidateParser implements ParserInterface
{
    /**
     * Document built from the HTML given to the constructor.
     */
    private $dom;

    /**
     * XPath evaluator bound to $dom.
     */
    private $xpath;

    /**
     * List of attributes to try to get the content, order is important, generic terms at the end.
     *
     * @var array
     */
    private $candidatesAttributes = array(
        'articleBody',
        'articlebody',
        'article-body',
        'articleContent',
        'articlecontent',
        'article-content',
        'articlePage',
        'post-content',
        'post_content',
        'entry-content',
        'entry-body',
        'main-content',
        'story_content',
        'storycontent',
        'entryBox',
        'entrytext',
        'comic',
        'post',
        'article',
        'content',
        'main',
    );

    /**
     * List of attributes to strip.
     *
     * An element is a removal candidate when its class or id CONTAINS one of
     * these terms (substring match, see stripAttributes()).
     *
     * @var array
     */
    private $stripAttributes = array(
        'comment',
        'share',
        'links',
        'toolbar',
        'fb',
        'footer',
        'credit',
        'bottom',
        'nav',
        'header',
        'social',
        'tag',
        'metadata',
        'entry-utility',
        'related-posts',
        'tweet',
        'categories',
        'post_title',
        'by_line',
        'byline',
        'sponsors',
    );

    /**
     * Tags to remove.
     *
     * @var array
     */
    private $stripTags = array(
        'nav',
        'header',
        'footer',
        'aside',
        'form',
    );

    /**
     * Constructor.
     *
     * The HTML is prefixed with an XML declaration announcing UTF-8 before it
     * is handed to XmlParser::getHtmlDocument().
     *
     * @param string $html
     */
    public function __construct($html)
    {
        $this->dom = XmlParser::getHtmlDocument('<?xml version="1.0" encoding="UTF-8">'.$html);
        $this->xpath = new DOMXPath($this->dom);
    }

    /**
     * Get the relevant content with the list of potential attributes.
     *
     * Strategy: use the first matching candidate; if that yields fewer than
     * 200 bytes, use the first <article/> instead; if the result is still
     * shorter than 50 bytes, use <body/>. The selected markup is then passed
     * through stripGarbage().
     *
     * @return string
     */
    public function execute()
    {
        $content = $this->findContentWithCandidates();

        if (strlen($content) < 200) {
            $content = $this->findContentWithArticle();
        }

        if (strlen($content) < 50) {
            $content = $this->findContentWithBody();
        }

        return $this->stripGarbage($content);
    }

    /**
     * Find content based on the list of tag candidates.
     *
     * For each candidate, in order, looks for an element whose class contains
     * the candidate or whose id equals it, ignoring elements whose class
     * contains "nav" or "page". The first element of the first successful
     * candidate is returned.
     *
     * @return string Serialized element, or an empty string when nothing matches
     */
    public function findContentWithCandidates()
    {
        foreach ($this->candidatesAttributes as $candidate) {
            Logger::setMessage(get_called_class().': Try this candidate: "'.$candidate.'"');

            $content = $this->saveFirstMatch(
                '//*[(contains(@class, "'.$candidate.'") or @id="'.$candidate.'") and not (contains(@class, "nav") or contains(@class, "page"))]',
                get_called_class().': Find candidate "'.$candidate.'"'
            );

            if ($content !== '') {
                return $content;
            }
        }

        return '';
    }

    /**
     * Find <article/> tag.
     *
     * @return string Serialized first <article/>, or an empty string when there is none
     */
    public function findContentWithArticle()
    {
        return $this->saveFirstMatch('//article', get_called_class().': Find <article/> tag');
    }

    /**
     * Find <body/> tag.
     *
     * @return string Serialized first <body/>, or an empty string when there is none
     */
    public function findContentWithBody()
    {
        return $this->saveFirstMatch('//body', get_called_class().' Find <body/>');
    }

    /**
     * Strip useless tags.
     *
     * When XmlParser::getDomDocument() returns false the content is returned
     * untouched. When the stripping removed the root element itself, an empty
     * string is returned.
     *
     * @param string $content
     * @return string
     */
    public function stripGarbage($content)
    {
        $dom = XmlParser::getDomDocument($content);

        if ($dom === false) {
            return $content;
        }

        $xpath = new DOMXPath($dom);

        $this->stripTags($xpath);
        $this->stripAttributes($dom, $xpath);

        // The root element can be stripped too (e.g. the fragment is itself a
        // blacklisted tag, or it has no text so shouldRemove() always agrees).
        // saveXML(null) would then dump the whole, now empty, document, i.e.
        // a bare XML declaration; report "nothing left" instead.
        if ($dom->documentElement === null) {
            return '';
        }

        return $dom->saveXML($dom->documentElement);
    }

    /**
     * Remove blacklisted tags.
     *
     * @param DOMXPath $xpath
     */
    public function stripTags(DOMXPath $xpath)
    {
        foreach ($this->stripTags as $tag) {
            $nodes = $xpath->query('//'.$tag);

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class().': Strip tag: "'.$tag.'"');

                foreach ($nodes as $node) {
                    $this->detach($node);
                }
            }
        }
    }

    /**
     * Remove blacklisted attributes.
     *
     * Every element whose class or id contains a blacklisted term is removed,
     * unless shouldRemove() vetoes it.
     *
     * @param DomDocument $dom
     * @param DOMXPath    $xpath
     */
    public function stripAttributes(DomDocument $dom, DOMXPath $xpath)
    {
        foreach ($this->stripAttributes as $attribute) {
            $nodes = $xpath->query('//*[contains(@class, "'.$attribute.'") or contains(@id, "'.$attribute.'")]');

            if ($nodes !== false && $nodes->length > 0) {
                Logger::setMessage(get_called_class().': Strip attribute: "'.$attribute.'"');

                foreach ($nodes as $node) {
                    if ($this->shouldRemove($dom, $node)) {
                        $this->detach($node);
                    }
                }
            }
        }
    }

    /**
     * Find link for next page of the article.
     *
     * This parser does not look for a next page link.
     *
     * @return string|null Always null
     */
    public function findNextLink()
    {
        return null;
    }

    /**
     * Return false if the node should not be removed.
     *
     * A node holding 90% or more of the document text (measured in bytes) is
     * kept. A document without any text never vetoes the removal.
     *
     * @param DomDocument $dom
     * @param \DomNode    $node
     * @return bool
     */
    public function shouldRemove(DomDocument $dom, $node)
    {
        $document_length = strlen($dom->textContent);
        $node_length = strlen($node->textContent);

        // Nothing to compare against, and avoids a division by zero below.
        if ($document_length === 0) {
            return true;
        }

        $ratio = $node_length * 100 / $document_length;

        if ($ratio >= 90) {
            Logger::setMessage(get_called_class().': Should not remove this node ('.$node->nodeName.') ratio: '.$ratio.'%');

            return false;
        }

        return true;
    }

    /**
     * Serialize the first node matching an XPath query on the parsed page.
     *
     * @param string $query   XPath expression evaluated against the page
     * @param string $message Message logged only when the query matches
     * @return string Result of saveXML() for the first match, or an empty string when nothing matches
     */
    private function saveFirstMatch($query, $message)
    {
        $nodes = $this->xpath->query($query);

        if ($nodes === false || $nodes->length === 0) {
            return '';
        }

        Logger::setMessage($message);

        return $this->dom->saveXML($nodes->item(0));
    }

    /**
     * Detach a node from its parent, ignoring nodes that have no parent.
     *
     * @param \DomNode $node
     */
    private function detach($node)
    {
        if ($node->parentNode !== null) {
            $node->parentNode->removeChild($node);
        }
    }
}