123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282 |
- <?php
- namespace PicoFeed\Scraper;
- use PicoFeed\Base;
- use PicoFeed\Client\Client;
- use PicoFeed\Client\ClientException;
- use PicoFeed\Client\Url;
- use PicoFeed\Encoding\Encoding;
- use PicoFeed\Filter\Filter;
- use PicoFeed\Logging\Logger;
- use PicoFeed\Parser\XmlParser;
/**
 * Scraper class.
 *
 * Downloads the page behind a URL, normalizes its encoding and extracts the
 * relevant content with a RuleParser (when a grabber rule matches the URL)
 * or a CandidateParser. When the parser reports a "next page" link, the
 * following pages are fetched as well and their content is concatenated.
 *
 * @author Frederic Guillot
 */
class Scraper extends Base
{
    /**
     * Maximum number of follow-up pages fetched when the configuration
     * does not define a limit.
     *
     * @var int
     */
    const DEFAULT_MAX_RECURSIONS = 25;

    /**
     * URL.
     *
     * @var string
     */
    private $url = '';

    /**
     * Relevant content.
     *
     * @var string
     */
    private $content = '';

    /**
     * HTML content.
     *
     * @var string
     */
    private $html = '';

    /**
     * HTML content encoding.
     *
     * @var string
     */
    private $encoding = '';

    /**
     * Flag to enable candidates parsing.
     *
     * @var bool
     */
    private $enableCandidateParser = true;

    /**
     * Disable candidates parsing.
     *
     * Once disabled, getParser() only returns a parser when a grabber rule
     * matches the URL.
     *
     * @return Scraper
     */
    public function disableCandidateParser()
    {
        $this->enableCandidateParser = false;

        return $this;
    }

    /**
     * Get encoding.
     *
     * @return string
     */
    public function getEncoding()
    {
        return $this->encoding;
    }

    /**
     * Set encoding.
     *
     * @param string $encoding
     *
     * @return Scraper
     */
    public function setEncoding($encoding)
    {
        $this->encoding = $encoding;

        return $this;
    }

    /**
     * Get URL to download.
     *
     * After a successful download() this is the URL reported by the HTTP
     * client, which replaces the one given to setUrl().
     *
     * @return string
     */
    public function getUrl()
    {
        return $this->url;
    }

    /**
     * Set URL to download.
     *
     * @param string $url URL
     *
     * @return Scraper
     */
    public function setUrl($url)
    {
        $this->url = $url;

        return $this;
    }

    /**
     * Return true if the scraper found relevant content.
     *
     * @return bool
     */
    public function hasRelevantContent()
    {
        return !empty($this->content);
    }

    /**
     * Get relevant content.
     *
     * @return string
     */
    public function getRelevantContent()
    {
        return $this->content;
    }

    /**
     * Get raw content (unfiltered).
     *
     * @return string
     */
    public function getRawContent()
    {
        return $this->html;
    }

    /**
     * Set raw content (unfiltered).
     *
     * Note that execute() and a download() with a non-empty URL both reset
     * this value before fetching the page.
     *
     * @param string $html
     *
     * @return Scraper
     */
    public function setRawContent($html)
    {
        $this->html = $html;

        return $this;
    }

    /**
     * Get filtered relevant content.
     *
     * Runs the relevant content through the HTML filter, using the current
     * URL and the scraper configuration.
     *
     * @return string
     */
    public function getFilteredContent()
    {
        $filter = Filter::html($this->content, $this->url);
        $filter->setConfig($this->config);

        return $filter->execute();
    }

    /**
     * Download the HTML content.
     *
     * On success the URL, raw HTML and encoding are replaced by the values
     * reported by the HTTP client. A ClientException is logged and turned
     * into a false return value; the state stays cleared in that case.
     *
     * @return bool True when the page was fetched, false when the URL is
     *              empty or the client failed
     */
    public function download()
    {
        if (empty($this->url)) {
            return false;
        }

        // Clear everything
        $this->html = '';
        $this->content = '';
        $this->encoding = '';

        try {
            $client = Client::getInstance();
            $client->setConfig($this->config);
            $client->setTimeout($this->config->getGrabberTimeout());
            $client->setUserAgent($this->config->getGrabberUserAgent());
            $client->execute($this->url);

            $this->url = $client->getUrl();
            $this->html = $client->getContent();
            $this->encoding = $client->getEncoding();

            return true;
        } catch (ClientException $e) {
            Logger::setMessage(get_called_class().': '.$e->getMessage());
        }

        return false;
    }

    /**
     * Execute the scraper.
     *
     * Downloads the current URL, parses it and keeps following the "next
     * page" link reported by the parser, appending the content of each page.
     * Any raw content, encoding or relevant content set beforehand is
     * discarded. The accumulated content is available afterwards through
     * getRelevantContent().
     *
     * Following stops when there is no next link, when the page limit is
     * reached, when the next link points back to the current URL or when no
     * parser is available for a page.
     *
     * @param string $pageContent    Content to prepend to the scraped content
     * @param int    $recursionDepth Number of follow-up pages already fetched
     */
    public function execute($pageContent = '', $recursionDepth = 0)
    {
        $maxRecursions = $this->config->getMaxRecursions();

        if ($maxRecursions === null) {
            $maxRecursions = self::DEFAULT_MAX_RECURSIONS;
        }

        while (true) {
            $this->html = '';
            $this->encoding = '';
            $this->content = '';

            // A failed download leaves the HTML empty; the page then simply contributes nothing.
            $this->download();
            $this->prepareHtml();

            $parser = $this->getParser();

            if ($parser === null) {
                break;
            }

            $pageContent .= $parser->execute();
            $nextLink = $parser->findNextLink();

            if ($nextLink === null || $recursionDepth >= $maxRecursions) {
                break;
            }

            $nextUrl = Url::resolve($nextLink, $this->url);

            // A link to the page we just fetched would only append the same content again.
            if ($nextUrl === $this->url) {
                break;
            }

            $this->setUrl($nextUrl);
            ++$recursionDepth;
        }

        // Stored after the loop so content gathered from earlier pages survives
        // even when the last page could not be parsed.
        $this->content = $pageContent;

        Logger::setMessage(get_called_class().': Content length: '.strlen($this->content).' bytes');
    }

    /**
     * Get the parser.
     *
     * The first grabber rule whose pattern matches the full path of the URL
     * wins. Without a matching rule the candidate parser is used, unless it
     * has been turned off with disableCandidateParser().
     *
     * @return ParserInterface|null Null when no rule matches and the
     *                              candidate parser is disabled
     */
    public function getParser()
    {
        $ruleLoader = new RuleLoader($this->config);
        $rules = $ruleLoader->getRules($this->url);

        if (!empty($rules['grabber'])) {
            Logger::setMessage(get_called_class().': Parse content with rules');

            // The path is the same for every rule, so build it once.
            $url = new Url($this->url);
            $sub_url = $url->getFullPath();

            foreach ($rules['grabber'] as $pattern => $rule) {
                if (preg_match($pattern, $sub_url)) {
                    Logger::setMessage(get_called_class().': Matched url '.$sub_url);

                    return new RuleParser($this->html, $rule);
                }
            }
        }

        if (!$this->enableCandidateParser) {
            return null;
        }

        Logger::setMessage(get_called_class().': Parse content with candidates');

        return new CandidateParser($this->html);
    }

    /**
     * Normalize encoding and strip head tag.
     *
     * The encoding declared in the HTML meta tag takes precedence over the
     * encoding reported by the HTTP client.
     */
    public function prepareHtml()
    {
        $html_encoding = XmlParser::getEncodingFromMetaTag($this->html);

        $this->html = Encoding::convert($this->html, $html_encoding ?: $this->encoding);
        $this->html = Filter::stripHeadTags($this->html);

        Logger::setMessage(get_called_class().': HTTP Encoding "'.$this->encoding.'" ; HTML Encoding "'.$html_encoding.'"');
    }
}
|