123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596 |
- <?php
- namespace PicoFeed\Processor;
- use Closure;
- use PicoFeed\Base;
- use PicoFeed\Parser\Feed;
- use PicoFeed\Parser\Item;
- use PicoFeed\Scraper\Scraper;
/**
 * Scraper Processor
 *
 * Item processor that runs the scraper against an item's URL and, when the
 * scraper reports relevant content, replaces the item content with the
 * scraper's filtered content.
 *
 * @package PicoFeed\Processor
 * @author  Frederic Guillot
 */
class ScraperProcessor extends Base implements ItemProcessorInterface
{
    /**
     * URLs that must not be scraped
     *
     * @var array
     */
    private $ignoredUrls = array();

    /**
     * Scraper instance, created on first call to getScraper()
     *
     * @var Scraper|null
     */
    private $scraper;

    /**
     * Callback function for each scraper execution
     *
     * @var Closure|null
     */
    private $executionCallback;

    /**
     * Set the execution callback
     *
     * The callback is invoked after every scraper run with three arguments:
     * the feed, the item and the scraper instance. Setting a new callback
     * replaces the previous one.
     *
     * @access public
     * @param  Closure $executionCallback
     * @return $this
     */
    public function setExecutionCallback(Closure $executionCallback)
    {
        $this->executionCallback = $executionCallback;

        return $this;
    }

    /**
     * Execute Item Processor
     *
     * Scrapes the item URL unless it is in the ignore list, notifies the
     * execution callback (if any), then overwrites the item content when the
     * scraper found relevant content.
     *
     * @access public
     * @param  Feed $feed
     * @param  Item $item
     * @return bool Always false; the meaning of the value is defined by
     *              ItemProcessorInterface (not visible here).
     */
    public function execute(Feed $feed, Item $item)
    {
        $url = $item->getUrl();

        // Strict comparison: URLs are matched exactly, without PHP's loose
        // type juggling (e.g. "1e3" == "1000", or 0 == "any-string" on PHP < 8).
        if (in_array($url, $this->ignoredUrls, true)) {
            return false;
        }

        $scraper = $this->getScraper();
        $scraper->setUrl($url);
        $scraper->execute();

        // The property is only ever written through the Closure-typed setter,
        // so a null check is enough to know the callback is callable.
        if ($this->executionCallback !== null) {
            $callback = $this->executionCallback;
            $callback($feed, $item, $scraper);
        }

        if ($scraper->hasRelevantContent()) {
            $item->setContent($scraper->getFilteredContent());
        }

        return false;
    }

    /**
     * Ignore list of URLs
     *
     * Replaces (does not append to) the current ignore list.
     *
     * @access public
     * @param  array $urls URL strings to skip in execute()
     * @return $this
     */
    public function ignoreUrls(array $urls)
    {
        $this->ignoredUrls = $urls;

        return $this;
    }

    /**
     * Returns Scraper instance
     *
     * The instance is created on first use from $this->config (presumably
     * provided by the Base parent class) and reused for every later call.
     *
     * @access public
     * @return Scraper
     */
    public function getScraper()
    {
        if ($this->scraper === null) {
            $this->scraper = new Scraper($this->config);
        }

        return $this->scraper;
    }
}
|