ScraperProcessor.php 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. <?php
  2. namespace PicoFeed\Processor;
  3. use Closure;
  4. use PicoFeed\Base;
  5. use PicoFeed\Parser\Feed;
  6. use PicoFeed\Parser\Item;
  7. use PicoFeed\Scraper\Scraper;
  /**
   * Scraper Processor
   *
   * Item processor that runs the scraper against an item's URL and, when the
   * scraper reports relevant content, replaces the item's content with the
   * scraper's filtered content.
   *
   * @package PicoFeed\Processor
   * @author Frederic Guillot
   */
  class ScraperProcessor extends Base implements ItemProcessorInterface
  {
      /**
       * URLs that must not be scraped
       *
       * Entries are compared strictly (===) against Item::getUrl(), so they
       * are expected to be strings.
       *
       * @var array
       */
      private $ignoredUrls = array();

      /**
       * Scraper instance, created on the first call to getScraper()
       *
       * @var Scraper|null
       */
      private $scraper;

      /**
       * Callback function for each scraper execution
       *
       * @var Closure|null
       */
      private $executionCallback;

      /**
       * Set the callback invoked after each scraper execution
       *
       * The callback receives the feed, the item and the scraper, in that
       * order. A later call replaces the previously registered callback.
       *
       * @access public
       * @param Closure $executionCallback
       * @return $this
       */
      public function setExecutionCallback(Closure $executionCallback)
      {
          $this->executionCallback = $executionCallback;

          return $this;
      }

      /**
       * Execute Item Processor
       *
       * Skips items whose URL is in the ignore list. Otherwise the scraper is
       * pointed at the item URL and executed, the execution callback (if any)
       * is invoked, and the item content is overwritten only when the scraper
       * reports relevant content.
       *
       * @access public
       * @param Feed $feed
       * @param Item $item
       * @return bool Always false. NOTE(review): the meaning of the return
       *              value is defined by ItemProcessorInterface, which is not
       *              visible from this file - confirm before changing it.
       */
      public function execute(Feed $feed, Item $item)
      {
          $url = $item->getUrl();

          // Strict comparison: with a loose in_array() a stray non-string
          // entry (e.g. true) or a numeric-looking string could match an
          // unrelated URL and silently disable scraping for it.
          if (in_array($url, $this->ignoredUrls, true)) {
              return false;
          }

          $scraper = $this->getScraper();
          $scraper->setUrl($url);
          $scraper->execute();

          // The callback runs after the scraper has executed and before the
          // item content is replaced.
          if ($this->executionCallback instanceof Closure) {
              call_user_func($this->executionCallback, $feed, $item, $scraper);
          }

          if ($scraper->hasRelevantContent()) {
              $item->setContent($scraper->getFilteredContent());
          }

          return false;
      }

      /**
       * Ignore list of URLs
       *
       * Replaces (does not extend) the current ignore list.
       *
       * @access public
       * @param array $urls
       * @return $this
       */
      public function ignoreUrls(array $urls)
      {
          $this->ignoredUrls = $urls;

          return $this;
      }

      /**
       * Returns Scraper instance
       *
       * The scraper is created on first use from $this->config (presumably
       * provided by the Base class - not visible here) and the same instance
       * is returned on every later call.
       *
       * @access public
       * @return Scraper
       */
      public function getScraper()
      {
          if ($this->scraper === null) {
              $this->scraper = new Scraper($this->config);
          }

          return $this->scraper;
      }
  }