Reader.php 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189
  1. <?php
  2. namespace PicoFeed\Reader;
  3. use DOMXPath;
  4. use PicoFeed\Base;
  5. use PicoFeed\Client\Client;
  6. use PicoFeed\Client\Url;
  7. use PicoFeed\Logging\Logger;
  8. use PicoFeed\Parser\XmlParser;
  /**
   * Reader class.
   *
   * Downloads feeds, discovers feed links inside HTML pages, detects the feed
   * format and instantiates the matching parser.
   *
   * @author Frederic Guillot
   */
  class Reader extends Base
  {
      /**
       * Feed formats for detection.
       *
       * Maps a parser class name to the XPath query that identifies the format.
       * detectFormat() evaluates the queries in declaration order.
       *
       * @var array
       */
      private $formats = array(
          'Atom' => '//feed',
          'Rss20' => '//rss[@version="2.0"]',
          'Rss92' => '//rss[@version="0.92"]',
          'Rss91' => '//rss[@version="0.91"]',
          'Rss10' => '//rdf',
      );

      /**
       * Download a feed (no discovery).
       *
       * The url is passed through prependScheme() before the request is made.
       *
       * @param string $url           Feed url
       * @param string $last_modified Last modified HTTP header
       * @param string $etag          Etag HTTP header
       * @param string $username      HTTP basic auth username
       * @param string $password      HTTP basic auth password
       *
       * @return \PicoFeed\Client\Client
       */
      public function download($url, $last_modified = '', $etag = '', $username = '', $password = '')
      {
          $url = $this->prependScheme($url);

          return Client::getInstance()
              ->setConfig($this->config)
              ->setLastModified($last_modified)
              ->setEtag($etag)
              ->setUsername($username)
              ->setPassword($password)
              ->execute($url);
      }

      /**
       * Discover and download a feed.
       *
       * The url is downloaded first. When the response is not modified or its
       * content is recognised by detectFormat(), that response is returned.
       * Otherwise the content is searched for feed links with find() and the
       * first link found is downloaded with the same headers and credentials.
       *
       * @param string $url           Feed or website url
       * @param string $last_modified Last modified HTTP header
       * @param string $etag          Etag HTTP header
       * @param string $username      HTTP basic auth username
       * @param string $password      HTTP basic auth password
       *
       * @return Client
       *
       * @throws SubscriptionNotFoundException When no feed link is found in the content
       */
      public function discover($url, $last_modified = '', $etag = '', $username = '', $password = '')
      {
          $client = $this->download($url, $last_modified, $etag, $username, $password);

          // It's already a feed or the feed was not modified
          if (!$client->isModified() || $this->detectFormat($client->getContent())) {
              return $client;
          }

          // Try to find a subscription
          $links = $this->find($client->getUrl(), $client->getContent());

          if (empty($links)) {
              throw new SubscriptionNotFoundException('Unable to find a subscription');
          }

          return $this->download($links[0], $last_modified, $etag, $username, $password);
      }

      /**
       * Find feed urls inside a HTML document.
       *
       * Collects the href of every <link> element whose type attribute is
       * "application/rss+xml" or "application/atom+xml" (RSS links first).
       * Relative hrefs are resolved against the base url of $url.
       *
       * @param string $url  Website url
       * @param string $html HTML content
       *
       * @return array List of feed links
       */
      public function find($url, $html)
      {
          Logger::setMessage(get_called_class().': Try to discover subscriptions');

          $dom = XmlParser::getHtmlDocument($html);
          $xpath = new DOMXPath($dom);

          // The website url is identical for every link: build it only once
          $siteUrl = new Url($url);
          $links = array();

          $queries = array(
              '//link[@type="application/rss+xml"]',
              '//link[@type="application/atom+xml"]',
          );

          foreach ($queries as $query) {
              foreach ($xpath->query($query) as $node) {
                  // HTML tolerates whitespace around url attributes, so strip it
                  $link = trim($node->getAttribute('href'));

                  if (empty($link)) {
                      continue;
                  }

                  $feedUrl = new Url($link);
                  $links[] = $feedUrl->getAbsoluteUrl($feedUrl->isRelativeUrl() ? $siteUrl->getBaseUrl() : '');
              }
          }

          Logger::setMessage(get_called_class().': '.implode(', ', $links));

          return $links;
      }

      /**
       * Get a parser instance.
       *
       * The parser class is \PicoFeed\Parser\<format>, where <format> is the
       * name returned by detectFormat(). The hash algorithm and the config of
       * this reader are forwarded to the new parser.
       *
       * @param string $url      Site url
       * @param string $content  Feed content
       * @param string $encoding HTTP encoding
       *
       * @return \PicoFeed\Parser\Parser
       *
       * @throws UnsupportedFeedFormatException When the format cannot be detected
       */
      public function getParser($url, $content, $encoding)
      {
          $format = $this->detectFormat($content);

          if (empty($format)) {
              throw new UnsupportedFeedFormatException('Unable to detect feed format');
          }

          $className = '\PicoFeed\Parser\\'.$format;

          $parser = new $className($content, $encoding, $url);
          $parser->setHashAlgo($this->config->getParserHashAlgo());
          $parser->setConfig($this->config);

          return $parser;
      }

      /**
       * Detect the feed format.
       *
       * Returns the name of the first entry of $formats whose XPath query
       * matches exactly one node in the document, or an empty string when no
       * entry does.
       *
       * @param string $content Feed content
       *
       * @return string Parser name, or '' when the format is unknown
       */
      public function detectFormat($content)
      {
          $dom = XmlParser::getHtmlDocument($content);
          $xpath = new DOMXPath($dom);

          foreach ($this->formats as $parser_name => $query) {
              $nodes = $xpath->query($query);

              if ($nodes->length === 1) {
                  return $parser_name;
              }
          }

          return '';
      }

      /**
       * Add the prefix "http://" if the end-user just enter a domain name.
       *
       * Surrounding whitespace is removed first. Urls that already start with
       * "http://" or "https://" (in any letter case) are returned as they are,
       * and protocol-relative urls ("//example.org/feed") only receive the
       * missing "http:" part.
       *
       * @param string $url Url
       *
       * @return string
       */
      public function prependScheme($url)
      {
          // Leading whitespace would defeat the anchored scheme checks below
          $url = trim($url);

          // Protocol-relative url: only the scheme name is missing
          if (strpos($url, '//') === 0) {
              return 'http:'.$url;
          }

          // Scheme names are case-insensitive, "HTTP://" is as valid as "http://"
          if (!preg_match('%^https?://%i', $url)) {
              $url = 'http://'.$url;
          }

          return $url;
      }
  }