common_syndication_parser.inc 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590
  1. <?php
  2. /**
  3. * @file
  4. * Downloading and parsing functions for Common Syndication Parser.
  5. * Pillaged from FeedAPI common syndication parser.
  6. *
  7. * @todo Restructure. OO could work wonders here.
  8. * @todo Write unit tests.
  9. * @todo Keep in Feeds project or host on Drupal?
  10. */
  11. /**
  12. * Parse the feed into a data structure.
  13. *
  14. * @param $feed
  15. * The feed object (contains the URL or the parsed XML structure.
  16. * @return
  17. * stdClass The structured datas extracted from the feed.
  18. */
  19. function common_syndication_parser_parse($string) {
  20. if (!defined('LIBXML_VERSION') || (version_compare(phpversion(), '5.1.0', '<'))) {
  21. @ $xml = simplexml_load_string($string, NULL);
  22. }
  23. else {
  24. @ $xml = simplexml_load_string($string, NULL, LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NOCDATA);
  25. }
  26. // Got a malformed XML.
  27. if ($xml === FALSE || is_null($xml)) {
  28. return FALSE;
  29. }
  30. $feed_type = _parser_common_syndication_feed_format_detect($xml);
  31. if ($feed_type == "atom1.0") {
  32. return _parser_common_syndication_atom10_parse($xml);
  33. }
  34. if ($feed_type == "RSS2.0" || $feed_type == "RSS0.91" || $feed_type == "RSS0.92") {
  35. return _parser_common_syndication_RSS20_parse($xml);
  36. }
  37. if ($feed_type == "RDF") {
  38. return _parser_common_syndication_RDF10_parse($xml);
  39. }
  40. return FALSE;
  41. }
  42. /**
  43. * Get the cached version of the <var>$url</var>
  44. */
  45. function _parser_common_syndication_cache_get($url) {
  46. $cache_file = _parser_common_syndication_sanitize_cache() . '/' . md5($url);
  47. if (file_exists($cache_file)) {
  48. $file_content = file_get_contents($cache_file);
  49. return unserialize($file_content);
  50. }
  51. return FALSE;
  52. }
  53. /**
  54. * Determine the feed format of a SimpleXML parsed object structure.
  55. *
  56. * @param $xml
  57. * SimpleXML-preprocessed feed.
  58. * @return
  59. * The feed format short description or FALSE if not compatible.
  60. */
  61. function _parser_common_syndication_feed_format_detect($xml) {
  62. if (!is_object($xml)) {
  63. return FALSE;
  64. }
  65. $attr = $xml->attributes();
  66. $type = strtolower($xml->getName());
  67. if (isset($xml->entry) && $type == "feed") {
  68. return "atom1.0";
  69. }
  70. if ($type == "rss" && $attr["version"] == "2.0") {
  71. return "RSS2.0";
  72. }
  73. if ($type == "rdf" && isset($xml->channel)) {
  74. return "RDF";
  75. }
  76. if ($type == "rss" && $attr["version"] == "0.91") {
  77. return "RSS0.91";
  78. }
  79. if ($type == "rss" && $attr["version"] == "0.92") {
  80. return "RSS0.92";
  81. }
  82. return FALSE;
  83. }
  84. /**
  85. * Parse atom feeds.
  86. */
  87. function _parser_common_syndication_atom10_parse($feed_XML) {
  88. $parsed_source = array();
  89. $ns = array(
  90. "georss" => "http://www.georss.org/georss",
  91. );
  92. $base = $feed_XML->xpath("@base");
  93. $base = (string) array_shift($base);
  94. if (!valid_url($base, TRUE)) {
  95. $base = FALSE;
  96. }
  97. // Detect the title
  98. $parsed_source['title'] = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";
  99. // Detect the description
  100. $parsed_source['description'] = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";
  101. $parsed_source['link'] = _parser_common_syndication_link($feed_XML->link);
  102. if (valid_url($parsed_source['link']) && !valid_url($parsed_source['link'], TRUE) && !empty($base)) {
  103. $parsed_source['link'] = $base . $parsed_source['link'];
  104. }
  105. $parsed_source['items'] = array();
  106. foreach ($feed_XML->entry as $news) {
  107. $original_url = NULL;
  108. $guid = !empty($news->id) ? "{$news->id}" : NULL;
  109. if (valid_url($guid, TRUE)) {
  110. $original_url = $guid;
  111. }
  112. $georss = (array)$news->children($ns["georss"]);
  113. $geoname = '';
  114. if (isset($georss['featureName'])) {
  115. $geoname = "{$georss['featureName']}";
  116. }
  117. $latlon =
  118. $lat =
  119. $lon = NULL;
  120. if (isset($georss['point'])) {
  121. $latlon = explode(' ', $georss['point']);
  122. $lat = "{$latlon[0]}";
  123. $lon = "{$latlon[1]}";
  124. if (!$geoname) {
  125. $geoname = "{$lat} {$lon}";
  126. }
  127. }
  128. $additional_taxonomies = array();
  129. if (isset($news->category)) {
  130. $additional_taxonomies['ATOM Categories'] = array();
  131. $additional_taxonomies['ATOM Domains'] = array();
  132. foreach ($news->category as $category) {
  133. if (isset($category['scheme'])) {
  134. $domain = "{$category['scheme']}";
  135. if (!empty($domain)) {
  136. if (!isset($additional_taxonomies['ATOM Domains'][$domain])) {
  137. $additional_taxonomies['ATOM Domains'][$domain] = array();
  138. }
  139. $additional_taxonomies['ATOM Domains'][$domain][] = count($additional_taxonomies['ATOM Categories']) - 1;
  140. }
  141. }
  142. $additional_taxonomies['ATOM Categories'][] = "{$category['term']}";
  143. }
  144. }
  145. $title = "{$news->title}";
  146. $body = '';
  147. if (!empty($news->content)) {
  148. foreach ($news->content->children() as $child) {
  149. $body .= $child->asXML();
  150. }
  151. $body .= "{$news->content}";
  152. }
  153. elseif (!empty($news->summary)) {
  154. foreach ($news->summary->children() as $child) {
  155. $body .= $child->asXML();
  156. }
  157. $body .= "{$news->summary}";
  158. }
  159. if (!empty($news->content['src'])) {
  160. // some src elements in some valid atom feeds contained no urls at all
  161. if (valid_url("{$news->content['src']}", TRUE)) {
  162. $original_url = "{$news->content['src']}";
  163. }
  164. }
  165. $author_found = FALSE;
  166. if (!empty($news->source->author->name)) {
  167. $original_author = "{$news->source->author->name}";
  168. $author_found = TRUE;
  169. }
  170. elseif (!empty($news->author->name)) {
  171. $original_author = "{$news->author->name}";
  172. $author_found = TRUE;
  173. }
  174. if (!empty($feed_XML->author->name) && !$author_found) {
  175. $original_author = "{$feed_XML->author->name}";
  176. }
  177. $original_url = _parser_common_syndication_link($news->link);
  178. $item = array();
  179. $item['title'] = _parser_common_syndication_title($title, $body);
  180. $item['description'] = $body;
  181. $item['author_name'] = $original_author;
  182. // Fall back to updated for timestamp if both published and issued are
  183. // empty.
  184. if (isset($news->published)) {
  185. $item['timestamp'] = _parser_common_syndication_parse_date("{$news->published}");
  186. }
  187. elseif (isset($news->issued)) {
  188. $item['timestamp'] = _parser_common_syndication_parse_date("{$news->issued}");
  189. }
  190. elseif (isset($news->updated)) {
  191. $item['timestamp'] = _parser_common_syndication_parse_date("{$news->updated}");
  192. }
  193. $item['url'] = trim($original_url);
  194. if (valid_url($item['url']) && !valid_url($item['url'], TRUE) && !empty($base)) {
  195. $item['url'] = $base . $item['url'];
  196. }
  197. // Fall back on URL if GUID is empty.
  198. if (!empty($guid)) {
  199. $item['guid'] = $guid;
  200. }
  201. else {
  202. $item['guid'] = $item['url'];
  203. }
  204. $item['geolocations'] = array();
  205. if ($lat && $lon) {
  206. $item['geolocations'] = array(
  207. array(
  208. 'name' => $geoname,
  209. 'lat' => $lat,
  210. 'lon' => $lon,
  211. ),
  212. );
  213. }
  214. $item['tags'] = isset($additional_taxonomies['ATOM Categories']) ? $additional_taxonomies['ATOM Categories'] : array();
  215. $item['domains'] = isset($additional_taxonomies['ATOM Domains']) ? $additional_taxonomies['ATOM Domains'] : array();
  216. $parsed_source['items'][] = $item;
  217. }
  218. return $parsed_source;
  219. }
  220. /**
  221. * Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format.
  222. *
  223. * @see http://web.resource.org/rss/1.0/
  224. */
  225. function _parser_common_syndication_RDF10_parse($feed_XML) {
  226. // Declare some canonical standard prefixes for well-known namespaces:
  227. static $canonical_namespaces = array(
  228. 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
  229. 'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#',
  230. 'xsi' => 'http://www.w3.org/2001/XMLSchema-instance#',
  231. 'xsd' => 'http://www.w3.org/2001/XMLSchema#',
  232. 'owl' => 'http://www.w3.org/2002/07/owl#',
  233. 'dc' => 'http://purl.org/dc/elements/1.1/',
  234. 'dcterms' => 'http://purl.org/dc/terms/',
  235. 'dcmitype' => 'http://purl.org/dc/dcmitype/',
  236. 'foaf' => 'http://xmlns.com/foaf/0.1/',
  237. 'rss' => 'http://purl.org/rss/1.0/',
  238. );
  239. // Get all namespaces declared in the feed element, with special handling
  240. // for PHP versions prior to 5.1.2 as they don't handle namespaces.
  241. $namespaces = version_compare(phpversion(), '5.1.2', '<') ? array() : $feed_XML->getNamespaces(TRUE);
  242. // Process the <rss:channel> resource containing feed metadata:
  243. foreach ($feed_XML->children($canonical_namespaces['rss'])->channel as $rss_channel) {
  244. $parsed_source = array(
  245. 'title' => _parser_common_syndication_title((string) $rss_channel->title),
  246. 'description' => (string) $rss_channel->description,
  247. 'link' => (string) $rss_channel->link,
  248. 'items' => array(),
  249. );
  250. break;
  251. }
  252. // Process each <rss:item> resource contained in the feed:
  253. foreach ($feed_XML->children($canonical_namespaces['rss'])->item as $rss_item) {
  254. // Extract all available RDF statements from the feed item's RDF/XML
  255. // tags, allowing for both the item's attributes and child elements to
  256. // contain RDF properties:
  257. $rdf_data = array();
  258. foreach ($namespaces as $ns => $ns_uri) {
  259. // Note that we attempt to normalize the found property name
  260. // namespaces to well-known 'standard' prefixes where possible, as the
  261. // feed may in principle use any arbitrary prefixes and we should
  262. // still be able to correctly handle it.
  263. foreach ($rss_item->attributes($ns_uri) as $attr_name => $attr_value) {
  264. $ns_prefix = ($ns_prefix = array_search($ns_uri, $canonical_namespaces)) ? $ns_prefix : $ns;
  265. $rdf_data[$ns_prefix . ':' . $attr_name][] = (string) $attr_value;
  266. }
  267. foreach ($rss_item->children($ns_uri) as $rss_property) {
  268. $ns_prefix = ($ns_prefix = array_search($ns_uri, $canonical_namespaces)) ? $ns_prefix : $ns;
  269. $rdf_data[$ns_prefix . ':' . $rss_property->getName()][] = (string) $rss_property;
  270. }
  271. }
  272. // Declaratively define mappings that determine how to construct the result object.
  273. $item = _parser_common_syndication_RDF10_item($rdf_data, array(
  274. 'title' => array('rss:title', 'dc:title'),
  275. 'description' => array('rss:description', 'dc:description', 'content:encoded'),
  276. 'url' => array('rss:link', 'rdf:about'),
  277. 'author_name' => array('dc:creator', 'dc:publisher'),
  278. 'guid' => 'rdf:about',
  279. 'timestamp' => 'dc:date',
  280. 'tags' => 'dc:subject'
  281. ));
  282. // Special handling for the title:
  283. $item['title'] = _parser_common_syndication_title($item['title'], $item['description']);
  284. // Parse any date/time values into Unix timestamps:
  285. $item['timestamp'] = _parser_common_syndication_parse_date($item['timestamp']);
  286. // If no GUID found, use the URL of the feed.
  287. if (empty($item['guid'])) {
  288. $item['guid'] = $item['url'];
  289. }
  290. // Add every found RDF property to the feed item.
  291. $item['rdf'] = array();
  292. foreach ($rdf_data as $rdf_property => $rdf_value) {
  293. // looks nicer in the mapper UI
  294. // @todo Revisit, not used with feedapi mapper anymore.
  295. $rdf_property = str_replace(':', '_', $rdf_property);
  296. $item['rdf'][$rdf_property] = $rdf_value;
  297. }
  298. $parsed_source['items'][] = $item;
  299. }
  300. return $parsed_source;
  301. }
  302. function _parser_common_syndication_RDF10_property($rdf_data, $rdf_properties = array()) {
  303. $rdf_properties = is_array($rdf_properties) ? $rdf_properties : array_slice(func_get_args(), 1);
  304. foreach ($rdf_properties as $rdf_property) {
  305. if ($rdf_property && !empty($rdf_data[$rdf_property])) {
  306. // remove empty strings
  307. return array_filter($rdf_data[$rdf_property], 'strlen');
  308. }
  309. }
  310. }
  311. function _parser_common_syndication_RDF10_item($rdf_data, $mappings) {
  312. foreach ($mappings as $k => $v) {
  313. $values = _parser_common_syndication_RDF10_property($rdf_data, $v);
  314. $mappings[$k] = !is_array($values) || count($values) > 1 ? $values : reset($values);
  315. }
  316. return $mappings;
  317. }
  318. /**
  319. * Parse RSS2.0 feeds.
  320. */
  321. function _parser_common_syndication_RSS20_parse($feed_XML) {
  322. $ns = array(
  323. "content" => "http://purl.org/rss/1.0/modules/content/",
  324. "dc" => "http://purl.org/dc/elements/1.1/",
  325. "georss" => "http://www.georss.org/georss",
  326. );
  327. $parsed_source = array();
  328. // Detect the title.
  329. $parsed_source['title'] = isset($feed_XML->channel->title) ? _parser_common_syndication_title("{$feed_XML->channel->title}") : "";
  330. // Detect the description.
  331. $parsed_source['description'] = isset($feed_XML->channel->description) ? "{$feed_XML->channel->description}" : "";
  332. // Detect the link.
  333. $parsed_source['link'] = isset($feed_XML->channel->link) ? "{$feed_XML->channel->link}" : "";
  334. $parsed_source['items'] = array();
  335. foreach ($feed_XML->xpath('//item') as $news) {
  336. $title = $body = $original_author = $original_url = $guid = '';
  337. $category = $news->xpath('category');
  338. // Get children for current namespace.
  339. if (version_compare(phpversion(), '5.1.2', '>')) {
  340. $content = (array)$news->children($ns["content"]);
  341. $dc = (array)$news->children($ns["dc"]);
  342. $georss = (array)$news->children($ns["georss"]);
  343. }
  344. $news = (array) $news;
  345. $news['category'] = $category;
  346. if (isset($news['title'])) {
  347. $title = "{$news['title']}";
  348. }
  349. if (isset($news['description'])) {
  350. $body = "{$news['description']}";
  351. }
  352. // Some sources use content:encoded as description i.e.
  353. // PostNuke PageSetter module.
  354. if (isset($news['encoded'])) { // content:encoded for PHP < 5.1.2.
  355. if (strlen($body) < strlen("{$news['encoded']}")) {
  356. $body = "{$news['encoded']}";
  357. }
  358. }
  359. if (isset($content['encoded'])) { // content:encoded for PHP >= 5.1.2.
  360. if (strlen($body) < strlen("{$content['encoded']}")) {
  361. $body = "{$content['encoded']}";
  362. }
  363. }
  364. if (!isset($body)) {
  365. $body = "{$news['title']}";
  366. }
  367. if (!empty($news['author'])) {
  368. $original_author = "{$news['author']}";
  369. }
  370. elseif (!empty($dc["creator"])) {
  371. $original_author = (string)$dc["creator"];
  372. }
  373. if (!empty($news['link'])) {
  374. $original_url = "{$news['link']}";
  375. $guid = $original_url;
  376. }
  377. if (!empty($news['guid'])) {
  378. $guid = "{$news['guid']}";
  379. }
  380. if (!empty($georss['featureName'])) {
  381. $geoname = "{$georss['featureName']}";
  382. }
  383. $lat =
  384. $lon =
  385. $latlon =
  386. $geoname = NULL;
  387. if (!empty($georss['point'])) {
  388. $latlon = explode(' ', $georss['point']);
  389. $lat = "{$latlon[0]}";
  390. $lon = "{$latlon[1]}";
  391. if (!$geoname) {
  392. $geoname = "$lat $lon";
  393. }
  394. }
  395. $additional_taxonomies = array();
  396. $additional_taxonomies['RSS Categories'] = array();
  397. $additional_taxonomies['RSS Domains'] = array();
  398. if (isset($news['category'])) {
  399. foreach ($news['category'] as $category) {
  400. $additional_taxonomies['RSS Categories'][] = "{$category}";
  401. if (isset($category['domain'])) {
  402. $domain = "{$category['domain']}";
  403. if (!empty($domain)) {
  404. if (!isset($additional_taxonomies['RSS Domains'][$domain])) {
  405. $additional_taxonomies['RSS Domains'][$domain] = array();
  406. }
  407. $additional_taxonomies['RSS Domains'][$domain][] = count($additional_taxonomies['RSS Categories']) - 1;
  408. }
  409. }
  410. }
  411. }
  412. $item = array();
  413. $item['title'] = _parser_common_syndication_title($title, $body);
  414. $item['description'] = $body;
  415. $item['author_name'] = $original_author;
  416. if (!empty($news['pubDate'])) {
  417. $item['timestamp'] = _parser_common_syndication_parse_date($news['pubDate']);
  418. }
  419. elseif (!empty($dc['date'])) {
  420. $item['timestamp'] = _parser_common_syndication_parse_date($dc['date']);
  421. }
  422. else {
  423. $item['timestamp'] = time();
  424. }
  425. $item['url'] = trim($original_url);
  426. $item['guid'] = $guid;
  427. $item['geolocations'] = array();
  428. if (isset($geoname, $lat, $lon)) {
  429. $item['geolocations'] = array(
  430. array(
  431. 'name' => $geoname,
  432. 'lat' => $lat,
  433. 'lon' => $lon,
  434. ),
  435. );
  436. }
  437. $item['domains'] = $additional_taxonomies['RSS Domains'];
  438. $item['tags'] = $additional_taxonomies['RSS Categories'];
  439. $parsed_source['items'][] = $item;
  440. }
  441. return $parsed_source;
  442. }
  443. /**
  444. * Parse a date comes from a feed.
  445. *
  446. * @param $date_string
  447. * The date string in various formats.
  448. * @return
  449. * The timestamp of the string or the current time if can't be parsed
  450. */
  451. function _parser_common_syndication_parse_date($date_str) {
  452. // PHP < 5.3 doesn't like the GMT- notation for parsing timezones.
  453. $date_str = str_replace("GMT-", "-", $date_str);
  454. $date_str = str_replace("GMT+", "+", $date_str);
  455. $parsed_date = strtotime($date_str);
  456. if ($parsed_date === FALSE || $parsed_date == -1) {
  457. $parsed_date = _parser_common_syndication_parse_w3cdtf($date_str);
  458. }
  459. return $parsed_date === FALSE ? time() : $parsed_date;
  460. }
  461. /**
  462. * Parse the W3C date/time format, a subset of ISO 8601.
  463. *
  464. * PHP date parsing functions do not handle this format.
  465. * See http://www.w3.org/TR/NOTE-datetime for more information.
  466. * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
  467. *
  468. * @param $date_str
  469. * A string with a potentially W3C DTF date.
  470. * @return
  471. * A timestamp if parsed successfully or FALSE if not.
  472. */
  473. function _parser_common_syndication_parse_w3cdtf($date_str) {
  474. if (preg_match('/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/', $date_str, $match)) {
  475. list($year, $month, $day, $hours, $minutes, $seconds) = array($match[1], $match[2], $match[3], $match[4], $match[5], $match[6]);
  476. // Calculate the epoch for current date assuming GMT.
  477. $epoch = gmmktime($hours, $minutes, $seconds, $month, $day, $year);
  478. if ($match[10] != 'Z') { // Z is zulu time, aka GMT
  479. list($tz_mod, $tz_hour, $tz_min) = array($match[8], $match[9], $match[10]);
  480. // Zero out the variables.
  481. if (!$tz_hour) {
  482. $tz_hour = 0;
  483. }
  484. if (!$tz_min) {
  485. $tz_min = 0;
  486. }
  487. $offset_secs = (($tz_hour * 60) + $tz_min) * 60;
  488. // Is timezone ahead of GMT? If yes, subtract offset.
  489. if ($tz_mod == '+') {
  490. $offset_secs *= -1;
  491. }
  492. $epoch += $offset_secs;
  493. }
  494. return $epoch;
  495. }
  496. else {
  497. return FALSE;
  498. }
  499. }
  500. /**
  501. * Extract the link that points to the original content (back to site or
  502. * original article)
  503. *
  504. * @param $links
  505. * Array of SimpleXML objects
  506. */
  507. function _parser_common_syndication_link($links) {
  508. $to_link = '';
  509. if (count($links) > 0) {
  510. foreach ($links as $link) {
  511. $link = $link->attributes();
  512. $to_link = isset($link["href"]) ? "{$link["href"]}" : "";
  513. if (isset($link["rel"])) {
  514. if ("{$link["rel"]}" == 'alternate') {
  515. break;
  516. }
  517. }
  518. }
  519. }
  520. return $to_link;
  521. }
  522. /**
  523. * Prepare raw data to be a title
  524. */
  525. function _parser_common_syndication_title($title, $body = FALSE) {
  526. if (empty($title) && !empty($body)) {
  527. // Explode to words and use the first 3 words.
  528. $words = preg_split('/[\s,]+/', strip_tags($body));
  529. $title = implode(' ', array_slice($words, 0, 3));
  530. }
  531. return $title;
  532. }