first import

This commit is contained in:
Bachir Soussi Chiadmi
2015-04-08 11:40:19 +02:00
commit 1bc61b12ad
8435 changed files with 1582817 additions and 0 deletions

View File

@@ -0,0 +1,327 @@
<?php
/**
* @file
* Contains CSV Parser.
*
* Functions in this file are independent of the Feeds specific implementation.
* Thanks to jpetso http://drupal.org/user/56020 for most of the code in this
* file.
*/
/**
 * Iterates over the text lines of a file.
 *
 * Every call to next() reads one line with fgets() and records the byte
 * offset that follows it, so a consumer can resume reading from that offset
 * later by calling rewind($pos).
 */
class ParserCSVIterator implements Iterator {
  // File handle returned by fopen(), or FALSE if the file could not be opened.
  private $handle;
  // The line read most recently, or NULL when there is no current line.
  private $currentLine;
  // Byte offset reported by ftell() right after the current line was read.
  private $currentPos;

  /**
   * Opens the file for reading.
   *
   * @param $filepath
   *   Path of the file to iterate over.
   */
  public function __construct($filepath) {
    $this->handle = fopen($filepath, 'r');
    $this->currentLine = NULL;
    $this->currentPos = NULL;
  }

  /**
   * Closes the file handle if one was opened.
   */
  public function __destruct() {
    if ($this->handle) {
      fclose($this->handle);
    }
  }

  /**
   * Moves to byte offset $pos and loads the line that starts there.
   *
   * @param $pos
   *   Byte offset to seek to, 0 for the beginning of the file.
   */
  #[\ReturnTypeWillChange]
  public function rewind($pos = 0) {
    if ($this->handle) {
      fseek($this->handle, $pos);
      $this->next();
    }
  }

  /**
   * Reads the next line from the file.
   *
   * @return
   *   The line including its line ending, or NULL if there are no more lines.
   */
  #[\ReturnTypeWillChange]
  public function next() {
    if ($this->handle) {
      $line = feof($this->handle) ? FALSE : fgets($this->handle);
      // feof() only turns TRUE after a read has hit the end of the file, so
      // the final fgets() call returns FALSE. Map that to NULL, otherwise
      // valid() (which relies on isset()) would report one bogus extra line.
      $this->currentLine = ($line === FALSE) ? NULL : $line;
      $this->currentPos = ftell($this->handle);
      return $this->currentLine;
    }
  }

  /**
   * Tells whether there is a current line.
   */
  #[\ReturnTypeWillChange]
  public function valid() {
    return isset($this->currentLine);
  }

  /**
   * Returns the current line.
   */
  #[\ReturnTypeWillChange]
  public function current() {
    return $this->currentLine;
  }

  /**
   * Returns the byte offset following the current line.
   */
  public function currentPos() {
    return $this->currentPos;
  }

  /**
   * Returns the iteration key, which is always the string 'line'.
   */
  #[\ReturnTypeWillChange]
  public function key() {
    return 'line';
  }
}
/**
 * Functionality to parse CSV files into a two dimensional array.
 */
class ParserCSV {
  private $delimiter;
  private $skipFirstLine;
  private $columnNames;
  private $timeout;
  private $timeoutReached;
  private $startByte;
  private $lineLimit;
  private $lastLinePos;

  public function __construct() {
    $this->delimiter = ',';
    $this->skipFirstLine = FALSE;
    $this->columnNames = FALSE;
    $this->timeout = FALSE;
    $this->timeoutReached = FALSE;
    $this->startByte = 0;
    $this->lineLimit = 0;
    $this->lastLinePos = 0;
  }

  /**
   * Set the column delimiter string.
   * By default, the comma (',') is used as delimiter.
   */
  public function setDelimiter($delimiter) {
    $this->delimiter = $delimiter;
  }

  /**
   * Set this to TRUE if the parser should skip the first line of the CSV text,
   * which might be desired if the first line contains the column names.
   * By default, this is set to FALSE and the first line is not skipped.
   */
  public function setSkipFirstLine($skipFirstLine) {
    $this->skipFirstLine = $skipFirstLine;
  }

  /**
   * Specify an array of column names if you know them in advance, or FALSE
   * (which is the default) to unset any prior column names. If no column names
   * are set, the parser will put each row into a simple numerically indexed
   * array. If column names are given, the parser will create arrays with
   * these column names as array keys instead.
   */
  public function setColumnNames($columnNames) {
    $this->columnNames = $columnNames;
  }

  /**
   * Define the time (in milliseconds) after which the parser stops parsing,
   * even if it has not yet finished processing the CSV data. If the timeout
   * has been reached before parsing is done, the parse() method will return
   * an incomplete list of rows - a single row will never be cut off in the
   * middle, though. By default, no timeout (@p $timeout == FALSE) is defined.
   *
   * You can check if the timeout has been reached by calling the
   * timeoutReached() method after parse() has been called.
   */
  public function setTimeout($timeout) {
    $this->timeout = $timeout;
  }

  /**
   * After calling the parse() method, determine if the timeout (set by the
   * setTimeout() method) has been reached.
   *
   * @deprecated Use lastLinePos() instead to determine whether a file has
   *   finished parsing.
   */
  public function timeoutReached() {
    return $this->timeoutReached;
  }

  /**
   * Define the number of lines to parse in one parsing operation.
   *
   * By default, all lines of a file are being parsed.
   */
  public function setLineLimit($lines) {
    $this->lineLimit = $lines;
  }

  /**
   * Get the byte number where the parser left off after last parse() call.
   *
   * @return
   *   0 if all lines or no line has been parsed, the byte position of where a
   *   timeout or the line limit has been reached otherwise. This position can
   *   be used to set the start byte for the next iteration after parse() has
   *   reached the timeout set with setTimeout() or the line limit set with
   *   setLineLimit().
   *
   * @see ParserCSV::setStartByte()
   */
  public function lastLinePos() {
    return $this->lastLinePos;
  }

  /**
   * Set the byte where file should be started to read.
   *
   * Useful when parsing a file in batches.
   */
  public function setStartByte($start) {
    return $this->startByte = $start;
  }

  /**
   * Parse CSV files into a two dimensional array.
   *
   * Parsing starts at the byte set with setStartByte() and stops at the end of
   * the input, after the number of rows set with setLineLimit(), or once the
   * timeout set with setTimeout() has elapsed.
   *
   * @param Iterator $lineIterator
   *   An Iterator object that yields line strings, e.g. ParserCSVIterator. Its
   *   rewind() is called with the start byte, and its currentPos() method is
   *   called when parsing stops early.
   *
   * @return
   *   Two dimensional array that contains the data in the CSV file.
   */
  public function parse(Iterator $lineIterator) {
    $skipLine = $this->skipFirstLine;
    $rows = array();
    $this->timeoutReached = FALSE;
    $this->lastLinePos = 0;
    // microtime(TRUE) yields seconds as a float; the timeout is documented in
    // milliseconds, hence the division.
    $maxTime = empty($this->timeout) ? FALSE : (microtime(TRUE) + $this->timeout / 1000);
    $linesParsed = 0;

    for ($lineIterator->rewind($this->startByte); $lineIterator->valid(); $lineIterator->next()) {
      // Make really sure we've got lines without trailing newlines.
      $line = trim($lineIterator->current(), "\r\n");

      // Skip empty lines. Compare against '' explicitly: empty() would also
      // discard a row that consists of the single value "0".
      if ($line === '') {
        continue;
      }
      // If the first line contains column names, skip it.
      if ($skipLine) {
        $skipLine = FALSE;
        continue;
      }

      $fields = $this->parseFields($line, $lineIterator);

      if (empty($this->columnNames)) {
        $row = $fields;
      }
      else {
        // Key the values by column name; missing trailing values become ''.
        $row = array();
        foreach ($this->columnNames as $columnName) {
          $field = array_shift($fields);
          $row[$columnName] = isset($field) ? $field : '';
        }
      }
      $rows[] = $row;

      // Quit parsing if timeout has been reached or requested lines have been
      // reached.
      if ($maxTime !== FALSE && microtime(TRUE) > $maxTime) {
        $this->timeoutReached = TRUE;
        $this->lastLinePos = $lineIterator->currentPos();
        break;
      }
      $linesParsed++;
      if ($this->lineLimit && $linesParsed >= $this->lineLimit) {
        $this->lastLinePos = $lineIterator->currentPos();
        break;
      }
    }
    return $rows;
  }

  /**
   * Splits one CSV record into its fields.
   *
   * explode() is not suitable because the delimiter might be located inside a
   * quoted field. A quoted field may also contain line breaks, in which case
   * further lines are pulled from $lineIterator until the quote is closed.
   *
   * @param $line
   *   First line of the record, without trailing newline characters.
   * @param Iterator $lineIterator
   *   The iterator $line was read from.
   *
   * @return
   *   Array of field strings.
   */
  private function parseFields($line, Iterator $lineIterator) {
    $delimiterLength = strlen($this->delimiter);
    $quoted = FALSE;
    $currentIndex = 0;
    $currentField = '';
    $fields = array();

    // We must use strlen() as we're parsing byte by byte using strpos(), so
    // drupal_strlen() will not work properly. "<=" is intentional: an index
    // equal to the length produces the (possibly empty) last field.
    while ($currentIndex <= strlen($line)) {
      if ($quoted) {
        $nextQuoteIndex = strpos($line, '"', $currentIndex);

        if ($nextQuoteIndex === FALSE) {
          // There's a line break before the quote is closed, so fetch the
          // next line and start from there.
          $currentField .= substr($line, $currentIndex);
          $lineIterator->next();

          if (!$lineIterator->valid()) {
            // An unclosed quote at the end of the input: record what we have.
            $fields[] = $currentField;
            break;
          }
          $currentField .= "\n";
          $line = trim($lineIterator->current(), "\r\n");
          $currentIndex = 0;
          continue;
        }

        // There's another quote in this line, find out whether it's escaped.
        $currentField .= substr($line, $currentIndex, $nextQuoteIndex - $currentIndex);

        if (isset($line[$nextQuoteIndex + 1]) && $line[$nextQuoteIndex + 1] === '"') {
          // Escaped quote, add a single one to the field and proceed quoted.
          $currentField .= '"';
          $currentIndex = $nextQuoteIndex + 2;
        }
        else {
          // End of the quoted section, close the quote and let the unquoted
          // branch finalize the field.
          $quoted = FALSE;
          $currentIndex = $nextQuoteIndex + 1;
        }
      }
      else {
        // First, let's find out where the next character of interest is.
        $nextQuoteIndex = strpos($line, '"', $currentIndex);
        $nextDelimiterIndex = strpos($line, $this->delimiter, $currentIndex);

        if ($nextQuoteIndex === FALSE) {
          $nextIndex = $nextDelimiterIndex;
        }
        elseif ($nextDelimiterIndex === FALSE) {
          $nextIndex = $nextQuoteIndex;
        }
        else {
          $nextIndex = min($nextQuoteIndex, $nextDelimiterIndex);
        }

        if ($nextIndex === FALSE) {
          // This line is done, add the rest of it as last field.
          $currentField .= substr($line, $currentIndex);
          $fields[] = $currentField;
          break;
        }
        elseif ($nextIndex === $nextDelimiterIndex) {
          // The field ends right before the delimiter; continue after the
          // whole delimiter so multi-character delimiters do not leak into
          // the field value.
          $currentField .= substr($line, $currentIndex, $nextIndex - $currentIndex);
          $fields[] = $currentField;
          $currentField = '';
          $currentIndex = $nextIndex + $delimiterLength;
        }
        else {
          // Opening quote: continue this field in the quoted branch.
          $quoted = TRUE;
          $currentField .= substr($line, $currentIndex, $nextIndex - $currentIndex);
          $currentIndex = $nextIndex + 1;
        }
      }
    }
    return $fields;
  }
}

View File

@@ -0,0 +1,390 @@
<?php
/**
* @file
* Pubsubhubbub subscriber library.
*
* Readme
* http://github.com/lxbarth/PuSHSubscriber
*
* License
* http://github.com/lxbarth/PuSHSubscriber/blob/master/LICENSE.txt
*/
/**
 * PubSubHubbub subscriber.
 */
class PuSHSubscriber {
  protected $domain;
  protected $subscriber_id;
  protected $subscription_class;
  protected $env;

  /**
   * Singleton.
   *
   * PuSHSubscriber identifies a unique subscription by a domain and a numeric
   * id. The numeric id is assumed to be unique in its domain. One instance is
   * kept per domain/id pair; the class and environment passed on the first
   * call for a pair are the ones that instance keeps using.
   *
   * @param $domain
   *   A string that identifies the domain in which $subscriber_id is unique.
   * @param $subscriber_id
   *   A numeric subscriber id.
   * @param $subscription_class
   *   The class to use for handling subscriptions. Class MUST implement
   *   PuSHSubscriptionInterface.
   * @param PuSHSubscriberEnvironmentInterface $env
   *   Environmental object for messaging and logging.
   *
   * @return
   *   The PuSHSubscriber for the given domain and subscriber id.
   */
  public static function instance($domain, $subscriber_id, $subscription_class, PuSHSubscriberEnvironmentInterface $env) {
    static $subscribers = array();
    if (!isset($subscribers[$domain][$subscriber_id])) {
      $subscribers[$domain][$subscriber_id] = new PuSHSubscriber($domain, $subscriber_id, $subscription_class, $env);
    }
    return $subscribers[$domain][$subscriber_id];
  }

  /**
   * Protect constructor.
   */
  protected function __construct($domain, $subscriber_id, $subscription_class, PuSHSubscriberEnvironmentInterface $env) {
    $this->domain = $domain;
    $this->subscriber_id = $subscriber_id;
    $this->subscription_class = $subscription_class;
    $this->env = $env;
  }

  /**
   * Subscribe to a given URL. Attempt to retrieve 'hub' and 'self' links from
   * document at $url and issue a subscription request to the hub.
   *
   * @param $url
   *   The URL of the feed to subscribe to.
   * @param $callback_url
   *   The full URL that hub should invoke for subscription verification or for
   *   notifications.
   * @param $hub
   *   The URL of a hub. If given overrides the hub URL found in the document
   *   at $url.
   */
  public function subscribe($url, $callback_url, $hub = '') {
    // Defined up front so the fallback below never reads an unset variable
    // when the document cannot be fetched or parsed.
    $self = '';

    // Fetch document, find rel=hub and rel=self.
    $request = curl_init($url);
    curl_setopt($request, CURLOPT_FOLLOWLOCATION, TRUE);
    curl_setopt($request, CURLOPT_RETURNTRANSFER, TRUE);
    $data = curl_exec($request);
    if ($data !== FALSE && curl_getinfo($request, CURLINFO_HTTP_CODE) == 200) {
      try {
        $xml = @ new SimpleXMLElement($data);
        $xml->registerXPathNamespace('atom', 'http://www.w3.org/2005/Atom');
        if (empty($hub)) {
          $links = $xml->xpath("//atom:link[attribute::rel='hub']");
          $hub = !empty($links) ? (string) reset($links)->attributes()->href : '';
        }
        $links = $xml->xpath("//atom:link[attribute::rel='self']");
        if (!empty($links)) {
          $self = (string) reset($links)->attributes()->href;
        }
      }
      catch (Exception $e) {
        // The document is not parseable XML; fall through without links.
      }
    }
    curl_close($request);

    // Fall back to $url if $self is not given.
    if (!$self) {
      $self = $url;
    }
    // If a hub and a topic are known, issue the subscription request.
    if (!empty($hub) && !empty($self)) {
      $this->request($hub, $self, 'subscribe', $callback_url);
    }
  }

  /**
   * @todo Unsubscribe from a hub.
   * @todo Make sure we unsubscribe with the correct topic URL as it can differ
   *   from the initial subscription URL.
   *
   * @param $topic_url
   *   The URL of the topic to unsubscribe from. Currently unused: the hub and
   *   topic stored on the subscription are used instead.
   * @param $callback_url
   *   The callback to unsubscribe.
   */
  public function unsubscribe($topic_url, $callback_url) {
    if ($sub = $this->subscription()) {
      $this->request($sub->hub, $sub->topic, 'unsubscribe', $callback_url);
      $sub->delete();
    }
  }

  /**
   * Request handler for subscription callbacks.
   *
   * @param $callback
   *   Callable invoked with the raw payload, the domain and the subscriber id
   *   when a valid notification has been received.
   */
  public function handleRequest($callback) {
    if (isset($_GET['hub_challenge'])) {
      $this->verifyRequest();
    }
    // No subscription verification has been sent, we are being notified.
    else {
      if ($raw = $this->receive()) {
        $callback($raw, $this->domain, $this->subscriber_id);
      }
    }
  }

  /**
   * Receive a notification.
   *
   * @param $ignore_signature
   *   If FALSE, only accept payload if there is a signature present and the
   *   signature matches the payload. Warning: setting to TRUE results in
   *   unsafe behavior.
   *
   * @return
   *   An XML string that is the payload of the notification if valid, FALSE
   *   otherwise.
   */
  public function receive($ignore_signature = FALSE) {
    /**
     * Verification steps:
     *
     * 1) Verify that this is indeed a POST request.
     * 2) Verify that posted string is XML.
     * 3) Per default verify sender of message by checking the message's
     *    signature against the shared secret.
     */
    if (isset($_SERVER['REQUEST_METHOD']) && $_SERVER['REQUEST_METHOD'] == 'POST') {
      $raw = file_get_contents('php://input');
      // Compare against FALSE: a SimpleXMLElement without children evaluates
      // to FALSE in a boolean context even though the XML is well-formed.
      if (@simplexml_load_string($raw) !== FALSE) {
        if ($ignore_signature) {
          return $raw;
        }
        if (isset($_SERVER['HTTP_X_HUB_SIGNATURE']) && ($sub = $this->subscription())) {
          $result = array();
          parse_str($_SERVER['HTTP_X_HUB_SIGNATURE'], $result);
          $expected = hash_hmac('sha1', $raw, $sub->secret);
          if (isset($result['sha1']) && is_string($result['sha1']) && $this->hashEquals($expected, $result['sha1'])) {
            return $raw;
          }
          else {
            $this->log('Could not verify signature.', 'error');
          }
        }
        else {
          $this->log('No signature present.', 'error');
        }
      }
    }
    return FALSE;
  }

  /**
   * Verify a request. After a hub has received a subscribe or unsubscribe
   * request (see PuSHSubscriber::request()) it sends back a challenge verifying
   * that an action indeed was requested ($_GET['hub_challenge']). This
   * method handles the challenge.
   */
  public function verifyRequest() {
    if (isset($_GET['hub_challenge'])) {
      /**
       * If a subscription is present, compare the verify token. If the token
       * matches, set the status on the subscription record and confirm
       * positive.
       *
       * If we cannot find a matching subscription and the hub checks on
       * 'unsubscribe' confirm positive.
       *
       * In all other cases confirm negative.
       */
      $verify = FALSE;
      $mode = isset($_GET['hub_mode']) ? $_GET['hub_mode'] : NULL;
      if ($sub = $this->subscription()) {
        $expected = isset($sub->post_fields['hub.verify_token']) ? $sub->post_fields['hub.verify_token'] : NULL;
        $given = isset($_GET['hub_verify_token']) ? $_GET['hub_verify_token'] : NULL;
        // Both tokens must be present strings; two missing tokens must never
        // count as a match.
        if (is_string($expected) && is_string($given) && $this->hashEquals($expected, $given)) {
          if ($mode === 'subscribe' && $sub->status == 'subscribe') {
            $sub->status = 'subscribed';
            $sub->post_fields = array();
            $sub->save();
            $this->log('Verified "subscribe" request.');
            $verify = TRUE;
          }
          elseif ($mode === 'unsubscribe' && $sub->status == 'unsubscribe') {
            $sub->status = 'unsubscribed';
            $sub->post_fields = array();
            $sub->save();
            $this->log('Verified "unsubscribe" request.');
            $verify = TRUE;
          }
        }
      }
      elseif ($mode === 'unsubscribe') {
        $this->log('Verified "unsubscribe" request.');
        $verify = TRUE;
      }
      if ($verify) {
        header('HTTP/1.1 200 "Found"', TRUE, 200);
        print $_GET['hub_challenge'];
        drupal_exit();
      }
    }
    header('HTTP/1.1 404 "Not Found"', TRUE, 404);
    $this->log('Could not verify subscription.', 'error');
    drupal_exit();
  }

  /**
   * Issue a subscribe or unsubscribe request to a PubsubHubbub hub.
   *
   * @param $hub
   *   The URL of the hub's subscription endpoint.
   * @param $topic
   *   The topic URL of the feed to subscribe to.
   * @param $mode
   *   'subscribe' or 'unsubscribe'.
   * @param $callback_url
   *   The subscriber's notifications callback URL.
   *
   * Compare to http://pubsubhubbub.googlecode.com/svn/trunk/pubsubhubbub-core-0.2.html#anchor5
   *
   * @todo Make concurrency safe.
   */
  protected function request($hub, $topic, $mode, $callback_url) {
    // 40 hex characters, the same length as the sha1 value used previously.
    $secret = $this->randomHex(20);
    $post_fields = array(
      'hub.callback' => $callback_url,
      'hub.mode' => $mode,
      'hub.topic' => $topic,
      'hub.verify' => 'sync',
      'hub.lease_seconds' => '', // Permanent subscription.
      'hub.secret' => $secret,
      // 32 hex characters, the same length as the md5 value used previously.
      'hub.verify_token' => $this->randomHex(16),
    );
    // Store the pending subscription before contacting the hub so that the
    // hub's verification callback can find the verify token.
    $sub = new $this->subscription_class($this->domain, $this->subscriber_id, $hub, $topic, $secret, $mode, $post_fields);
    $sub->save();

    // Issue subscription request.
    $request = curl_init($hub);
    curl_setopt($request, CURLOPT_POST, TRUE);
    curl_setopt($request, CURLOPT_POSTFIELDS, $post_fields);
    curl_setopt($request, CURLOPT_RETURNTRANSFER, TRUE);
    curl_exec($request);
    $code = curl_getinfo($request, CURLINFO_HTTP_CODE);
    if (in_array($code, array(202, 204))) {
      $this->log("Positive response to \"$mode\" request ($code).");
    }
    else {
      $sub->status = $mode . ' failed';
      $sub->save();
      $this->log("Error issuing \"$mode\" request to $hub ($code).", 'error');
    }
    curl_close($request);
  }

  /**
   * Get the subscription associated with this subscriber.
   *
   * @return
   *   A PuSHSubscriptionInterface object if a subscription exist, NULL
   *   otherwise.
   */
  public function subscription() {
    return call_user_func(array($this->subscription_class, 'load'), $this->domain, $this->subscriber_id);
  }

  /**
   * Determine whether this subscriber is successfully subscribed or not.
   */
  public function subscribed() {
    if ($sub = $this->subscription()) {
      if ($sub->status == 'subscribed') {
        return TRUE;
      }
    }
    return FALSE;
  }

  /**
   * Helper for messaging.
   */
  protected function msg($msg, $level = 'status') {
    $this->env->msg($msg, $level);
  }

  /**
   * Helper for logging.
   */
  protected function log($msg, $level = 'status') {
    $this->env->log("{$this->domain}:{$this->subscriber_id}\t$msg", $level);
  }

  /**
   * Compares two strings without loose type juggling.
   *
   * Uses hash_equals() where available so the comparison time does not depend
   * on how many leading characters match; falls back to a strict comparison.
   *
   * @param $known
   *   The value computed or stored locally.
   * @param $given
   *   The value supplied by the remote party.
   *
   * @return
   *   TRUE if both strings are identical, FALSE otherwise.
   */
  private function hashEquals($known, $given) {
    if (function_exists('hash_equals')) {
      return hash_equals($known, $given);
    }
    return $known === $given;
  }

  /**
   * Generates a random hexadecimal string.
   *
   * @param $bytes
   *   Number of random bytes, at most 32; the result is twice as long.
   *
   * @return
   *   A lowercase hexadecimal string of 2 * $bytes characters.
   */
  private function randomHex($bytes) {
    if (function_exists('random_bytes')) {
      return bin2hex(random_bytes($bytes));
    }
    // Fallback for PHP versions without random_bytes(). This is not
    // cryptographically strong.
    return substr(hash('sha256', uniqid(mt_rand(), TRUE) . session_id() . mt_rand()), 0, $bytes * 2);
  }
}
/**
* Implement to provide a storage backend for subscriptions.
*
* Variables passed in to the constructor must be accessible as public class
* variables: PuSHSubscriber reads and writes properties such as hub, topic,
* secret, status and post_fields directly on subscription objects.
*/
interface PuSHSubscriptionInterface {
/**
* Creates a subscription object; it is not stored until save() is called.
*
* @param $domain
* A string that defines the domain in which the subscriber_id is unique.
* @param $subscriber_id
* A unique numeric subscriber id.
* @param $hub
* The URL of the hub endpoint.
* @param $topic
* The topic to subscribe to.
* @param $secret
* A secret key used for message authentication.
* @param $status
* The status of the subscription.
* 'subscribe' - subscribing to a feed.
* 'unsubscribe' - unsubscribing from a feed.
* 'subscribed' - subscribed.
* 'unsubscribed' - unsubscribed.
* 'subscribe failed' - subscribe request failed.
* 'unsubscribe failed' - unsubscribe request failed.
* @param $post_fields
* An array of the fields posted to the hub. Note that the signature
* defaults this to an empty string, not an empty array.
*/
public function __construct($domain, $subscriber_id, $hub, $topic, $secret, $status = '', $post_fields = '');
/**
* Save a subscription.
*/
public function save();
/**
* Load a subscription.
*
* @param $domain
* The domain in which $subscriber_id is unique.
* @param $subscriber_id
* The subscriber id whose subscription should be loaded.
*
* @return
* A PuSHSubscriptionInterface object if a subscription exist, NULL
* otherwise.
*/
public static function load($domain, $subscriber_id);
/**
* Delete a subscription.
*/
public function delete();
}
/**
* Implement to provide environmental functionality like user messages and
* logging.
*/
interface PuSHSubscriberEnvironmentInterface {
/**
* A message to be displayed to the user on the current page load.
*
* @param $msg
* A string that is the message to be displayed.
* @param $level
* A string that is either 'status', 'warning' or 'error'.
*/
public function msg($msg, $level = 'status');
/**
* A log message to be logged to the database or the file system.
*
* @param $msg
* A string that is the message to be logged.
* @param $level
* A string that is either 'status', 'warning' or 'error'.
*/
public function log($msg, $level = 'status');
}

View File

@@ -0,0 +1,590 @@
<?php
/**
* @file
* Downloading and parsing functions for Common Syndication Parser.
* Pillaged from FeedAPI common syndication parser.
*
* @todo Restructure. OO could work wonders here.
* @todo Write unit tests.
* @todo Keep in Feeds project or host on Drupal?
*/
/**
 * Parses a feed document into a data structure.
 *
 * @param $string
 *   The raw XML of the feed.
 *
 * @return
 *   The structure produced by the parser matching the detected feed format
 *   (Atom 1.0, RSS 0.91/0.92/2.0 or RDF), or FALSE if the document is not
 *   well-formed XML or its format is not recognized.
 */
function common_syndication_parser_parse($string) {
  // libxml option flags are only passed when the constants are available and
  // PHP is at least 5.1.0; 0 is equivalent to passing no options at all.
  $libxml_flags_usable = defined('LIBXML_VERSION') && !version_compare(phpversion(), '5.1.0', '<');
  $options = $libxml_flags_usable ? (LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NOCDATA) : 0;
  $xml = @simplexml_load_string($string, NULL, $options);

  // Got a malformed XML.
  if ($xml === FALSE || is_null($xml)) {
    return FALSE;
  }

  // Hand the document over to the parser for its format.
  switch (_parser_common_syndication_feed_format_detect($xml)) {
    case "atom1.0":
      return _parser_common_syndication_atom10_parse($xml);

    case "RSS2.0":
    case "RSS0.91":
    case "RSS0.92":
      return _parser_common_syndication_RSS20_parse($xml);

    case "RDF":
      return _parser_common_syndication_RDF10_parse($xml);
  }
  return FALSE;
}
/**
 * Gets the cached version of the <var>$url</var>.
 *
 * @param $url
 *   The URL whose cached data is requested. Its md5 hash is the name of the
 *   cache file.
 *
 * @return
 *   The unserialized content of the cache file, or FALSE if there is no cache
 *   file for $url.
 */
function _parser_common_syndication_cache_get($url) {
  $cache_directory = _parser_common_syndication_sanitize_cache();
  $path = $cache_directory . '/' . md5($url);
  if (!file_exists($path)) {
    return FALSE;
  }
  // NOTE(review): unserialize() must only ever see trusted data; confirm that
  // nothing but this module writes to the cache directory.
  return unserialize(file_get_contents($path));
}
/**
 * Determines the feed format of a SimpleXML parsed object structure.
 *
 * The decision is based on the lowercased name of the root element, on the
 * presence of entry/channel children and, for RSS, on the version attribute.
 *
 * @param $xml
 *   SimpleXML-preprocessed feed.
 *
 * @return
 *   The feed format short description ("atom1.0", "RSS2.0", "RSS0.91",
 *   "RSS0.92" or "RDF") or FALSE if not compatible.
 */
function _parser_common_syndication_feed_format_detect($xml) {
  if (!is_object($xml)) {
    return FALSE;
  }
  $attributes = $xml->attributes();
  $root = strtolower($xml->getName());

  if ($root == "feed") {
    // Only feeds that contain at least one entry are reported as Atom.
    return isset($xml->entry) ? "atom1.0" : FALSE;
  }
  if ($root == "rdf") {
    return isset($xml->channel) ? "RDF" : FALSE;
  }
  if ($root == "rss") {
    $formats = array(
      "2.0" => "RSS2.0",
      "0.91" => "RSS0.91",
      "0.92" => "RSS0.92",
    );
    foreach ($formats as $version => $format) {
      if ($attributes["version"] == $version) {
        return $format;
      }
    }
  }
  return FALSE;
}
/**
 * Parse atom feeds.
 *
 * @param $feed_XML
 *   The SimpleXMLElement of the feed's root element.
 *
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. Each
 *   item is an array with the keys 'title', 'description', 'author_name',
 *   'url', 'guid', 'geolocations', 'tags' and 'domains', plus 'timestamp' if
 *   the entry has a published, issued or updated element.
 */
function _parser_common_syndication_atom10_parse($feed_XML) {
  $parsed_source = array();

  $ns = array(
    "georss" => "http://www.georss.org/georss",
  );

  // A base attribute is only used if it is a valid absolute URL.
  $base = $feed_XML->xpath("@base");
  $base = (string) array_shift($base);
  if (!valid_url($base, TRUE)) {
    $base = FALSE;
  }

  // Detect the title
  $parsed_source['title'] = isset($feed_XML->title) ? _parser_common_syndication_title("{$feed_XML->title}") : "";
  // Detect the description
  $parsed_source['description'] = isset($feed_XML->subtitle) ? "{$feed_XML->subtitle}" : "";

  // Prefix a relative feed link with the base URL.
  $parsed_source['link'] = _parser_common_syndication_link($feed_XML->link);
  if (valid_url($parsed_source['link']) && !valid_url($parsed_source['link'], TRUE) && !empty($base)) {
    $parsed_source['link'] = $base . $parsed_source['link'];
  }

  $parsed_source['items'] = array();

  foreach ($feed_XML->entry as $news) {
    $original_url = NULL;
    // Reset per entry so that an entry without any author does not inherit
    // the author of the previous entry.
    $original_author = '';

    $guid = !empty($news->id) ? "{$news->id}" : NULL;
    if (valid_url($guid, TRUE)) {
      $original_url = $guid;
    }

    $georss = (array)$news->children($ns["georss"]);
    $geoname = '';
    if (isset($georss['featureName'])) {
      $geoname = "{$georss['featureName']}";
    }

    $latlon =
    $lat =
    $lon = NULL;
    if (isset($georss['point'])) {
      $latlon = explode(' ', $georss['point']);
      // Only accept the point if it has both a latitude and a longitude.
      if (isset($latlon[0]) && isset($latlon[1])) {
        $lat = "{$latlon[0]}";
        $lon = "{$latlon[1]}";
        if (!$geoname) {
          $geoname = "{$lat} {$lon}";
        }
      }
    }

    $additional_taxonomies = array();
    if (isset($news->category)) {
      $additional_taxonomies['ATOM Categories'] = array();
      $additional_taxonomies['ATOM Domains'] = array();
      foreach ($news->category as $category) {
        // Append the term first so that the index stored for its domain
        // points at this term rather than at the previous one.
        $additional_taxonomies['ATOM Categories'][] = "{$category['term']}";
        if (isset($category['scheme'])) {
          $domain = "{$category['scheme']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['ATOM Domains'][$domain])) {
              $additional_taxonomies['ATOM Domains'][$domain] = array();
            }
            $additional_taxonomies['ATOM Domains'][$domain][] = count($additional_taxonomies['ATOM Categories']) - 1;
          }
        }
      }
    }

    $title = "{$news->title}";

    // The body is the content element, or the summary if there is no content:
    // markup of child elements first, followed by the element's own text.
    $body = '';
    if (!empty($news->content)) {
      foreach ($news->content->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->content}";
    }
    elseif (!empty($news->summary)) {
      foreach ($news->summary->children() as $child) {
        $body .= $child->asXML();
      }
      $body .= "{$news->summary}";
    }

    if (!empty($news->content['src'])) {
      // some src elements in some valid atom feeds contained no urls at all
      if (valid_url("{$news->content['src']}", TRUE)) {
        $original_url = "{$news->content['src']}";
      }
    }

    // Author precedence: the entry's source, the entry itself, then the feed.
    if (!empty($news->source->author->name)) {
      $original_author = "{$news->source->author->name}";
    }
    elseif (!empty($news->author->name)) {
      $original_author = "{$news->author->name}";
    }
    elseif (!empty($feed_XML->author->name)) {
      $original_author = "{$feed_XML->author->name}";
    }

    // NOTE(review): this unconditionally replaces the URL candidates taken
    // from the id and the content src above, so those never reach the item.
    // Confirm whether they were meant as fallbacks for entries without link.
    $original_url = _parser_common_syndication_link($news->link);

    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
    $item['author_name'] = $original_author;

    // Fall back to updated for timestamp if both published and issued are
    // empty.
    if (isset($news->published)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->published}");
    }
    elseif (isset($news->issued)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->issued}");
    }
    elseif (isset($news->updated)) {
      $item['timestamp'] = _parser_common_syndication_parse_date("{$news->updated}");
    }

    // Prefix a relative item URL with the base URL.
    $item['url'] = trim($original_url);
    if (valid_url($item['url']) && !valid_url($item['url'], TRUE) && !empty($base)) {
      $item['url'] = $base . $item['url'];
    }

    // Fall back on URL if GUID is empty.
    if (!empty($guid)) {
      $item['guid'] = $guid;
    }
    else {
      $item['guid'] = $item['url'];
    }

    $item['geolocations'] = array();
    if ($lat && $lon) {
      $item['geolocations'] = array(
        array(
          'name' => $geoname,
          'lat' => $lat,
          'lon' => $lon,
        ),
      );
    }

    $item['tags'] = isset($additional_taxonomies['ATOM Categories']) ? $additional_taxonomies['ATOM Categories'] : array();
    $item['domains'] = isset($additional_taxonomies['ATOM Domains']) ? $additional_taxonomies['ATOM Domains'] : array();
    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}
/**
 * Parse RDF Site Summary (RSS) 1.0 feeds in RDF/XML format.
 *
 * @param $feed_XML
 *   The SimpleXMLElement of the feed's root element.
 *
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. The
 *   first three are empty strings if the document has no channel element.
 *
 * @see http://web.resource.org/rss/1.0/
 */
function _parser_common_syndication_RDF10_parse($feed_XML) {
  // Declare some canonical standard prefixes for well-known namespaces:
  static $canonical_namespaces = array(
    'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
    'rdfs' => 'http://www.w3.org/2000/01/rdf-schema#',
    'xsi' => 'http://www.w3.org/2001/XMLSchema-instance#',
    'xsd' => 'http://www.w3.org/2001/XMLSchema#',
    'owl' => 'http://www.w3.org/2002/07/owl#',
    'dc' => 'http://purl.org/dc/elements/1.1/',
    'dcterms' => 'http://purl.org/dc/terms/',
    'dcmitype' => 'http://purl.org/dc/dcmitype/',
    'foaf' => 'http://xmlns.com/foaf/0.1/',
    'rss' => 'http://purl.org/rss/1.0/',
  );

  // Get all namespaces declared in the feed element, with special handling
  // for PHP versions prior to 5.1.2 as they don't handle namespaces.
  $namespaces = version_compare(phpversion(), '5.1.2', '<') ? array() : $feed_XML->getNamespaces(TRUE);

  // Defaults, so that a document without a channel still yields a complete
  // structure instead of an undefined variable.
  $parsed_source = array(
    'title' => '',
    'description' => '',
    'link' => '',
    'items' => array(),
  );

  // Process the <rss:channel> resource containing feed metadata; only the
  // first channel is used.
  foreach ($feed_XML->children($canonical_namespaces['rss'])->channel as $rss_channel) {
    $parsed_source = array(
      'title' => _parser_common_syndication_title((string) $rss_channel->title),
      'description' => (string) $rss_channel->description,
      'link' => (string) $rss_channel->link,
      'items' => array(),
    );
    break;
  }

  // Process each <rss:item> resource contained in the feed:
  foreach ($feed_XML->children($canonical_namespaces['rss'])->item as $rss_item) {

    // Extract all available RDF statements from the feed item's RDF/XML
    // tags, allowing for both the item's attributes and child elements to
    // contain RDF properties:
    $rdf_data = array();
    foreach ($namespaces as $ns => $ns_uri) {
      // Note that we attempt to normalize the found property name
      // namespaces to well-known 'standard' prefixes where possible, as the
      // feed may in principle use any arbitrary prefixes and we should
      // still be able to correctly handle it.
      $canonical_prefix = array_search($ns_uri, $canonical_namespaces);
      $ns_prefix = $canonical_prefix ? $canonical_prefix : $ns;

      foreach ($rss_item->attributes($ns_uri) as $attr_name => $attr_value) {
        $rdf_data[$ns_prefix . ':' . $attr_name][] = (string) $attr_value;
      }
      foreach ($rss_item->children($ns_uri) as $rss_property) {
        $rdf_data[$ns_prefix . ':' . $rss_property->getName()][] = (string) $rss_property;
      }
    }

    // Declaratively define mappings that determine how to construct the result object.
    $item = _parser_common_syndication_RDF10_item($rdf_data, array(
      'title' => array('rss:title', 'dc:title'),
      'description' => array('rss:description', 'dc:description', 'content:encoded'),
      'url' => array('rss:link', 'rdf:about'),
      'author_name' => array('dc:creator', 'dc:publisher'),
      'guid' => 'rdf:about',
      'timestamp' => 'dc:date',
      'tags' => 'dc:subject'
    ));

    // Special handling for the title:
    $item['title'] = _parser_common_syndication_title($item['title'], $item['description']);

    // Parse any date/time values into Unix timestamps:
    $item['timestamp'] = _parser_common_syndication_parse_date($item['timestamp']);

    // If no GUID found, use the URL of the item.
    if (empty($item['guid'])) {
      $item['guid'] = $item['url'];
    }

    // Add every found RDF property to the feed item.
    $item['rdf'] = array();
    foreach ($rdf_data as $rdf_property => $rdf_value) {
      // looks nicer in the mapper UI
      // @todo Revisit, not used with feedapi mapper anymore.
      $rdf_property = str_replace(':', '_', $rdf_property);
      $item['rdf'][$rdf_property] = $rdf_value;
    }

    $parsed_source['items'][] = $item;
  }

  return $parsed_source;
}
/**
 * Returns the values of the first given RDF property that has any.
 *
 * @param $rdf_data
 *   Array of value lists keyed by property name, e.g. 'dc:title'.
 * @param $rdf_properties
 *   Either an array of property names in order of preference, or a single
 *   property name; in the latter case every further argument is treated as
 *   an additional property name.
 *
 * @return
 *   The values of the first property that has at least one non-empty string,
 *   with empty strings removed and original keys preserved. An empty array if
 *   properties were present but held nothing but empty strings, NULL if none
 *   of the properties is present in $rdf_data.
 */
function _parser_common_syndication_RDF10_property($rdf_data, $rdf_properties = array()) {
  $rdf_properties = is_array($rdf_properties) ? $rdf_properties : array_slice(func_get_args(), 1);
  $only_empty_values_seen = FALSE;
  foreach ($rdf_properties as $rdf_property) {
    if ($rdf_property && !empty($rdf_data[$rdf_property])) {
      // remove empty strings
      $values = array_filter($rdf_data[$rdf_property], 'strlen');
      if (!empty($values)) {
        return $values;
      }
      // The property holds nothing but empty strings: keep looking so that a
      // lower-priority property can still supply a value.
      $only_empty_values_seen = TRUE;
    }
  }
  if ($only_empty_values_seen) {
    return array();
  }
}
/**
 * Builds a feed item by resolving a set of RDF property mappings.
 *
 * @param $rdf_data
 *   Array of value lists keyed by property name.
 * @param $mappings
 *   Array keyed by item field name; each value is a property name or an array
 *   of property names as accepted by
 *   _parser_common_syndication_RDF10_property().
 *
 * @return
 *   $mappings with every value replaced by what was found for it: a list that
 *   has at most one element is reduced to its first element (FALSE if the list
 *   is empty), any other result is kept as returned.
 */
function _parser_common_syndication_RDF10_item($rdf_data, $mappings) {
  foreach ($mappings as $field => $properties) {
    $found = _parser_common_syndication_RDF10_property($rdf_data, $properties);
    if (is_array($found) && count($found) <= 1) {
      // Collapse a single-value list to a scalar.
      $mappings[$field] = reset($found);
    }
    else {
      $mappings[$field] = $found;
    }
  }
  return $mappings;
}
/**
 * Parse RSS2.0 feeds.
 *
 * @param $feed_XML
 *   The feed document. It is read through SimpleXML-style property access
 *   and xpath() calls.
 * @return
 *   An array with the keys 'title', 'description', 'link' and 'items'. Every
 *   item is an array with the keys 'title', 'description', 'author_name',
 *   'timestamp', 'url', 'guid', 'geolocations', 'domains' and 'tags'.
 */
function _parser_common_syndication_RSS20_parse($feed_XML) {
  $ns = array(
    "content" => "http://purl.org/rss/1.0/modules/content/",
    "dc" => "http://purl.org/dc/elements/1.1/",
    "georss" => "http://www.georss.org/georss",
  );

  $parsed_source = array();
  // Detect the title.
  $parsed_source['title'] = isset($feed_XML->channel->title) ? _parser_common_syndication_title("{$feed_XML->channel->title}") : "";
  // Detect the description.
  $parsed_source['description'] = isset($feed_XML->channel->description) ? "{$feed_XML->channel->description}" : "";
  // Detect the link.
  $parsed_source['link'] = isset($feed_XML->channel->link) ? "{$feed_XML->channel->link}" : "";
  $parsed_source['items'] = array();

  foreach ($feed_XML->xpath('//item') as $news) {
    $title = $body = $original_author = $original_url = $guid = '';

    // Fetch the category elements as objects; their "domain" attribute is
    // read further down.
    $category = $news->xpath('category');

    // Get children for current namespace. The arrays start out empty so the
    // lookups below are well defined even when the version check fails.
    $content = $dc = $georss = array();
    if (version_compare(phpversion(), '5.1.2', '>')) {
      $content = (array) $news->children($ns["content"]);
      $dc = (array) $news->children($ns["dc"]);
      $georss = (array) $news->children($ns["georss"]);
    }

    $news = (array) $news;
    $news['category'] = $category;

    if (isset($news['title'])) {
      $title = "{$news['title']}";
    }

    if (isset($news['description'])) {
      $body = "{$news['description']}";
    }

    // Some sources use content:encoded as description i.e.
    // PostNuke PageSetter module. The longer of the two texts wins.
    if (isset($news['encoded'])) {  // content:encoded for PHP < 5.1.2.
      if (strlen($body) < strlen("{$news['encoded']}")) {
        $body = "{$news['encoded']}";
      }
    }
    if (isset($content['encoded'])) { // content:encoded for PHP >= 5.1.2.
      if (strlen($body) < strlen("{$content['encoded']}")) {
        $body = "{$content['encoded']}";
      }
    }
    // Note: $body is always a string at this point, so the former
    // "!isset($body)" fallback to the title could never run and was removed.

    if (!empty($news['author'])) {
      $original_author = "{$news['author']}";
    }
    elseif (!empty($dc["creator"])) {
      $original_author = (string) $dc["creator"];
    }

    if (!empty($news['link'])) {
      $original_url = "{$news['link']}";
      $guid = $original_url;
    }

    if (!empty($news['guid'])) {
      $guid = "{$news['guid']}";
    }

    // Reset the geo data for this item BEFORE reading georss:featureName.
    // Previously the reset came after it, so the feature name was always
    // thrown away and the name fell back to the coordinates.
    $lat = $lon = $latlon = $geoname = NULL;
    if (!empty($georss['featureName'])) {
      $geoname = "{$georss['featureName']}";
    }
    if (!empty($georss['point'])) {
      // The point is a "latitude longitude" pair; split on any run of
      // whitespace and ignore values that do not provide both numbers.
      $latlon = preg_split('/\s+/', trim((string) $georss['point']));
      if (count($latlon) >= 2) {
        $lat = "{$latlon[0]}";
        $lon = "{$latlon[1]}";
        if (!$geoname) {
          $geoname = "$lat $lon";
        }
      }
    }

    $additional_taxonomies = array();
    $additional_taxonomies['RSS Categories'] = array();
    $additional_taxonomies['RSS Domains'] = array();
    if (!empty($news['category'])) {
      foreach ($news['category'] as $category) {
        $additional_taxonomies['RSS Categories'][] = "{$category}";
        if (isset($category['domain'])) {
          $domain = "{$category['domain']}";
          if (!empty($domain)) {
            if (!isset($additional_taxonomies['RSS Domains'][$domain])) {
              $additional_taxonomies['RSS Domains'][$domain] = array();
            }
            // Domains reference their categories by index into 'tags'.
            $additional_taxonomies['RSS Domains'][$domain][] = count($additional_taxonomies['RSS Categories']) - 1;
          }
        }
      }
    }

    $item = array();
    $item['title'] = _parser_common_syndication_title($title, $body);
    $item['description'] = $body;
    $item['author_name'] = $original_author;
    if (!empty($news['pubDate'])) {
      $item['timestamp'] = _parser_common_syndication_parse_date($news['pubDate']);
    }
    elseif (!empty($dc['date'])) {
      $item['timestamp'] = _parser_common_syndication_parse_date($dc['date']);
    }
    else {
      $item['timestamp'] = time();
    }
    $item['url'] = trim($original_url);
    $item['guid'] = $guid;

    // A geolocation is only reported when name, latitude and longitude are
    // all known.
    $item['geolocations'] = array();
    if (isset($geoname, $lat, $lon)) {
      $item['geolocations'] = array(
        array(
          'name' => $geoname,
          'lat' => $lat,
          'lon' => $lon,
        ),
      );
    }

    $item['domains'] = $additional_taxonomies['RSS Domains'];
    $item['tags'] = $additional_taxonomies['RSS Categories'];
    $parsed_source['items'][] = $item;
  }
  return $parsed_source;
}
/**
 * Turn a date string taken from a feed into a timestamp.
 *
 * @param $date_str
 *   The date string in various formats.
 * @return
 *   The timestamp of the string or the current time if it can't be parsed.
 */
function _parser_common_syndication_parse_date($date_str) {
  // PHP < 5.3 doesn't like the GMT- notation for parsing timezones, so
  // "GMT-" and "GMT+" are reduced to the bare sign.
  $normalized = str_replace(array("GMT-", "GMT+"), array("-", "+"), $date_str);

  $timestamp = strtotime($normalized);
  $unparsed = ($timestamp === FALSE || $timestamp == -1);
  if ($unparsed) {
    // Second attempt: the W3C date/time format.
    $timestamp = _parser_common_syndication_parse_w3cdtf($normalized);
  }

  if ($timestamp === FALSE) {
    return time();
  }
  return $timestamp;
}
/**
 * Parse the W3C date/time format, a subset of ISO 8601.
 *
 * PHP date parsing functions do not handle this format.
 * See http://www.w3.org/TR/NOTE-datetime for more information.
 * Originally from MagpieRSS (http://magpierss.sourceforge.net/).
 *
 * @param $date_str
 *   A string with a potentially W3C DTF date.
 * @return
 *   A timestamp if parsed successfully or FALSE if not.
 */
function _parser_common_syndication_parse_w3cdtf($date_str) {
  // Capture groups: 1 year, 2 month, 3 day, 4 hours, 5 minutes, 6 ":SS"
  // (including the colon), 7 seconds, 8 offset sign, 9 offset hours,
  // 10 offset minutes, 11 the literal "Z".
  $pattern = '/(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2})(:(\d{2}))?(?:([-+])(\d{2}):?(\d{2})|(Z))?/';
  if (!preg_match($pattern, $date_str, $match)) {
    return FALSE;
  }

  // preg_match() leaves out trailing groups that did not take part in the
  // match; pad them so every index below can be read safely.
  $match += array_fill(0, 12, '');

  // Calculate the epoch for current date assuming GMT. The seconds are in
  // group 7; group 6 still carries the leading colon.
  $epoch = gmmktime((int) $match[4], (int) $match[5], (int) $match[7], (int) $match[2], (int) $match[3], (int) $match[1]);

  // Z is zulu time, aka GMT: nothing to adjust. Without a timezone
  // designator the offset groups are empty and the offset is zero as well.
  if ($match[11] != 'Z') {
    $offset_secs = (((int) $match[9] * 60) + (int) $match[10]) * 60;
    // Is timezone ahead of GMT? If yes, subtract offset.
    if ($match[8] == '+') {
      $offset_secs *= -1;
    }
    $epoch += $offset_secs;
  }
  return $epoch;
}
/**
 * Extract the link that points to the original content (back to site or
 * original article).
 *
 * @param $links
 *   Array of SimpleXML objects.
 * @return
 *   The "href" of the first link whose "rel" is "alternate"; without such a
 *   link, the "href" of the last link examined. An empty string when there
 *   are no links or the chosen link has no "href".
 */
function _parser_common_syndication_link($links) {
  $href = '';
  if (count($links) <= 0) {
    return $href;
  }

  foreach ($links as $element) {
    $attrs = $element->attributes();
    $href = isset($attrs["href"]) ? (string) $attrs["href"] : "";
    // Stop at the first alternate link; its href is the one to return.
    if (isset($attrs["rel"]) && (string) $attrs["rel"] == 'alternate') {
      break;
    }
  }
  return $href;
}
/**
 * Prepare raw data to be a title.
 *
 * @param $title
 *   The title found in the feed, possibly empty.
 * @param $body
 *   Optional body text used to derive a title when $title is empty.
 * @return
 *   $title when it is not empty; otherwise the first three words of the body
 *   with markup stripped, or $title unchanged when there is no body either.
 */
function _parser_common_syndication_title($title, $body = FALSE) {
  if (empty($title) && !empty($body)) {
    // Explode to words and use the first 3 words. Empty pieces are skipped:
    // a body that starts with whitespace or markup would otherwise yield an
    // empty first "word" and a title with a leading space.
    $words = preg_split('/[\s,]+/', strip_tags($body), -1, PREG_SPLIT_NO_EMPTY);
    $title = implode(' ', array_slice($words, 0, 3));
  }
  return $title;
}

View File

@@ -0,0 +1,405 @@
<?php
/**
* @file
* Download via HTTP.
*
* Support caching, HTTP Basic Authentication, detection of RSS/Atom feeds,
* redirects.
*/
/**
 * PCRE for finding the link tags in html.
 *
 * The pattern is case-insensitive and lets the dot match newlines (the "si"
 * modifiers). The repeated \x09 \x0A \x0B \x0C \x0D \x20 character class is
 * tab, line feed, vertical tab, form feed, carriage return and space.
 *
 * Capture groups:
 * - 1: the whole attribute portion following "<link".
 * - 2: the end of the tag, either ">" plus content plus a closing link tag,
 *      or a plain or self-closing ">".
 * - 3: the content between the opening and the closing link tag, if any.
 * - 4: the slash of a self-closing tag, if any.
 */
define('HTTP_REQUEST_PCRE_LINK_TAG', '/<link((?:[\x09\x0A\x0B\x0C\x0D\x20]+[^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?)*)[\x09\x0A\x0B\x0C\x0D\x20]*(>(.*)<\/link>|(\/)?>)/si');
/**
 * PCRE for matching all the attributes in a tag.
 *
 * Every match is one attribute preceded by whitespace.
 *
 * Capture groups:
 * - 1: the attribute name.
 * - 2: the value when it is enclosed in double quotes.
 * - 3: the value when it is enclosed in single quotes.
 * - 4: the value when it is not quoted.
 * Only one of the groups 2 to 4 is filled for a given attribute; none of
 * them is filled for an attribute without a value.
 */
define('HTTP_REQUEST_PCRE_TAG_ATTRIBUTES', '/[\x09\x0A\x0B\x0C\x0D\x20]+([^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*)(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"([^"]*)"|\'([^\']*)\'|([^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?/');
/**
 * Exception type for cUrl specific errors.
 *
 * It adds nothing to Exception; the dedicated class only lets callers catch
 * cUrl failures separately from other exceptions.
 */
class HRCurlException extends Exception {
}
/**
 * Discover RSS or atom feeds at the given URL. If document in given URL is an
 * HTML document, function attempts to discover RSS or Atom feeds.
 *
 * @param string $url
 *   The url of the feed to retrieve.
 * @param array $settings
 *   An optional array of settings. Valid options are: accept_invalid_cert.
 *
 * @return bool|string|null
 *   The discovered feed URL; FALSE if the download did not end with code
 *   200; NULL if the document is not a feed and no usable feed link was
 *   found in it.
 */
function http_request_get_common_syndication($url, $settings = NULL) {
  $accept_invalid_cert = isset($settings['accept_invalid_cert']) ? $settings['accept_invalid_cert'] : FALSE;
  $download = http_request_get($url, NULL, NULL, $accept_invalid_cert);

  // Cannot get the feed, return.
  // http_request_get() always returns 200 even if its 304.
  if ($download->code != 200) {
    return FALSE;
  }

  // Drop the data into a separate variable so all manipulations of the html
  // will not affect the actual object that exists in the static cache.
  // @see http_request_get.
  $downloaded_string = $download->data;

  // A response is not required to carry a Content-Type header; treat a
  // missing one as an empty type instead of reading an undefined index.
  $content_type = isset($download->headers['content-type']) ? $download->headers['content-type'] : '';

  // If this happens to be a feed then just return the url.
  if (http_request_is_feed($content_type, $downloaded_string)) {
    return $url;
  }

  $discovered_feeds = http_request_find_feeds($downloaded_string);
  foreach ($discovered_feeds as $feed_url) {
    $absolute = http_request_create_absolute_url($feed_url, $url);
    if (!empty($absolute)) {
      // @TODO: something more intelligent?
      return $absolute;
    }
  }

  // Nothing was discovered. NULL (not FALSE) is kept on purpose: it is what
  // this function has always returned here.
  return NULL;
}
/**
 * Get the content from the given URL.
 *
 * Results are cached twice: in a static variable for the current page
 * request and through cache_set(). A cached result is sent back to the
 * server as "If-None-Match" / "If-Modified-Since" and is returned, flagged
 * with from_cache = TRUE, when the server answers 304.
 *
 * @param string $url
 *   A valid URL (not only web URLs).
 * @param string $username
 *   If the URL uses authentication, supply the username.
 * @param string $password
 *   If the URL uses authentication, supply the password.
 * @param bool $accept_invalid_cert
 *   Whether to accept invalid certificates.
 * @return stdClass
 *   An object that describes the data downloaded from $url. It always has a
 *   "code" property; "data" and "headers" are present after a completed
 *   request, "error" when the URL scheme cannot be handled by cURL.
 *
 * @throws HRCurlException
 *   When cURL reports an error for the transfer.
 */
function http_request_get($url, $username = NULL, $password = NULL, $accept_invalid_cert = FALSE) {
  // Intra-pagedownload cache, avoid to download the same content twice within
  // one page download (it's possible, compatible and parse calls).
  static $download_cache = array();
  if (isset($download_cache[$url])) {
    return $download_cache[$url];
  }

  // Both caches are keyed by the URL exactly as the caller passed it, i.e.
  // before the feed:// and webcal:// rewrite below. Reading with the raw URL
  // but writing with the rewritten one meant such URLs never hit the cache
  // and could not be cleared by http_request_clear_cache().
  $requested_url = $url;
  $cache_id = 'feeds_http_download_' . md5($requested_url);

  if (!$username && valid_url($url, TRUE)) {
    // Handle password protected feeds.
    $url_parts = parse_url($url);
    if (!empty($url_parts['user'])) {
      // A URL may name a user without a password.
      $password = isset($url_parts['pass']) ? $url_parts['pass'] : NULL;
      $username = $url_parts['user'];
    }
  }

  $curl = http_request_use_curl();

  // Only download and parse data if really needs refresh.
  // Based on "Last-Modified" and "If-Modified-Since".
  // cURL takes a list of "Name: value" strings, drupal_http_request() an
  // array keyed by header name.
  $headers = array();
  if ($cache = cache_get($cache_id)) {
    $last_result = $cache->data;
    // A cached error result carries no headers.
    $last_headers = isset($last_result->headers) ? array_change_key_case($last_result->headers) : array();

    if (!empty($last_headers['etag'])) {
      if ($curl) {
        $headers[] = 'If-None-Match: ' . $last_headers['etag'];
      }
      else {
        $headers['If-None-Match'] = $last_headers['etag'];
      }
    }
    if (!empty($last_headers['last-modified'])) {
      if ($curl) {
        $headers[] = 'If-Modified-Since: ' . $last_headers['last-modified'];
      }
      else {
        $headers['If-Modified-Since'] = $last_headers['last-modified'];
      }
    }
  }

  // Credentials for the non-cURL path. This must not depend on a cache entry
  // being present, otherwise the very first request is sent unauthenticated.
  if (!empty($username) && !$curl) {
    $headers['Authorization'] = 'Basic ' . base64_encode("$username:$password");
  }

  // Support the 'feed' and 'webcal' schemes by converting them into 'http'.
  $url = strtr($url, array('feed://' => 'http://', 'webcal://' => 'http://'));

  if ($curl) {
    $headers[] = 'User-Agent: Drupal (+http://drupal.org/)';
    $result = new stdClass();

    // Parse the URL and make sure we can handle the schema.
    // cURL can only support either http:// or https://.
    // CURLOPT_PROTOCOLS is only supported with cURL 7.19.4
    $uri = parse_url($url);
    if (!isset($uri['scheme'])) {
      $result->error = 'missing schema';
      $result->code = -1002;
    }
    else {
      switch ($uri['scheme']) {
        case 'http':
        case 'https':
          // Valid scheme.
          break;

        default:
          $result->error = 'invalid schema ' . $uri['scheme'];
          $result->code = -1003;
          break;
      }
    }

    // If the scheme was valid, continue to request the feed using cURL.
    if (empty($result->error)) {
      $download = curl_init($url);
      curl_setopt($download, CURLOPT_FOLLOWLOCATION, TRUE);
      if (!empty($username)) {
        curl_setopt($download, CURLOPT_USERPWD, "{$username}:{$password}");
        curl_setopt($download, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
      }
      curl_setopt($download, CURLOPT_HTTPHEADER, $headers);
      curl_setopt($download, CURLOPT_HEADER, TRUE);
      curl_setopt($download, CURLOPT_RETURNTRANSFER, TRUE);
      curl_setopt($download, CURLOPT_ENCODING, '');
      curl_setopt($download, CURLOPT_TIMEOUT, variable_get('http_request_timeout', 30));
      if ($accept_invalid_cert) {
        curl_setopt($download, CURLOPT_SSL_VERIFYPEER, 0);
      }

      $data = curl_exec($download);
      if (curl_error($download)) {
        // Read the error details, then release the handle before throwing so
        // it is not leaked.
        $curl_errno = curl_errno($download);
        $curl_error = curl_error($download);
        curl_close($download);
        throw new HRCurlException(
          t('cURL error (@code) @error for @url', array(
            '@code' => $curl_errno,
            '@error' => $curl_error,
            '@url' => $url
          )), $curl_errno
        );
      }

      // The response starts with the headers (CURLOPT_HEADER); split them
      // from the body at the size cURL reports.
      $header_size = curl_getinfo($download, CURLINFO_HEADER_SIZE);
      $header = substr($data, 0, $header_size - 1);
      $result->data = substr($data, $header_size);

      // Redirects produce one header block per response; only the last block
      // describes the returned body.
      $header_blocks = preg_split("/(\r\n){2}/", $header);
      $header_lines = preg_split("/\r\n|\n|\r/", end($header_blocks));
      $result->headers = array();
      array_shift($header_lines); // skip HTTP response status

      while ($line = trim((string) array_shift($header_lines))) {
        // Ignore lines that are not "name: value" pairs.
        if (strpos($line, ':') === FALSE) {
          continue;
        }
        list($header, $value) = explode(':', $line, 2);
        // Normalize the headers.
        $header = strtolower($header);

        if (isset($result->headers[$header]) && $header == 'set-cookie') {
          // RFC 2109: the Set-Cookie response header comprises the token Set-
          // Cookie:, followed by a comma-separated list of one or more cookies.
          $result->headers[$header] .= ',' . trim($value);
        }
        else {
          $result->headers[$header] = trim($value);
        }
      }
      $result->code = curl_getinfo($download, CURLINFO_HTTP_CODE);

      curl_close($download);
    }
  }
  else {
    $result = drupal_http_request($url, array('headers' => $headers, 'timeout' => variable_get('http_request_timeout', 30)));
  }

  $result->code = isset($result->code) ? $result->code : 200;

  // In case of 304 Not Modified try to return cached data.
  if ($result->code == 304) {
    if (isset($last_result)) {
      $last_result->from_cache = TRUE;
      return $last_result;
    }
    // It's a tragedy, this file must exist and contain good data.
    // In this case, clear cache and repeat, keeping the caller's
    // certificate setting for the second attempt.
    cache_clear_all($cache_id, 'cache');
    return http_request_get($requested_url, $username, $password, $accept_invalid_cert);
  }

  // Set caches.
  cache_set($cache_id, $result);
  $download_cache[$requested_url] = $result;

  return $result;
}
/**
 * Decides if it's possible to use cURL or not.
 *
 * @return bool
 *   TRUE if curl is available, FALSE otherwise.
 */
function http_request_use_curl() {
  // Site administrators may switch cURL off through the
  // 'feeds_never_use_curl' variable.
  if (variable_get('feeds_never_use_curl', FALSE)) {
    return FALSE;
  }

  // The cURL extension has to be loaded ...
  if (!function_exists('curl_init')) {
    return FALSE;
  }

  // ... and neither safe_mode nor open_basedir may be set.
  if (ini_get('safe_mode')) {
    return FALSE;
  }
  $open_basedir = ini_get("open_basedir");
  return empty($open_basedir);
}
/**
 * Clear cache for a specific URL.
 *
 * @param string $url
 *   The URL whose cached download should be removed. The cache ID is built
 *   from the md5 hash of this string.
 */
function http_request_clear_cache($url) {
  $cid = 'feeds_http_download_' . md5($url);
  cache_clear_all($cid, 'cache');
}
/**
 * Returns if the provided $content_type is a feed.
 *
 * @param string $content_type
 *   The Content-Type header.
 *
 * @param string $data
 *   The actual data from the http request. Currently not inspected.
 *
 * @return bool
 *   Returns TRUE if this is a parsable feed, i.e. if the media type contains
 *   "xml".
 */
function http_request_is_feed($content_type, $data) {
  // Keep only the media type: anything from the first ";" on (for example a
  // charset parameter) is cut off.
  $semicolon = strpos($content_type, ';');
  $media_type = ($semicolon === FALSE) ? $content_type : substr($content_type, 0, $semicolon);

  // @TODO: Sometimes the content-type can be text/html but still be a valid
  // feed.
  return strpos(strtolower($media_type), 'xml') !== FALSE;
}
/**
 * Finds potential feed tags in the HTML document.
 *
 * A link tag qualifies when its "rel" attribute is "alternate", its "type"
 * attribute contains "xml" and it has an "href" attribute.
 *
 * @param string $html
 *   The html string to search.
 *
 * @return array
 *   An array of href to feeds. The hrefs are entity-decoded but otherwise
 *   returned as written in the document.
 */
function http_request_find_feeds($html) {
  $matches = array();
  preg_match_all(HTTP_REQUEST_PCRE_LINK_TAG, $html, $matches);
  // Group 1 holds the attribute portion of every link tag found.
  $links = isset($matches[1]) ? $matches[1] : array();
  $valid_links = array();

  // Build up all the links information.
  foreach ($links as $link_tag) {
    $attributes = array();
    $candidate = array();

    preg_match_all(HTTP_REQUEST_PCRE_TAG_ATTRIBUTES, $link_tag, $attributes, PREG_SET_ORDER);
    foreach ($attributes as $attribute) {
      if (empty($attribute[1])) {
        continue;
      }

      // attribute[1] is the key. The value is in attribute[2] when it was
      // written in double quotes, in attribute[3] for single quotes and in
      // attribute[4] when it was not quoted at all.
      $value = '';
      foreach (array(2, 3, 4) as $group) {
        if (!empty($attribute[$group])) {
          $value = $attribute[$group];
          break;
        }
      }
      if ($value === '') {
        continue;
      }

      $name = drupal_strtolower($attribute[1]);
      $value = decode_entities($value);
      // "rel" and "type" are compared case-insensitively below. The href is
      // handed back to the caller and must keep its case, because URL paths
      // are case-sensitive.
      $candidate[$name] = ($name == 'href') ? $value : drupal_strtolower($value);
    }

    // Examine candidate to see if it's a feed.
    // @TODO: could/should use http_request_is_feed ??
    if (isset($candidate['rel']) && $candidate['rel'] == 'alternate') {
      if (isset($candidate['href']) && isset($candidate['type']) && strpos($candidate['type'], 'xml') !== FALSE) {
        // All tests pass, it's a valid candidate.
        $valid_links[] = $candidate['href'];
      }
    }
  }

  return $valid_links;
}
/**
 * Create an absolute url.
 *
 * Relative references are resolved against $base_url: scheme, credentials,
 * host and port are taken from the base, "." and ".." segments are resolved,
 * and a query string or fragment on $url is carried over unchanged.
 *
 * @param string $url
 *   The href to transform.
 * @param string $base_url
 *   The url to be used as the base for a relative $url.
 *
 * @return string|bool
 *   An absolute url, or FALSE if $url is not a valid url or no valid
 *   absolute url could be built from it.
 */
function http_request_create_absolute_url($url, $base_url) {
  $url = trim($url);
  if (valid_url($url, TRUE)) {
    // Valid absolute url already.
    return $url;
  }

  // Anything that is not a valid relative url cannot be resolved.
  if ($url === '' || !valid_url($url, FALSE)) {
    return FALSE;
  }

  $parsed_url = parse_url($base_url);
  if (!is_array($parsed_url)) {
    return FALSE;
  }

  // Scheme-relative reference ("//host/path"): only the scheme is inherited.
  if (substr($url, 0, 2) == '//') {
    if (isset($parsed_url['scheme'])) {
      $absolute_url = $parsed_url['scheme'] . ':' . $url;
      if (valid_url($absolute_url, TRUE)) {
        return $absolute_url;
      }
    }
    return FALSE;
  }

  // Split the reference into its path and the "?query#fragment" tail, so
  // slashes or dots inside the tail are never mistaken for path segments.
  $tail_start = strcspn($url, '?#');
  $reference_path = (string) substr($url, 0, $tail_start);
  $tail = (string) substr($url, $tail_start);
  $base_path = isset($parsed_url['path']) ? $parsed_url['path'] : '/';

  if ($reference_path === '') {
    // Query- or fragment-only reference: the base path is kept as it is, and
    // a fragment-only reference also keeps the base query.
    $segments = explode('/', $base_path);
    if ($tail[0] == '#' && isset($parsed_url['query'])) {
      $tail = '?' . $parsed_url['query'] . $tail;
    }
  }
  elseif ($reference_path[0] == '/') {
    // Path-absolute reference: replaces the base path completely.
    $segments = explode('/', $reference_path);
  }
  else {
    // Relative path: append to the base path up to its last slash. Unlike
    // dirname(), this keeps the last directory of a base such as "/blog/".
    $last_slash = strrpos($base_path, '/');
    $base_dir = ($last_slash === FALSE) ? '' : (string) substr($base_path, 0, $last_slash);
    $segments = array_merge(explode('/', $base_dir), explode('/', $reference_path));
  }

  // Resolve "." and ".." with a stack so that consecutive ".." segments walk
  // up one level each. Empty segments are dropped; a segment "0" is kept.
  $resolved = array();
  foreach ($segments as $segment) {
    if ($segment === '' || $segment === '.') {
      continue;
    }
    if ($segment === '..') {
      array_pop($resolved);
      continue;
    }
    $resolved[] = $segment;
  }
  $path = implode('/', $resolved);

  // Keep a trailing slash when the reference pointed at a directory.
  $last_segment = end($segments);
  if ($path !== '' && in_array($last_segment, array('', '.', '..'), TRUE)) {
    $path .= '/';
  }

  // Build the prefix to the path.
  $absolute_url = '';
  if (isset($parsed_url['scheme'])) {
    $absolute_url = $parsed_url['scheme'] . '://';
  }
  if (isset($parsed_url['user'])) {
    $absolute_url .= $parsed_url['user'];
    if (isset($parsed_url['pass'])) {
      $absolute_url .= ':' . $parsed_url['pass'];
    }
    $absolute_url .= '@';
  }
  if (isset($parsed_url['host'])) {
    $absolute_url .= $parsed_url['host'];
    if (isset($parsed_url['port'])) {
      $absolute_url .= ':' . $parsed_url['port'];
    }
    $absolute_url .= '/';
  }

  $absolute_url .= $path . $tail;
  if (valid_url($absolute_url, TRUE)) {
    return $absolute_url;
  }
  return FALSE;
}

View File

@@ -0,0 +1,46 @@
<?php
/**
* @file
* OPML Parser.
*/
/**
 * Parse OPML file.
 *
 * @param $raw
 *   File contents.
 * @return
 *   An array of the parsed OPML file with the keys 'title' (text of the
 *   head/title element, or an empty string) and 'items'. Every item holds
 *   the lower-cased keys 'title' and 'xmlurl', plus 'text' when the outline
 *   has that attribute. Outlines without a feed URL, or with neither a title
 *   nor a text, are skipped.
 */
function opml_parser_parse($raw) {
  $feeds = $items = array();
  $xml = @ new SimpleXMLElement($raw);

  $titles = $xml->xpath('//head/title');
  $feeds['title'] = $titles ? (string) reset($titles) : '';

  // Select every outline. Whether it points to a feed is decided below on
  // the lower-cased attribute names, which makes the check independent of
  // how the "xmlUrl" attribute is capitalised in the file.
  $outlines = $xml->xpath('//outline');
  if (!$outlines) {
    $outlines = array();
  }

  foreach ($outlines as $outline) {
    $item = array();
    foreach ($outline->attributes() as $k => $v) {
      $key = strtolower($k);
      if (in_array($key, array('title', 'text', 'xmlurl'), TRUE)) {
        $item[$key] = (string) $v;
      }
    }

    // If no title, forge it from text.
    if (!isset($item['title']) && isset($item['text'])) {
      // Texts of 40 or more characters are cut to 30 characters. Characters
      // are counted and cut with the "u" modifier, so a multi-byte UTF-8
      // character is never split in half.
      $matched = preg_match('/^(.{30}).{10}/us', $item['text'], $parts);
      if ($matched === 1) {
        $item['title'] = trim($parts[1]) . ' ...';
      }
      elseif ($matched === 0) {
        $item['title'] = $item['text'];
      }
      elseif (strlen($item['text']) < 40) {
        // preg_match() failed (e.g. invalid UTF-8): fall back to bytes.
        $item['title'] = $item['text'];
      }
      else {
        $item['title'] = trim(substr($item['text'], 0, 30)) . ' ...';
      }
    }

    if (isset($item['title']) && isset($item['xmlurl'])) {
      $items[] = $item;
    }
  }
  $feeds['items'] = $items;
  return $feeds;
}