http_request.inc 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. <?php
  2. /**
  3. * @file
  4. * Download via HTTP.
  5. *
  6. * Support caching, HTTP Basic Authentication, detection of RSS/Atom feeds,
  7. * redirects.
  8. */
  9. /**
  10. * PCRE for finding the <link> tags in an HTML string.
      *
      * The pattern is case-insensitive and lets "." match newlines (the "si"
      * modifiers). Whitespace is spelled out as the bytes \x09, \x0A, \x0B, \x0C,
      * \x0D and \x20 (tab, LF, VT, FF, CR, space).
      *
      * Capture groups:
      * - 1: the raw attribute string of the tag (everything between "<link" and
      *   the closing of the start tag); feed it to
      *   HTTP_REQUEST_PCRE_TAG_ATTRIBUTES to split it into attributes.
      * - 2: the tail of the tag, either ">...</link>", "/>" or ">".
      * - 3: the element content when the "</link>" form matched.
      * - 4: the "/" of a self-closing tag, when present.
  11. */
  12. define('HTTP_REQUEST_PCRE_LINK_TAG', '/<link((?:[\x09\x0A\x0B\x0C\x0D\x20]+[^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?)*)[\x09\x0A\x0B\x0C\x0D\x20]*(>(.*)<\/link>|(\/)?>)/si');
  13. /**
  14. * PCRE for matching all the attributes in a tag's attribute string.
      *
      * Every match must be preceded by whitespace, so the subject is expected to
      * start with the whitespace that follows the tag name.
      *
      * Capture groups:
      * - 1: the attribute name.
      * - 2: the value when it was written in double quotes.
      * - 3: the value when it was written in single quotes.
      * - 4: the value when it was written without quotes.
      * At most one of groups 2-4 is non-empty for a given match; an attribute
      * without "=value" leaves all three empty.
  15. */
  16. define('HTTP_REQUEST_PCRE_TAG_ATTRIBUTES', '/[\x09\x0A\x0B\x0C\x0D\x20]+([^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*)(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"([^"]*)"|\'([^\']*)\'|([^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?/');
  17. /**
  18. * Exception for cURL-specific errors.
      *
      * Thrown by http_request_get() when curl_error() reports a failure after
      * curl_exec(). The message carries the cURL error number, the cURL error
      * text and the requested URL; the exception code is the curl_errno() value.
  19. */
  20. class HRCurlException extends Exception {}
  /**
   * Discovers an RSS or Atom feed at the given URL.
   *
   * The document at $url is downloaded. If it is itself a feed, $url is
   * returned. Otherwise the document is treated as HTML and searched for feed
   * <link> tags; the first one that can be turned into an absolute URL wins.
   *
   * @param string $url
   *   The URL of the feed (or of an HTML page advertising a feed) to retrieve.
   * @param array $settings
   *   An optional array of settings. Valid options are: accept_invalid_cert.
   *
   * @return bool|string
   *   The discovered feed URL, or FALSE if the URL is not reachable, there was
   *   an error, or no feed could be discovered.
   *
   * @throws HRCurlException
   *   Propagated from http_request_get() when the cURL transfer fails.
   */
  function http_request_get_common_syndication($url, $settings = NULL) {
    $accept_invalid_cert = isset($settings['accept_invalid_cert']) ? $settings['accept_invalid_cert'] : FALSE;
    $download = http_request_get($url, NULL, NULL, $accept_invalid_cert);

    // Cannot get the feed, return.
    // A 304 response is answered by http_request_get() with the cached result,
    // so a successful download is expected to carry code 200 here.
    if ($download->code != 200) {
      return FALSE;
    }

    // Work on a copy of the body so that manipulating the HTML never affects
    // the object that lives in http_request_get()'s static cache.
    $downloaded_string = $download->data;

    // The Content-Type header is optional; look it up case-insensitively and
    // fall back to an empty string instead of reading an undefined index.
    $response_headers = (isset($download->headers) && is_array($download->headers)) ? array_change_key_case($download->headers) : array();
    $content_type = isset($response_headers['content-type']) ? $response_headers['content-type'] : '';

    // If this happens to be a feed then just return the URL.
    if (http_request_is_feed($content_type, $downloaded_string)) {
      return $url;
    }

    foreach (http_request_find_feeds($downloaded_string) as $feed_url) {
      $absolute = http_request_create_absolute_url($feed_url, $url);
      if (!empty($absolute)) {
        // @TODO: something more intelligent than "first usable link wins"?
        return $absolute;
      }
    }

    // Nothing usable was advertised: honour the documented FALSE return
    // instead of falling off the end of the function with NULL.
    return FALSE;
  }
  /**
   * Gets the content from the given URL.
   *
   * Results are cached twice: in a static per-request cache and in Drupal's
   * persistent cache. The persistent copy is used to send conditional request
   * headers (If-None-Match / If-Modified-Since) and is returned when the server
   * answers 304 Not Modified. Both caches are keyed on the URL exactly as it
   * was passed in, i.e. before the feed:// and webcal:// schemes are rewritten,
   * so that lookups, writes and http_request_clear_cache() agree on the key.
   *
   * @param string $url
   *   A valid URL (not only web URLs). Credentials embedded in the URL are used
   *   when $username is empty.
   * @param string $username
   *   If the URL uses authentication, supply the username.
   * @param string $password
   *   If the URL uses authentication, supply the password.
   * @param bool $accept_invalid_cert
   *   Whether to accept invalid certificates (cURL only: disables peer
   *   verification).
   *
   * @return stdClass
   *   An object that describes the data downloaded from $url. It always has a
   *   "code" property; on success it also has "data" and "headers" (header
   *   names lower-cased in the cURL path). When the URL scheme cannot be
   *   handled by cURL it has "error" and a negative "code" instead. Objects
   *   served from the persistent cache after a 304 have "from_cache" set.
   *
   * @throws HRCurlException
   *   When cURL reports an error for the transfer.
   */
  function http_request_get($url, $username = NULL, $password = NULL, $accept_invalid_cert = FALSE) {
    // Intra-page download cache: avoid downloading the same content twice
    // within one page request.
    static $download_cache = array();
    if (isset($download_cache[$url])) {
      return $download_cache[$url];
    }

    // Keep the URL as requested. It is the key of both caches; $url itself is
    // rewritten further down (feed:// and webcal:// become http://).
    $requested_url = $url;
    $cid = 'feeds_http_download_' . md5($requested_url);

    if (!$username && valid_url($url, TRUE)) {
      // Handle password protected feeds whose credentials are part of the URL.
      $url_parts = parse_url($url);
      if (!empty($url_parts['user'])) {
        $username = $url_parts['user'];
        // "user@host" URLs have no password component.
        $password = isset($url_parts['pass']) ? $url_parts['pass'] : '';
      }
    }

    $curl = http_request_use_curl();

    // Only download and parse data if it really needs a refresh, based on the
    // ETag and Last-Modified headers of the previous response. cURL expects a
    // list of "Name: value" strings, drupal_http_request() a name => value map.
    $headers = array();
    if ($cache = cache_get($cid)) {
      $last_result = $cache->data;
      // Error results are cached as well and may not carry any headers.
      $last_headers = (isset($last_result->headers) && is_array($last_result->headers)) ? array_change_key_case($last_result->headers) : array();
      if (!empty($last_headers['etag'])) {
        if ($curl) {
          $headers[] = 'If-None-Match: ' . $last_headers['etag'];
        }
        else {
          $headers['If-None-Match'] = $last_headers['etag'];
        }
      }
      if (!empty($last_headers['last-modified'])) {
        if ($curl) {
          $headers[] = 'If-Modified-Since: ' . $last_headers['last-modified'];
        }
        else {
          $headers['If-Modified-Since'] = $last_headers['last-modified'];
        }
      }
    }

    // Without cURL the credentials travel in an explicit Basic Authorization
    // header. This must not depend on a cached copy being present, otherwise
    // the very first request to a protected feed is sent unauthenticated.
    if (!empty($username) && !$curl) {
      $headers['Authorization'] = 'Basic ' . base64_encode("$username:$password");
    }

    // Support the 'feed' and 'webcal' schemes by converting them into 'http'.
    $url = strtr($url, array('feed://' => 'http://', 'webcal://' => 'http://'));

    if ($curl) {
      $headers[] = 'User-Agent: Drupal (+http://drupal.org/)';
      $result = new stdClass();

      // Parse the URL and make sure we can handle the scheme: only http:// and
      // https:// are requested through cURL. This is checked by hand because
      // CURLOPT_PROTOCOLS is only supported from cURL 7.19.4 on.
      $uri = parse_url($url);
      if (!isset($uri['scheme'])) {
        $result->error = 'missing schema';
        $result->code = -1002;
      }
      else {
        switch ($uri['scheme']) {
          case 'http':
          case 'https':
            // Valid scheme.
            break;

          default:
            $result->error = 'invalid schema ' . $uri['scheme'];
            $result->code = -1003;
            break;
        }
      }

      // If the scheme was valid, continue to request the feed using cURL.
      if (empty($result->error)) {
        $download = curl_init($url);
        curl_setopt($download, CURLOPT_FOLLOWLOCATION, TRUE);
        if (!empty($username)) {
          curl_setopt($download, CURLOPT_USERPWD, "{$username}:{$password}");
          curl_setopt($download, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
        }
        curl_setopt($download, CURLOPT_HTTPHEADER, $headers);
        // Ask for the response headers in front of the body; they are split
        // off again below using CURLINFO_HEADER_SIZE.
        curl_setopt($download, CURLOPT_HEADER, TRUE);
        curl_setopt($download, CURLOPT_RETURNTRANSFER, TRUE);
        // An empty string lets cURL offer every encoding it supports.
        curl_setopt($download, CURLOPT_ENCODING, '');
        curl_setopt($download, CURLOPT_TIMEOUT, variable_get('http_request_timeout', 30));
        if ($accept_invalid_cert) {
          curl_setopt($download, CURLOPT_SSL_VERIFYPEER, 0);
        }

        $data = curl_exec($download);
        if (curl_error($download)) {
          // Collect the details first so the handle can be released before
          // the exception leaves this function.
          $error_code = curl_errno($download);
          $message = t('cURL error (@code) @error for @url', array(
            '@code' => $error_code,
            '@error' => curl_error($download),
            '@url' => $url
          ));
          curl_close($download);
          throw new HRCurlException($message, $error_code);
        }

        $header_size = curl_getinfo($download, CURLINFO_HEADER_SIZE);
        $raw_header = substr($data, 0, $header_size - 1);
        $result->data = (string) substr($data, $header_size);

        // When redirects were followed the header section holds one block per
        // response; only the last block belongs to the final response.
        $header_blocks = preg_split("/(\r\n){2}/", $raw_header);
        $header_lines = preg_split("/\r\n|\n|\r/", end($header_blocks));
        $result->headers = array();
        // Skip the HTTP response status line.
        array_shift($header_lines);
        foreach ($header_lines as $line) {
          $line = trim($line);
          if ($line === '') {
            // End of the header block.
            break;
          }
          if (strpos($line, ':') === FALSE) {
            // Not a "Name: value" line; nothing to record.
            continue;
          }
          list($name, $value) = explode(':', $line, 2);
          // Normalize the header names.
          $name = strtolower($name);
          if (isset($result->headers[$name]) && $name == 'set-cookie') {
            // RFC 2109: the Set-Cookie response header comprises the token
            // Set-Cookie:, followed by a comma-separated list of one or more
            // cookies.
            $result->headers[$name] .= ',' . trim($value);
          }
          else {
            $result->headers[$name] = trim($value);
          }
        }

        $result->code = curl_getinfo($download, CURLINFO_HTTP_CODE);
        curl_close($download);
      }
    }
    else {
      $result = drupal_http_request($url, array('headers' => $headers, 'timeout' => variable_get('http_request_timeout', 30)));
    }

    $result->code = isset($result->code) ? $result->code : 200;

    // In case of 304 Not Modified try to return cached data.
    if ($result->code == 304) {
      if (isset($last_result)) {
        $last_result->from_cache = TRUE;
        return $last_result;
      }
      // A 304 without a cached copy: clear the cache entry and repeat the
      // request with the same options, including the certificate setting.
      cache_clear_all($cid, 'cache');
      return http_request_get($requested_url, $username, $password, $accept_invalid_cert);
    }

    // Set caches, keyed on the URL as it was requested.
    cache_set($cid, $result);
    $download_cache[$requested_url] = $result;

    return $result;
  }
  /**
   * Tells whether downloads should go through cURL.
   *
   * cURL is used only when the site has not opted out through the
   * 'feeds_never_use_curl' variable, the cURL extension is loaded, and neither
   * safe_mode nor open_basedir is configured.
   * NOTE(review): the last two are presumably excluded because PHP restricts
   * following redirects with cURL under those settings; confirm.
   *
   * @return bool
   *   TRUE if cURL is available and allowed, FALSE otherwise.
   */
  function http_request_use_curl() {
    // Site administrators may choose to never use cURL.
    if (variable_get('feeds_never_use_curl', FALSE)) {
      return FALSE;
    }

    // The extension has to be present on the system.
    if (!function_exists('curl_init')) {
      return FALSE;
    }

    if (ini_get('safe_mode')) {
      return FALSE;
    }

    $open_basedir = ini_get("open_basedir");
    return empty($open_basedir);
  }
  /**
   * Removes the persistently cached download for a specific URL.
   *
   * @param string $url
   *   The URL whose cached download should be discarded.
   */
  function http_request_clear_cache($url) {
    // Cache ids are the fixed prefix followed by the MD5 hash of the URL.
    $cid = 'feeds_http_download_' . md5($url);
    cache_clear_all($cid, 'cache');
  }
  /**
   * Returns whether the provided Content-Type denotes a feed.
   *
   * Only the media type is inspected: parameters after the first ";" (such as
   * "charset=utf-8") are ignored, and any media type containing "xml" counts
   * as a feed.
   *
   * @param string $content_type
   *   The Content-Type header. A missing header (NULL) or any other non-scalar
   *   value is treated as an empty string.
   * @param string $data
   *   The actual data from the HTTP request. Currently not inspected.
   *
   * @return bool
   *   Returns TRUE if this is a parsable feed.
   */
  function http_request_is_feed($content_type, $data) {
    // The header is optional, so callers may hand in NULL. Normalize first
    // rather than passing a non-string to the string functions below.
    $content_type = is_scalar($content_type) ? (string) $content_type : '';

    // Cut off media type parameters.
    $pos = strpos($content_type, ';');
    if ($pos !== FALSE) {
      $content_type = substr($content_type, 0, $pos);
    }
    $content_type = strtolower(trim($content_type));

    // @TODO: Sometimes the content-type can be text/html but still be a valid
    // feed.
    return strpos($content_type, 'xml') !== FALSE;
  }
  /**
   * Finds potential feed links in an HTML document.
   *
   * A <link> tag is considered a feed link when its rel attribute is
   * "alternate" (compared case-insensitively), it has an href, and its type
   * attribute contains "xml". Attribute values may be double-quoted,
   * single-quoted or unquoted. The href is returned with HTML entities decoded
   * and its case preserved, since URL paths and queries are case-sensitive.
   *
   * @param string $html
   *   The HTML string to search.
   *
   * @return array
   *   An array of href values pointing to feeds, in document order. The values
   *   may be relative URLs.
   */
  function http_request_find_feeds($html) {
    $matches = array();
    preg_match_all(HTTP_REQUEST_PCRE_LINK_TAG, $html, $matches);
    // No <link> tags, or the match failed: nothing to report.
    if (empty($matches[1])) {
      return array();
    }

    $valid_links = array();

    // $matches[1] holds the raw attribute string of every <link> tag.
    foreach ($matches[1] as $link_tag) {
      $attributes = array();
      preg_match_all(HTTP_REQUEST_PCRE_TAG_ATTRIBUTES, $link_tag, $attributes, PREG_SET_ORDER);

      // Build up the attribute name => value map for this tag.
      $candidate = array();
      foreach ($attributes as $attribute) {
        // $attribute[1] is the name. The value is in group 2 (double-quoted),
        // group 3 (single-quoted) or group 4 (unquoted); groups after the last
        // one that matched are absent from the array.
        $value = '';
        for ($group = 2; $group <= 4; $group++) {
          if (isset($attribute[$group]) && $attribute[$group] !== '') {
            $value = $attribute[$group];
            break;
          }
        }
        if ($attribute[1] === '' || $value === '') {
          continue;
        }
        // Attribute names are case-insensitive; values keep their case here
        // and are lower-cased only where they are compared.
        $candidate[drupal_strtolower($attribute[1])] = decode_entities($value);
      }

      // Examine candidate to see if it is a feed.
      // @TODO: could/should use http_request_is_feed ??
      if (!isset($candidate['rel']) || !isset($candidate['href']) || !isset($candidate['type'])) {
        continue;
      }
      if (drupal_strtolower($candidate['rel']) == 'alternate' && strpos(drupal_strtolower($candidate['type']), 'xml') !== FALSE) {
        // All tests pass, it is a valid candidate.
        $valid_links[] = $candidate['href'];
      }
    }

    return $valid_links;
  }
  /**
   * Creates an absolute URL.
   *
   * An already absolute $url is returned unchanged. A relative $url is resolved
   * against $base_url:
   * - "//host/path" takes only the scheme from the base;
   * - "/path" replaces the base path;
   * - anything else is appended to the base path up to its last "/", after
   *   which "." and ".." segments are resolved;
   * - a query string and/or fragment on $url is carried over untouched.
   * Scheme, user, password, host and port are taken from $base_url.
   *
   * @param string $url
   *   The href to transform.
   * @param string $base_url
   *   The URL to be used as the base for a relative $url.
   *
   * @return string|false
   *   An absolute URL, or FALSE if $url is not a valid URL or no valid absolute
   *   URL could be built from it.
   */
  function http_request_create_absolute_url($url, $base_url) {
    $url = trim($url);
    if (valid_url($url, TRUE)) {
      // Valid absolute URL already.
      return $url;
    }
    if (!valid_url($url, FALSE)) {
      return FALSE;
    }

    // Turn the relative URL into an absolute one.
    $parsed_url = parse_url($base_url);
    if (!is_array($parsed_url)) {
      // The base is too malformed to be parsed.
      return FALSE;
    }

    // Scheme-relative reference: only the scheme comes from the base.
    if (strpos($url, '//') === 0 && isset($parsed_url['scheme'])) {
      $scheme_relative = $parsed_url['scheme'] . ':' . $url;
      return valid_url($scheme_relative, TRUE) ? $scheme_relative : FALSE;
    }

    // Split the query string / fragment off the reference so that slashes
    // inside them are never mistaken for path separators.
    $suffix = '';
    $relative_path = $url;
    $suffix_start = strcspn($url, '?#');
    if ($suffix_start < strlen($url)) {
      $suffix = substr($url, $suffix_start);
      $relative_path = (string) substr($url, 0, $suffix_start);
    }

    $base_path = isset($parsed_url['path']) ? $parsed_url['path'] : '/';
    if ($relative_path === '') {
      // Only a query or fragment was given: the base path is kept as is.
      $segments = explode('/', $base_path);
    }
    elseif ($relative_path[0] === '/') {
      // Absolute path: replaces the base path entirely.
      $segments = explode('/', $relative_path);
    }
    else {
      // Relative path: append to the base path up to its last slash, so that
      // "/blog/index.html" and "/blog/" both resolve inside "/blog/".
      $last_slash = strrpos($base_path, '/');
      $base_dir = ($last_slash === FALSE) ? '' : substr($base_path, 0, $last_slash);
      $segments = array_merge(explode('/', $base_dir), explode('/', $relative_path));
    }

    // Resolve "." and ".." with a stack so that consecutive ".." segments each
    // climb one level. Empty segments are dropped by strict comparison, which
    // keeps legitimate segments such as "0".
    $resolved = array();
    foreach ($segments as $segment) {
      if ($segment === '' || $segment === '.') {
        continue;
      }
      if ($segment === '..') {
        array_pop($resolved);
        continue;
      }
      $resolved[] = $segment;
    }
    $path = implode('/', $resolved);

    // A path that ended in "/", "." or ".." denotes a directory: keep the
    // trailing slash.
    $last_segment = end($segments);
    if ($path !== '' && ($last_segment === '' || $last_segment === '.' || $last_segment === '..')) {
      $path .= '/';
    }
    $path .= $suffix;

    // Build the prefix to the path.
    $absolute_url = '';
    if (isset($parsed_url['scheme'])) {
      $absolute_url = $parsed_url['scheme'] . '://';
    }
    if (isset($parsed_url['user'])) {
      $absolute_url .= $parsed_url['user'];
      if (isset($parsed_url['pass'])) {
        $absolute_url .= ':' . $parsed_url['pass'];
      }
      $absolute_url .= '@';
    }
    if (isset($parsed_url['host'])) {
      $absolute_url .= $parsed_url['host'];
      if (isset($parsed_url['port'])) {
        $absolute_url .= ':' . $parsed_url['port'];
      }
      $absolute_url .= '/';
    }
    $absolute_url .= $path;

    return valid_url($absolute_url, TRUE) ? $absolute_url : FALSE;
  }
  359. }