http_request.inc 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512
  1. <?php
  2. /**
  3. * @file
  4. * Download via HTTP.
  5. *
  6. * Support caching, HTTP Basic Authentication, detection of RSS/Atom feeds,
  7. * redirects.
  8. */
  /**
   * PCRE for finding the link tags in html.
   *
   * Capture groups:
   * - 1: the raw attribute string of the tag (everything between "<link" and
   *   the end of the opening tag). This is the group http_request_find_feeds()
   *   reads.
   * - 2: the tail that closes the tag, either ">...</link>", "/>" or ">".
   * - 3: the content between ">" and "</link>", when that form matched.
   * - 4: the optional "/" of a self-closing tag.
   *
   * The "s" modifier lets "." span newlines and "i" makes the match
   * case-insensitive.
   */
  define('HTTP_REQUEST_PCRE_LINK_TAG', '/<link((?:[\x09\x0A\x0B\x0C\x0D\x20]+[^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"(?:[^"]*)"|\'(?:[^\']*)\'|(?:[^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?)*)[\x09\x0A\x0B\x0C\x0D\x20]*(>(.*)<\/link>|(\/)?>)/si');
  /**
   * PCRE for matching all the attributes in a tag.
   *
   * Each match is one attribute, which must be preceded by whitespace.
   * Capture groups:
   * - 1: the attribute name.
   * - 2: the value when it is wrapped in double quotes.
   * - 3: the value when it is wrapped in single quotes.
   * - 4: the value when it is not quoted at all.
   *
   * The value part is optional, so an attribute without "=" only fills
   * group 1.
   */
  define('HTTP_REQUEST_PCRE_TAG_ATTRIBUTES', '/[\x09\x0A\x0B\x0C\x0D\x20]+([^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x2F\x3D\x3E]*)(?:[\x09\x0A\x0B\x0C\x0D\x20]*=[\x09\x0A\x0B\x0C\x0D\x20]*(?:"([^"]*)"|\'([^\']*)\'|([^\x09\x0A\x0B\x0C\x0D\x20\x22\x27\x3E][^\x09\x0A\x0B\x0C\x0D\x20\x3E]*)?))?/');
  /**
   * Exception for cURL specific errors.
   *
   * Thrown by http_request_get() when curl_error() reports a failure after the
   * transfer. The exception code is the cURL error number and the message
   * contains the cURL error text and the requested URL.
   */
  class HRCurlException extends Exception {}
  /**
   * Discovers RSS or Atom feeds at the given URL.
   *
   * If the document at the given URL is itself a feed, the URL is returned
   * unchanged. If it is an HTML document, the function looks for feed
   * <link> tags and returns the first one that can be turned into an absolute
   * URL.
   *
   * @param string $url
   *   The url of the feed to retrieve.
   * @param array $settings
   *   An optional array of settings. Valid options are: accept_invalid_cert.
   *
   * @return bool|string
   *   The discovered feed URL, or FALSE if the URL did not answer with HTTP 200
   *   or no feed could be discovered.
   *
   * @throws HRCurlException
   *   Propagated from http_request_get() when cURL reports an error.
   */
  function http_request_get_common_syndication($url, $settings = array()) {
    $accept_invalid_cert = isset($settings['accept_invalid_cert']) ? $settings['accept_invalid_cert'] : FALSE;
    $download = http_request_get($url, NULL, NULL, $accept_invalid_cert);

    // Cannot get the feed, return.
    // http_request_get() always returns 200 even if its 304.
    if ($download->code != 200) {
      return FALSE;
    }

    // Drop the data into a separate variable so all manipulations of the html
    // will not affect the actual object that exists in the static cache.
    // @see http_request_get()
    $downloaded_string = isset($download->data) ? $download->data : '';

    // If this happens to be a feed then just return the url.
    if (isset($download->headers['content-type']) && http_request_is_feed($download->headers['content-type'], $downloaded_string)) {
      return $url;
    }

    $discovered_feeds = http_request_find_feeds($downloaded_string);
    foreach ($discovered_feeds as $feed_url) {
      $absolute = http_request_create_absolute_url($feed_url, $url);
      if (!empty($absolute)) {
        // @TODO: something more intelligent?
        return $absolute;
      }
    }

    // Nothing usable was discovered. Return FALSE explicitly, as documented,
    // instead of falling off the end of the function with NULL.
    return FALSE;
  }
  /**
   * Get the content from the given URL.
   *
   * Uses cURL when http_request_use_curl() allows it and drupal_http_request()
   * otherwise. Results are kept in a per-request static cache and in the
   * persistent cache so that conditional requests (ETag / Last-Modified) can be
   * sent on later calls.
   *
   * @param string $url
   *   A valid URL (not only web URLs). The 'feed://' and 'webcal://' schemes
   *   are converted to 'http://' before anything else happens.
   * @param string $username
   *   If the URL uses authentication, supply the username.
   * @param string $password
   *   If the URL uses authentication, supply the password.
   * @param bool $accept_invalid_cert
   *   Whether to accept invalid certificates.
   * @param int $timeout
   *   Timeout in seconds to wait for an HTTP get request to finish.
   *
   * @return stdClass
   *   An object that describes the data downloaded from $url. It carries the
   *   properties 'code', 'headers' and, after a transfer, 'data'. 'error' is
   *   set when the cURL path rejects the URL scheme, and 'from_cache' is TRUE
   *   when a cached result is returned for a 304 response.
   *
   * @throws HRCurlException
   *   When cURL reports an error for the transfer.
   */
  function http_request_get($url, $username = NULL, $password = NULL, $accept_invalid_cert = FALSE, $timeout = NULL) {
    // Intra-page download cache, avoids downloading the same content twice
    // within one page request.
    static $download_cache = array();

    // Support the 'feed' and 'webcal' schemes by converting them into 'http'.
    // This happens before any cache lookup so that the keys used for reading
    // the caches are the same as the keys used for writing them below.
    $url = strtr($url, array('feed://' => 'http://', 'webcal://' => 'http://'));

    if (isset($download_cache[$url])) {
      return $download_cache[$url];
    }

    // Determine request timeout.
    $request_timeout = !empty($timeout) ? $timeout : variable_get('http_request_timeout', 30);

    if (!$username && valid_url($url, TRUE)) {
      // Handle password protected feeds.
      $url_parts = parse_url($url);
      if (!empty($url_parts['user'])) {
        $username = urldecode($url_parts['user']);
        // The password is optional in a URL ("http://user@host/").
        $password = isset($url_parts['pass']) ? urldecode($url_parts['pass']) : '';
      }
    }

    $curl = http_request_use_curl();

    // Only download and parse data if really needs refresh.
    // Based on "Last-Modified" and "If-Modified-Since".
    // cURL expects a list of "Name: value" strings, drupal_http_request()
    // expects a name => value map.
    $headers = array();
    if ($cache = http_request_get_cache($url)) {
      $last_result = $cache->data;
      $last_headers = isset($last_result->headers) ? array_change_key_case($last_result->headers) : array();

      if (!empty($last_headers['etag'])) {
        if ($curl) {
          $headers[] = 'If-None-Match: ' . $last_headers['etag'];
        }
        else {
          $headers['If-None-Match'] = $last_headers['etag'];
        }
      }
      if (!empty($last_headers['last-modified'])) {
        if ($curl) {
          $headers[] = 'If-Modified-Since: ' . $last_headers['last-modified'];
        }
        else {
          $headers['If-Modified-Since'] = $last_headers['last-modified'];
        }
      }
    }

    // Credentials for the drupal_http_request() path. This must not depend on
    // a cache entry being present, otherwise the very first request is sent
    // without authentication. The cURL path uses CURLOPT_USERPWD instead.
    if (!empty($username) && !$curl) {
      $headers['Authorization'] = 'Basic ' . base64_encode("$username:$password");
    }

    if ($curl) {
      $headers[] = 'User-Agent: Drupal (+http://drupal.org/)';
      $result = new stdClass();
      $result->headers = array();

      // Parse the URL and make sure we can handle the schema.
      // cURL can only support either http:// or https://.
      // CURLOPT_PROTOCOLS is only supported with cURL 7.19.4
      $uri = parse_url($url);
      if (!isset($uri['scheme'])) {
        $result->error = 'missing schema';
        $result->code = -1002;
      }
      else {
        switch ($uri['scheme']) {
          case 'http':
          case 'https':
            // Valid scheme.
            break;

          default:
            $result->error = 'invalid schema ' . $uri['scheme'];
            $result->code = -1003;
            break;
        }
      }

      // If the scheme was valid, continue to request the feed using cURL.
      if (empty($result->error)) {
        $download = curl_init($url);
        curl_setopt($download, CURLOPT_FOLLOWLOCATION, TRUE);
        if (!empty($username)) {
          curl_setopt($download, CURLOPT_USERPWD, "{$username}:{$password}");
          curl_setopt($download, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
        }
        curl_setopt($download, CURLOPT_HTTPHEADER, $headers);
        curl_setopt($download, CURLOPT_HEADER, TRUE);
        curl_setopt($download, CURLOPT_RETURNTRANSFER, TRUE);
        curl_setopt($download, CURLOPT_ENCODING, '');
        curl_setopt($download, CURLOPT_TIMEOUT, $request_timeout);

        $proxy_server = variable_get('proxy_server');
        if ($proxy_server && _drupal_http_use_proxy($uri['host'])) {
          curl_setopt($download, CURLOPT_PROXY, $proxy_server);
          curl_setopt($download, CURLOPT_PROXYPORT, variable_get('proxy_port', 8080));
          // Proxy user/password.
          if ($proxy_username = variable_get('proxy_username')) {
            $username_password = $proxy_username . ':' . variable_get('proxy_password', '');
            curl_setopt($download, CURLOPT_PROXYUSERPWD, $username_password);
            curl_setopt($download, CURLOPT_PROXYAUTH, variable_get('proxy_auth_method', CURLAUTH_BASIC));
          }
        }

        if ($accept_invalid_cert) {
          curl_setopt($download, CURLOPT_SSL_VERIFYPEER, 0);
          curl_setopt($download, CURLOPT_SSL_VERIFYHOST, 0);
        }

        $data = curl_exec($download);
        if (curl_error($download)) {
          // Read the error details and release the handle before throwing, so
          // the handle is not leaked on the error path.
          $error_code = curl_errno($download);
          $error_message = curl_error($download);
          curl_close($download);
          throw new HRCurlException(
            t('cURL error (@code) @error for @url', array(
              '@code' => $error_code,
              '@error' => $error_message,
              '@url' => $url,
            )), $error_code
          );
        }

        // When using a proxy, remove extra data from the header which is not
        // considered by CURLINFO_HEADER_SIZE (possibly cURL bug).
        // This data is only added to the HTTP header when working with a proxy.
        // Example string added: <HTTP/1.0 200 Connection established\r\n\r\n>
        // This was fixed in libcurl version 7.30.0 (0x71e00) (April 12, 2013),
        // so this workaround only removes the proxy-added headers if we are
        // using an older version of libcurl.
        $curl_ver = curl_version();
        if ($proxy_server && $curl_ver['version_number'] < 0x71e00 && _drupal_http_use_proxy($uri['host'])) {
          $http_header_break = "\r\n\r\n";
          $response = explode($http_header_break, $data);
          if (count($response) > 2) {
            $data = substr($data, strlen($response[0] . $http_header_break), strlen($data));
          }
        }

        $header_size = curl_getinfo($download, CURLINFO_HEADER_SIZE);
        // Drop the last byte of the header section so that its terminating
        // blank line does not produce an empty trailing block when splitting.
        $raw_header = substr($data, 0, $header_size - 1);
        $result->data = substr($data, $header_size);

        // With redirects there is one header block per response; only the last
        // one describes the final document.
        $header_blocks = preg_split("/(\r\n){2}/", $raw_header);
        $header_lines = preg_split("/\r\n|\n|\r/", end($header_blocks));
        // Skip HTTP response status.
        array_shift($header_lines);

        while ($line = trim((string) array_shift($header_lines))) {
          // Ignore malformed lines that are not "name: value" pairs.
          if (strpos($line, ':') === FALSE) {
            continue;
          }
          list($name, $value) = explode(':', $line, 2);
          // Normalize the headers.
          $name = strtolower($name);

          if (isset($result->headers[$name]) && $name == 'set-cookie') {
            // RFC 2109: the Set-Cookie response header comprises the token Set-
            // Cookie:, followed by a comma-separated list of one or more
            // cookies.
            $result->headers[$name] .= ',' . trim($value);
          }
          else {
            $result->headers[$name] = trim($value);
          }
        }

        $result->code = curl_getinfo($download, CURLINFO_HTTP_CODE);
        curl_close($download);
      }
    }
    else {
      $result = drupal_http_request($url, array('headers' => $headers, 'timeout' => $request_timeout));
      $result->headers = isset($result->headers) ? $result->headers : array();
    }

    $result->code = isset($result->code) ? $result->code : 200;

    // In case of 304 Not Modified try to return cached data.
    if ($result->code == 304) {
      if (isset($last_result)) {
        $last_result->from_cache = TRUE;
        return $last_result;
      }
      else {
        // It's a tragedy, this file must exist and contain good data.
        // In this case, clear cache and repeat.
        http_request_clear_cache($url);
        return http_request_get($url, $username, $password, $accept_invalid_cert, $request_timeout);
      }
    }

    // Set caches.
    http_request_set_cache($url, $result);
    $download_cache[$url] = $result;

    return $result;
  }
  /**
   * Tells whether HTTP requests may be performed with cURL.
   *
   * @return bool
   *   TRUE when cURL is usable, FALSE when it is disabled by the site
   *   administrator, not installed, or restricted by the PHP configuration.
   */
  function http_request_use_curl() {
    // The site administrator can opt out of cURL, and without the PHP
    // extension there is nothing to use anyway.
    $disabled_by_admin = variable_get('feeds_never_use_curl', FALSE);
    if ($disabled_by_admin || !extension_loaded('curl')) {
      return FALSE;
    }

    // From PHP 5.6.0 on, CURLOPT_FOLLOWLOCATION works together with
    // open_basedir again, so no further checks are needed.
    $is_legacy_php = version_compare(PHP_VERSION, '5.6.0', '<');
    if (!$is_legacy_php) {
      return TRUE;
    }

    // Older PHP versions: cURL is only usable when neither safe_mode nor
    // open_basedir is enabled.
    return !(ini_get('safe_mode') || ini_get('open_basedir'));
  }
  /**
   * Removes the cached HTTP result of a single URL.
   *
   * @param string $url
   *   The URL whose cache entry should be removed.
   */
  function http_request_clear_cache($url) {
    // Cache entries are keyed by the SHA-256 hash of the URL.
    $cid = hash('sha256', $url);
    cache_clear_all($cid, 'cache_feeds_http');
  }
  /**
   * Looks up the cached HTTP result of a single URL.
   *
   * @param string $url
   *   The URL to find the cached item for.
   *
   * @return object|false
   *   The cache item, or FALSE on failure.
   */
  function http_request_get_cache($url) {
    // Cache entries are keyed by the SHA-256 hash of the URL.
    $cid = hash('sha256', $url);
    $item = cache_get($cid, 'cache_feeds_http');
    return $item;
  }
  /**
   * Stores the HTTP result of a single URL in the cache.
   *
   * @param string $url
   *   The URL the result belongs to.
   * @param stdClass $result
   *   The result of the HTTP request.
   */
  function http_request_set_cache($url, stdClass $result) {
    // Cache entries are keyed by the SHA-256 hash of the URL.
    $cid = hash('sha256', $url);
    cache_set($cid, $result, 'cache_feeds_http');
  }
  /**
   * Tells whether a Content-Type header value denotes a feed.
   *
   * Only the media type is inspected; anything after the first semicolon
   * (for example a charset parameter) is ignored. The check is
   * case-insensitive.
   *
   * @param string $content_type
   *   The Content-Type header.
   * @param string $data
   *   The actual data from the http request. Currently not inspected.
   *
   * @return bool
   *   TRUE if the media type contains "xml", FALSE otherwise.
   */
  function http_request_is_feed($content_type, $data) {
    // Keep only the part in front of the first ';'.
    $pieces = explode(';', $content_type, 2);
    $media_type = strtolower($pieces[0]);

    // @TODO: Sometimes the content-type can be text/html but still be a valid
    // feed.
    return strpos($media_type, 'xml') !== FALSE;
  }
  /**
   * Finds potential feed tags in the HTML document.
   *
   * A <link> tag counts as a feed when its "rel" attribute is "alternate" and
   * its "type" attribute contains "xml" (both compared case-insensitively) and
   * it has an "href" attribute. Attribute values may be double-quoted,
   * single-quoted or unquoted.
   *
   * @param string $html
   *   The html string to search.
   *
   * @return array
   *   An array of href values pointing to feeds, in document order. The hrefs
   *   are entity-decoded but otherwise returned as written, because URLs can be
   *   case-sensitive.
   */
  function http_request_find_feeds($html) {
    $matches = array();
    preg_match_all(HTTP_REQUEST_PCRE_LINK_TAG, $html, $matches);
    if (empty($matches[1])) {
      // No link tags at all, or the match failed.
      return array();
    }

    $valid_links = array();

    // Build up all the links information.
    foreach ($matches[1] as $link_tag) {
      $attributes = array();
      $candidate = array();

      preg_match_all(HTTP_REQUEST_PCRE_TAG_ATTRIBUTES, $link_tag, $attributes, PREG_SET_ORDER);
      foreach ($attributes as $attribute) {
        // $attribute[1] is the attribute name. The value is in $attribute[2],
        // [3] or [4] depending on whether it was double-quoted, single-quoted
        // or unquoted. With PREG_SET_ORDER trailing groups that did not take
        // part in the match are absent, hence the isset() check.
        $value = '';
        for ($group = 2; $group <= 4; $group++) {
          if (isset($attribute[$group]) && $attribute[$group] !== '') {
            $value = $attribute[$group];
            break;
          }
        }

        if (!empty($attribute[1]) && $value !== '') {
          // Only the attribute name is lowercased here. Lowercasing the value
          // would corrupt case-sensitive hrefs.
          $candidate[drupal_strtolower($attribute[1])] = decode_entities($value);
        }
      }

      // Examine candidate to see if it's a feed.
      // @TODO: could/should use http_request_is_feed ??
      if (!isset($candidate['rel'], $candidate['href'], $candidate['type'])) {
        continue;
      }
      $is_alternate = drupal_strtolower($candidate['rel']) == 'alternate';
      $is_xml = strpos(drupal_strtolower($candidate['type']), 'xml') !== FALSE;
      if ($is_alternate && $is_xml) {
        // All tests pass, it's a valid candidate.
        $valid_links[] = $candidate['href'];
      }
    }

    return $valid_links;
  }
  /**
   * Create an absolute url.
   *
   * An already absolute $url is returned unchanged. A relative $url is
   * resolved against $base_url: "." and ".." segments are collapsed, and the
   * scheme, credentials, host and port of $base_url are kept. A
   * protocol-relative $url ("//host/path") only takes the scheme of $base_url.
   *
   * @param string $url
   *   The href to transform.
   * @param string $base_url
   *   The url to be used as the base for a relative $url.
   *
   * @return string|false
   *   An absolute url, or FALSE if $url is not a valid URL, $base_url cannot be
   *   parsed, or the combined result is not a valid absolute URL.
   */
  function http_request_create_absolute_url($url, $base_url) {
    $url = trim($url);
    if (valid_url($url, TRUE)) {
      // Valid absolute url already.
      return $url;
    }

    if (!valid_url($url, FALSE)) {
      // Neither absolute nor a usable relative url.
      return FALSE;
    }

    $parsed_url = parse_url($base_url);
    if ($parsed_url === FALSE) {
      // Invalid $base_url.
      return FALSE;
    }

    // Protocol-relative url: it names its own host, so only the scheme comes
    // from the base url.
    if (substr($url, 0, 2) === '//') {
      if (!isset($parsed_url['scheme'])) {
        return FALSE;
      }
      $absolute_url = $parsed_url['scheme'] . ':' . $url;
      return valid_url($absolute_url, TRUE) ? $absolute_url : FALSE;
    }

    if ($url[0] == '/') {
      // Root-relative: the path of the base url is irrelevant.
      $combined_path = $url;
    }
    else {
      $base_path = isset($parsed_url['path']) ? $parsed_url['path'] : '';
      if (strlen($base_path) > 0 && substr($base_path, -1) != '/') {
        // The base path names a document, so resolve against its directory.
        $last_slash = strrpos($base_path, '/');
        $base_path = ($last_slash === FALSE) ? '' : substr($base_path, 0, $last_slash);
      }
      $combined_path = $base_path . '/' . $url;
    }

    // Collapse "." and ".." segments. Comparisons are strict so that a segment
    // named "0" is kept.
    $segments = array();
    foreach (explode('/', $combined_path) as $segment) {
      if ($segment === '' || $segment === '.') {
        continue;
      }
      if ($segment === '..') {
        // Going above the root is silently ignored.
        array_pop($segments);
        continue;
      }
      $segments[] = $segment;
    }
    $path = implode('/', $segments);

    // Build the prefix to the path.
    $absolute_url = '';
    if (isset($parsed_url['scheme'])) {
      $absolute_url = $parsed_url['scheme'] . '://';
    }
    if (isset($parsed_url['user'])) {
      $absolute_url .= $parsed_url['user'];
      if (isset($parsed_url['pass'])) {
        $absolute_url .= ':' . $parsed_url['pass'];
      }
      $absolute_url .= '@';
    }
    if (isset($parsed_url['host'])) {
      $absolute_url .= $parsed_url['host'];
      if (isset($parsed_url['port'])) {
        $absolute_url .= ':' . $parsed_url['port'];
      }
      $absolute_url .= '/';
    }
    $absolute_url .= $path;

    return valid_url($absolute_url, TRUE) ? $absolute_url : FALSE;
  }