UrlHelper.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. <?php
  2. namespace Drupal\Component\Utility;
  3. /**
  4. * Helper class URL based methods.
  5. *
  6. * @ingroup utility
  7. */
  8. class UrlHelper {
  9. /**
  10. * The list of allowed protocols.
  11. *
  12. * @var array
  13. */
  14. protected static $allowedProtocols = ['http', 'https'];
  15. /**
  16. * Parses an array into a valid, rawurlencoded query string.
  17. *
  18. * Function rawurlencode() is RFC3986 compliant, and as a consequence RFC3987
  19. * compliant. The latter defines the required format of "URLs" in HTML5.
  20. * urlencode() is almost the same as rawurlencode(), except that it encodes
  21. * spaces as "+" instead of "%20". This makes its result non compliant to
  22. * RFC3986 and as a consequence non compliant to RFC3987 and as a consequence
  23. * not valid as a "URL" in HTML5.
  24. *
  25. * @todo Remove this function once PHP 5.4 is required as we can use just
  26. * http_build_query() directly.
  27. *
  28. * @param array $query
  29. * The query parameter array to be processed; for instance,
  30. * \Drupal::request()->query->all().
  31. * @param string $parent
  32. * (optional) Internal use only. Used to build the $query array key for
  33. * nested items. Defaults to an empty string.
  34. *
  35. * @return string
  36. * A rawurlencoded string which can be used as or appended to the URL query
  37. * string.
  38. *
  39. * @ingroup php_wrappers
  40. */
  41. public static function buildQuery(array $query, $parent = '') {
  42. $params = [];
  43. foreach ($query as $key => $value) {
  44. $key = ($parent ? $parent . rawurlencode('[' . $key . ']') : rawurlencode($key));
  45. // Recurse into children.
  46. if (is_array($value)) {
  47. $params[] = static::buildQuery($value, $key);
  48. }
  49. // If a query parameter value is NULL, only append its key.
  50. elseif (!isset($value)) {
  51. $params[] = $key;
  52. }
  53. else {
  54. // For better readability of paths in query strings, we decode slashes.
  55. $params[] = $key . '=' . str_replace('%2F', '/', rawurlencode($value));
  56. }
  57. }
  58. return implode('&', $params);
  59. }
  60. /**
  61. * Filters a URL query parameter array to remove unwanted elements.
  62. *
  63. * @param array $query
  64. * An array to be processed.
  65. * @param array $exclude
  66. * (optional) A list of $query array keys to remove. Use "parent[child]" to
  67. * exclude nested items.
  68. * @param string $parent
  69. * Internal use only. Used to build the $query array key for nested items.
  70. *
  71. * @return
  72. * An array containing query parameters.
  73. */
  74. public static function filterQueryParameters(array $query, array $exclude = [], $parent = '') {
  75. // If $exclude is empty, there is nothing to filter.
  76. if (empty($exclude)) {
  77. return $query;
  78. }
  79. elseif (!$parent) {
  80. $exclude = array_flip($exclude);
  81. }
  82. $params = [];
  83. foreach ($query as $key => $value) {
  84. $string_key = ($parent ? $parent . '[' . $key . ']' : $key);
  85. if (isset($exclude[$string_key])) {
  86. continue;
  87. }
  88. if (is_array($value)) {
  89. $params[$key] = static::filterQueryParameters($value, $exclude, $string_key);
  90. }
  91. else {
  92. $params[$key] = $value;
  93. }
  94. }
  95. return $params;
  96. }
  97. /**
  98. * Parses a URL string into its path, query, and fragment components.
  99. *
  100. * This function splits both internal paths like @code node?b=c#d @endcode and
  101. * external URLs like @code https://example.com/a?b=c#d @endcode into their
  102. * component parts. See
  103. * @link http://tools.ietf.org/html/rfc3986#section-3 RFC 3986 @endlink for an
  104. * explanation of what the component parts are.
  105. *
  106. * Note that, unlike the RFC, when passed an external URL, this function
  107. * groups the scheme, authority, and path together into the path component.
  108. *
  109. * @param string $url
  110. * The internal path or external URL string to parse.
  111. *
  112. * @return array
  113. * An associative array containing:
  114. * - path: The path component of $url. If $url is an external URL, this
  115. * includes the scheme, authority, and path.
  116. * - query: An array of query parameters from $url, if they exist.
  117. * - fragment: The fragment component from $url, if it exists.
  118. *
  119. * @see \Drupal\Core\Utility\LinkGenerator
  120. * @see http://tools.ietf.org/html/rfc3986
  121. *
  122. * @ingroup php_wrappers
  123. */
  124. public static function parse($url) {
  125. $options = [
  126. 'path' => NULL,
  127. 'query' => [],
  128. 'fragment' => '',
  129. ];
  130. // External URLs: not using parse_url() here, so we do not have to rebuild
  131. // the scheme, host, and path without having any use for it.
  132. // The URL is considered external if it contains the '://' delimiter. Since
  133. // a URL can also be passed as a query argument, we check if this delimiter
  134. // appears in front of the '?' query argument delimiter.
  135. $scheme_delimiter_position = strpos($url, '://');
  136. $query_delimiter_position = strpos($url, '?');
  137. if ($scheme_delimiter_position !== FALSE && ($query_delimiter_position === FALSE || $scheme_delimiter_position < $query_delimiter_position)) {
  138. // Split off the fragment, if any.
  139. if (strpos($url, '#') !== FALSE) {
  140. list($url, $options['fragment']) = explode('#', $url, 2);
  141. }
  142. // Split off everything before the query string into 'path'.
  143. $parts = explode('?', $url);
  144. // Don't support URLs without a path, like 'http://'.
  145. list(, $path) = explode('://', $parts[0], 2);
  146. if ($path != '') {
  147. $options['path'] = $parts[0];
  148. }
  149. // If there is a query string, transform it into keyed query parameters.
  150. if (isset($parts[1])) {
  151. parse_str($parts[1], $options['query']);
  152. }
  153. }
  154. // Internal URLs.
  155. else {
  156. // parse_url() does not support relative URLs, so make it absolute. For
  157. // instance, the relative URL "foo/bar:1" isn't properly parsed.
  158. $parts = parse_url('http://example.com/' . $url);
  159. // Strip the leading slash that was just added.
  160. $options['path'] = substr($parts['path'], 1);
  161. if (isset($parts['query'])) {
  162. parse_str($parts['query'], $options['query']);
  163. }
  164. if (isset($parts['fragment'])) {
  165. $options['fragment'] = $parts['fragment'];
  166. }
  167. }
  168. return $options;
  169. }
  170. /**
  171. * Encodes a Drupal path for use in a URL.
  172. *
  173. * For aesthetic reasons slashes are not escaped.
  174. *
  175. * @param string $path
  176. * The Drupal path to encode.
  177. *
  178. * @return string
  179. * The encoded path.
  180. */
  181. public static function encodePath($path) {
  182. return str_replace('%2F', '/', rawurlencode($path));
  183. }
  184. /**
  185. * Determines whether a path is external to Drupal.
  186. *
  187. * An example of an external path is http://example.com. If a path cannot be
  188. * assessed by Drupal's menu handler, then we must treat it as potentially
  189. * insecure.
  190. *
  191. * @param string $path
  192. * The internal path or external URL being linked to, such as "node/34" or
  193. * "http://example.com/foo".
  194. *
  195. * @return bool
  196. * TRUE or FALSE, where TRUE indicates an external path.
  197. */
  198. public static function isExternal($path) {
  199. $colonpos = strpos($path, ':');
  200. // Some browsers treat \ as / so normalize to forward slashes.
  201. $path = str_replace('\\', '/', $path);
  202. // If the path starts with 2 slashes then it is always considered an
  203. // external URL without an explicit protocol part.
  204. return (strpos($path, '//') === 0)
  205. // Leading control characters may be ignored or mishandled by browsers,
  206. // so assume such a path may lead to an external location. The \p{C}
  207. // character class matches all UTF-8 control, unassigned, and private
  208. // characters.
  209. || (preg_match('/^\p{C}/u', $path) !== 0)
  210. // Avoid calling static::stripDangerousProtocols() if there is any slash
  211. // (/), hash (#) or question_mark (?) before the colon (:) occurrence -
  212. // if any - as this would clearly mean it is not a URL.
  213. || ($colonpos !== FALSE
  214. && !preg_match('![/?#]!', substr($path, 0, $colonpos))
  215. && static::stripDangerousProtocols($path) == $path);
  216. }
  217. /**
  218. * Determines if an external URL points to this installation.
  219. *
  220. * @param string $url
  221. * A string containing an external URL, such as "http://example.com/foo".
  222. * @param string $base_url
  223. * The base URL string to check against, such as "http://example.com/"
  224. *
  225. * @return bool
  226. * TRUE if the URL has the same domain and base path.
  227. *
  228. * @throws \InvalidArgumentException
  229. * Exception thrown when a either $url or $bath_url are not fully qualified.
  230. */
  231. public static function externalIsLocal($url, $base_url) {
  232. // Some browsers treat \ as / so normalize to forward slashes.
  233. $url = str_replace('\\', '/', $url);
  234. // Leading control characters may be ignored or mishandled by browsers, so
  235. // assume such a path may lead to an non-local location. The \p{C} character
  236. // class matches all UTF-8 control, unassigned, and private characters.
  237. if (preg_match('/^\p{C}/u', $url) !== 0) {
  238. return FALSE;
  239. }
  240. $url_parts = parse_url($url);
  241. $base_parts = parse_url($base_url);
  242. if (empty($base_parts['host']) || empty($url_parts['host'])) {
  243. throw new \InvalidArgumentException('A path was passed when a fully qualified domain was expected.');
  244. }
  245. if (!isset($url_parts['path']) || !isset($base_parts['path'])) {
  246. return (!isset($base_parts['path']) || $base_parts['path'] == '/')
  247. && ($url_parts['host'] == $base_parts['host']);
  248. }
  249. else {
  250. // When comparing base paths, we need a trailing slash to make sure a
  251. // partial URL match isn't occurring. Since base_path() always returns
  252. // with a trailing slash, we don't need to add the trailing slash here.
  253. return ($url_parts['host'] == $base_parts['host'] && stripos($url_parts['path'], $base_parts['path']) === 0);
  254. }
  255. }
  256. /**
  257. * Processes an HTML attribute value and strips dangerous protocols from URLs.
  258. *
  259. * @param string $string
  260. * The string with the attribute value.
  261. *
  262. * @return string
  263. * Cleaned up and HTML-escaped version of $string.
  264. */
  265. public static function filterBadProtocol($string) {
  266. // Get the plain text representation of the attribute value (i.e. its
  267. // meaning).
  268. $string = Html::decodeEntities($string);
  269. return Html::escape(static::stripDangerousProtocols($string));
  270. }
  271. /**
  272. * Gets the allowed protocols.
  273. *
  274. * @return array
  275. * An array of protocols, for example http, https and irc.
  276. */
  277. public static function getAllowedProtocols() {
  278. return static::$allowedProtocols;
  279. }
  280. /**
  281. * Sets the allowed protocols.
  282. *
  283. * @param array $protocols
  284. * An array of protocols, for example http, https and irc.
  285. */
  286. public static function setAllowedProtocols(array $protocols = []) {
  287. static::$allowedProtocols = $protocols;
  288. }
  289. /**
  290. * Strips dangerous protocols (for example, 'javascript:') from a URI.
  291. *
  292. * This function must be called for all URIs within user-entered input prior
  293. * to being output to an HTML attribute value. It is often called as part of
  294. * \Drupal\Component\Utility\UrlHelper::filterBadProtocol() or
  295. * \Drupal\Component\Utility\Xss::filter(), but those functions return an
  296. * HTML-encoded string, so this function can be called independently when the
  297. * output needs to be a plain-text string for passing to functions that will
  298. * call Html::escape() separately. The exact behavior depends on the value:
  299. * - If the value is a well-formed (per RFC 3986) relative URL or
  300. * absolute URL that does not use a dangerous protocol (like
  301. * "javascript:"), then the URL remains unchanged. This includes all
  302. * URLs generated via Url::toString() and UrlGeneratorTrait::url().
  303. * - If the value is a well-formed absolute URL with a dangerous protocol,
  304. * the protocol is stripped. This process is repeated on the remaining URL
  305. * until it is stripped down to a safe protocol.
  306. * - If the value is not a well-formed URL, the same sanitization behavior as
  307. * for well-formed URLs will be invoked, which strips most substrings that
  308. * precede a ":". The result can be used in URL attributes such as "href"
  309. * or "src" (only after calling Html::escape() separately), but this may not
  310. * produce valid HTML (for example, malformed URLs within "href" attributes
  311. * fail HTML validation). This can be avoided by using
  312. * Url::fromUri($possibly_not_a_url)->toString(), which either throws an
  313. * exception or returns a well-formed URL.
  314. *
  315. * @param string $uri
  316. * A plain-text URI that might contain dangerous protocols.
  317. *
  318. * @return string
  319. * A plain-text URI stripped of dangerous protocols. As with all plain-text
  320. * strings, this return value must not be output to an HTML page without
  321. * being sanitized first. However, it can be passed to functions
  322. * expecting plain-text strings.
  323. *
  324. * @see \Drupal\Component\Utility\Html::escape()
  325. * @see \Drupal\Core\Url::toString()
  326. * @see \Drupal\Core\Routing\UrlGeneratorTrait::url()
  327. * @see \Drupal\Core\Url::fromUri()
  328. */
  329. public static function stripDangerousProtocols($uri) {
  330. $allowed_protocols = array_flip(static::$allowedProtocols);
  331. // Iteratively remove any invalid protocol found.
  332. do {
  333. $before = $uri;
  334. $colonpos = strpos($uri, ':');
  335. if ($colonpos > 0) {
  336. // We found a colon, possibly a protocol. Verify.
  337. $protocol = substr($uri, 0, $colonpos);
  338. // If a colon is preceded by a slash, question mark or hash, it cannot
  339. // possibly be part of the URL scheme. This must be a relative URL, which
  340. // inherits the (safe) protocol of the base document.
  341. if (preg_match('![/?#]!', $protocol)) {
  342. break;
  343. }
  344. // Check if this is a disallowed protocol. Per RFC2616, section 3.2.3
  345. // (URI Comparison) scheme comparison must be case-insensitive.
  346. if (!isset($allowed_protocols[strtolower($protocol)])) {
  347. $uri = substr($uri, $colonpos + 1);
  348. }
  349. }
  350. } while ($before != $uri);
  351. return $uri;
  352. }
  353. /**
  354. * Verifies the syntax of the given URL.
  355. *
  356. * This function should only be used on actual URLs. It should not be used for
  357. * Drupal menu paths, which can contain arbitrary characters.
  358. * Valid values per RFC 3986.
  359. *
  360. * @param string $url
  361. * The URL to verify.
  362. * @param bool $absolute
  363. * Whether the URL is absolute (beginning with a scheme such as "http:").
  364. *
  365. * @return bool
  366. * TRUE if the URL is in a valid format, FALSE otherwise.
  367. */
  368. public static function isValid($url, $absolute = FALSE) {
  369. if ($absolute) {
  370. return (bool) preg_match("
  371. /^ # Start at the beginning of the text
  372. (?:ftp|https?|feed):\/\/ # Look for ftp, http, https or feed schemes
  373. (?: # Userinfo (optional) which is typically
  374. (?:(?:[\w\.\-\+!$&'\(\)*\+,;=]|%[0-9a-f]{2})+:)* # a username or a username and password
  375. (?:[\w\.\-\+%!$&'\(\)*\+,;=]|%[0-9a-f]{2})+@ # combination
  376. )?
  377. (?:
  378. (?:[a-z0-9\-\.]|%[0-9a-f]{2})+ # A domain name or a IPv4 address
  379. |(?:\[(?:[0-9a-f]{0,4}:)*(?:[0-9a-f]{0,4})\]) # or a well formed IPv6 address
  380. )
  381. (?::[0-9]+)? # Server port number (optional)
  382. (?:[\/|\?]
  383. (?:[\w#!:\.\?\+=&@$'~*,;\/\(\)\[\]\-]|%[0-9a-f]{2}) # The path and query (optional)
  384. *)?
  385. $/xi", $url);
  386. }
  387. else {
  388. return (bool) preg_match("/^(?:[\w#!:\.\?\+=&@$'~*,;\/\(\)\[\]\-]|%[0-9a-f]{2})+$/i", $url);
  389. }
  390. }
  391. }