Xss.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349
  1. <?php
  2. namespace Drupal\Component\Utility;
  /**
   * Static helpers that strip cross-site scripting vectors out of HTML.
   *
   * The filter works on a whitelist: only the tags it is told about survive,
   * and the attributes of surviving tags are re-parsed and re-emitted.
   *
   * @ingroup utility
   */
  class Xss {

    /**
     * Tags that filterAdmin() lets through.
     *
     * @var array
     *
     * @see \Drupal\Component\Utility\Xss::filterAdmin()
     */
    protected static $adminTags = [
      'a', 'abbr', 'acronym', 'address', 'article', 'aside', 'b', 'bdi', 'bdo',
      'big', 'blockquote', 'br', 'caption', 'cite', 'code', 'col', 'colgroup',
      'command', 'dd', 'del', 'details', 'dfn', 'div', 'dl', 'dt', 'em',
      'figcaption', 'figure', 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
      'header', 'hgroup', 'hr', 'i', 'img', 'ins', 'kbd', 'li', 'mark', 'menu',
      'meter', 'nav', 'ol', 'output', 'p', 'pre', 'progress', 'q', 'rp', 'rt',
      'ruby', 's', 'samp', 'section', 'small', 'span', 'strong', 'sub',
      'summary', 'sup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'time',
      'tr', 'tt', 'u', 'ul', 'var', 'wbr',
    ];

    /**
     * Tags that filter() lets through when the caller passes no list.
     *
     * @var array
     *
     * @see \Drupal\Component\Utility\Xss::filter()
     */
    protected static $htmlTags = [
      'a', 'em', 'strong', 'cite', 'blockquote', 'code',
      'ul', 'ol', 'li', 'dl', 'dt', 'dd',
    ];

    /**
     * Filters HTML to prevent cross-site-scripting (XSS) vulnerabilities.
     *
     * Based on kses by Ulf Harnhammar, see
     * http://sourceforge.net/projects/kses. For examples of various XSS
     * attacks, see: http://ha.ckers.org/xss.html.
     *
     * Processing happens in this order:
     * - Input that is not valid UTF-8 is rejected outright.
     * - NUL characters and Netscape 4 style "&{...}" entities are removed.
     * - Every ampersand is escaped, then well-formed decimal, hexadecimal and
     *   named entities are turned back into entities.
     * - Every "<...>" fragment, comment, lone "<" and lone ">" is handed to
     *   split(), which drops tags that are not allowed and rebuilds the
     *   attribute list of those that are.
     *
     * @param string $string
     *   The string with raw HTML in it.
     * @param array $html_tags
     *   (optional) The tag names to keep. When omitted or NULL, the default
     *   list in static::$htmlTags is used.
     *
     * @return string
     *   The filtered version of $string, or an empty string if $string is not
     *   valid UTF-8.
     *
     * @see \Drupal\Component\Utility\Unicode::validateUtf8()
     *
     * @ingroup sanitization
     */
    public static function filter($string, array $html_tags = NULL) {
      if ($html_tags === NULL) {
        $html_tags = static::$htmlTags;
      }

      // Only operate on valid UTF-8 strings. This is necessary to prevent
      // cross site scripting issues on Internet Explorer 6.
      if (!Unicode::validateUtf8($string)) {
        return '';
      }

      // Remove NULL characters (ignored by some browsers).
      $markup = str_replace(chr(0), '', $string);
      // Remove Netscape 4 JS entities.
      $markup = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $markup);
      // Defuse all HTML entities.
      $markup = str_replace('&', '&amp;', $markup);

      // Change back only well-formed entities. preg_replace() applies the
      // pattern/replacement pairs one after another, in array order.
      $entity_patterns = [
        // Decimal numeric entities.
        '/&amp;#([0-9]+;)/',
        // Hexadecimal numeric entities.
        '/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/',
        // Named entities.
        '/&amp;([A-Za-z][A-Za-z0-9]*;)/',
      ];
      $entity_replacements = [
        '&#\1',
        '&#x\1',
        '&\1',
      ];
      $markup = preg_replace($entity_patterns, $entity_replacements, $markup);

      // split() looks tags up by key, so turn the list into a lookup map.
      $html_tags = array_flip($html_tags);

      // Late static binding does not work inside anonymous functions, so the
      // called class is captured here and handed to the closure explicitly.
      $class = get_called_class();
      $splitter = function ($matches) use ($html_tags, $class) {
        return $class::split($matches[1], $html_tags, $class);
      };

      // Strip any tags that are not in the whitelist.
      $fragment_pattern = '%
        (
        <(?=[^a-zA-Z!/])  # a lone <
        |                 # or
        <!--.*?-->        # a comment
        |                 # or
        <[^>]*(>|$)       # a string that starts with a <, up until the > or the end of the string
        |                 # or
        >                 # just a >
        )%x';

      return preg_replace_callback($fragment_pattern, $splitter, $markup);
    }

    /**
     * Applies a very permissive XSS/HTML filter for admin-only use.
     *
     * Use only for fields where it is impractical to use the whole filter
     * system, but where some (mainly inline) mark-up is desired (so
     * \Drupal\Component\Utility\Html::escape() is not acceptable).
     *
     * This is filter() run with the tag list held in static::$adminTags.
     *
     * @param string $string
     *   The string to apply the filter to.
     *
     * @return string
     *   The filtered string.
     *
     * @ingroup sanitization
     *
     * @see \Drupal\Component\Utility\Xss::getAdminTagList()
     */
    public static function filterAdmin($string) {
      $allowed_tags = static::$adminTags;
      return static::filter($string, $allowed_tags);
    }

    /**
     * Processes one fragment matched by filter(): a tag, a comment, "<" or ">".
     *
     * @param string $string
     *   The fragment to process.
     * @param array $html_tags
     *   An array where the keys are the allowed tags and the values are not
     *   used.
     * @param string $class
     *   The called class. This method is called from an anonymous function
     *   which breaks late static binding. See
     *   https://bugs.php.net/bug.php?id=66622 for more information.
     *
     * @return string
     *   "&gt;" or "&lt;" for a lone angle bracket, an empty string when the
     *   fragment is malformed or its element is not allowed, and otherwise the
     *   rebuilt tag (or the untouched comment).
     */
    protected static function split($string, $html_tags, $class) {
      // Anything that does not open with "<" is the lone ">" case.
      if (substr($string, 0, 1) != '<') {
        return '&gt;';
      }
      // A single character that does open with "<" is the lone "<" case.
      if (strlen($string) == 1) {
        return '&lt;';
      }

      $recognised = preg_match('%^<\s*(/\s*)?([a-zA-Z0-9\-]+)\s*([^>]*)>?|(<!--.*?-->)$%', $string, $parts);
      if (!$recognised) {
        // Seriously malformed.
        return '';
      }

      $closing_slash = trim($parts[1]);
      $tag_name = $parts[2];
      $raw_attributes = $parts[3];
      // The comment group is only present when the comment branch matched.
      $comment = isset($parts[4]) ? $parts[4] : NULL;

      // Comments are looked up in the tag list under the pseudo-name "!--".
      if ($comment) {
        $tag_name = '!--';
      }

      // When in whitelist mode, an element is disallowed when not listed.
      if ($class::needsRemoval($html_tags, $tag_name)) {
        return '';
      }

      if ($comment) {
        return $comment;
      }

      // Closing tags never carry attributes.
      if ($closing_slash != '') {
        return "</$tag_name>";
      }

      // Is there a closing XHTML slash at the end of the attributes?
      $raw_attributes = preg_replace('%(\s?)/\s*$%', '\1', $raw_attributes, -1, $slash_hits);
      $self_closing = $slash_hits ? ' /' : '';

      // Clean up attributes, then make sure no angle bracket survived in them.
      $attribute_text = implode(' ', $class::attributes($raw_attributes));
      $attribute_text = preg_replace('/[<>]/', '', $attribute_text);
      if (strlen($attribute_text)) {
        $attribute_text = ' ' . $attribute_text;
      }
      else {
        $attribute_text = '';
      }

      return '<' . $tag_name . $attribute_text . $self_closing . '>';
    }

    /**
     * Processes a string of HTML attributes.
     *
     * The string is consumed from the left by a three-state parser: state 0
     * expects an attribute name, state 1 expects "=" or the whitespace that
     * ends a valueless attribute, and state 2 expects a value (double quoted,
     * single quoted or bare). Whenever the current state cannot consume
     * anything, a cleanup pattern removes leading junk and the parser returns
     * to state 0.
     *
     * "style" and every attribute whose name starts with "on" are dropped.
     * Values are passed through UrlHelper::filterBadProtocol() unless the
     * attribute is "title", "alt", "rel", "property" or starts with "data-".
     *
     * @param string $attributes
     *   The raw attribute portion of a tag.
     *
     * @return array
     *   The cleaned attributes, one array item per attribute, either a bare
     *   name or a name=value pair with the value quoted.
     */
    protected static function attributes($attributes) {
      $kept = [];
      // 0: expecting a name, 1: expecting "=" or whitespace, 2: expecting a
      // value.
      $state = 0;
      $name = '';
      $discard = FALSE;
      $keep_value_verbatim = FALSE;

      while (strlen($attributes) != 0) {
        // Did the current state manage to consume something?
        $consumed = FALSE;

        if ($state == 0) {
          // Attribute name, href for instance.
          if (preg_match('/^([-a-zA-Z][-a-zA-Z0-9]*)/', $attributes, $match)) {
            $name = strtolower($match[1]);
            $discard = ($name == 'style' || substr($name, 0, 2) == 'on');
            // Values for attributes of type URI should be filtered for
            // potentially malicious protocols (for example, an href-attribute
            // starting with "javascript:"). However, for some non-URI
            // attributes performing this filtering causes valid and safe data
            // to be mangled. We prevent this by skipping protocol filtering on
            // such attributes.
            // @see \Drupal\Component\Utility\UrlHelper::filterBadProtocol()
            // @see http://www.w3.org/TR/html4/index/attributes.html
            $keep_value_verbatim = substr($name, 0, 5) === 'data-' || in_array($name, [
              'title',
              'alt',
              'rel',
              'property',
            ]);
            $consumed = TRUE;
            $state = 1;
            $attributes = preg_replace('/^[-a-zA-Z][-a-zA-Z0-9]*/', '', $attributes);
          }
        }
        elseif ($state == 1) {
          if (preg_match('/^\s*=\s*/', $attributes)) {
            // Equals sign: a value follows.
            $consumed = TRUE;
            $state = 2;
            $attributes = preg_replace('/^\s*=\s*/', '', $attributes);
          }
          elseif (preg_match('/^\s+/', $attributes)) {
            // Valueless attribute ("selected") followed by more input.
            $consumed = TRUE;
            $state = 0;
            if (!$discard) {
              $kept[] = $name;
            }
            $attributes = preg_replace('/^\s+/', '', $attributes);
          }
        }
        elseif ($state == 2) {
          // Attribute value, a URL after href= for instance. $quote stays
          // NULL when none of the three value forms matches.
          $quote = NULL;
          if (preg_match('/^"([^"]*)"(\s+|$)/', $attributes, $match)) {
            $quote = '"';
            $attributes = preg_replace('/^"[^"]*"(\s+|$)/', '', $attributes);
          }
          elseif (preg_match("/^'([^']*)'(\s+|$)/", $attributes, $match)) {
            $quote = "'";
            $attributes = preg_replace("/^'[^']*'(\s+|$)/", '', $attributes);
          }
          elseif (preg_match("%^([^\s\"']+)(\s+|$)%", $attributes, $match)) {
            // Bare values are re-emitted inside double quotes.
            $quote = '"';
            $attributes = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attributes);
          }

          if ($quote !== NULL) {
            $value = $keep_value_verbatim ? $match[1] : UrlHelper::filterBadProtocol($match[1]);
            if (!$discard) {
              $kept[] = $name . '=' . $quote . $value . $quote;
            }
            $consumed = TRUE;
            $state = 0;
          }
        }

        if (!$consumed) {
          // Not well formed; remove and try again.
          $attributes = preg_replace('/
            ^
            (
            "[^"]*("|$)     # - a string that starts with a double quote, up until the next double quote or the end of the string
            |               # or
            \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
            |               # or
            \S              # - a non-whitespace character
            )*              # any number of the above three
            \s*             # any number of whitespaces
            /x', '', $attributes);
          $state = 0;
        }
      }

      // The attribute list ends with a valueless attribute like "selected".
      if ($state == 1 && !$discard) {
        $kept[] = $name;
      }

      return $kept;
    }

    /**
     * Whether this element needs to be removed altogether.
     *
     * The element name is lower-cased before it is looked up; the keys of
     * $html_tags are used as given.
     *
     * @param array $html_tags
     *   The allowed HTML tags, as array keys.
     * @param string $elem
     *   The name of the HTML element.
     *
     * @return bool
     *   TRUE if this element needs to be removed.
     */
    protected static function needsRemoval($html_tags, $elem) {
      $lookup_key = strtolower($elem);
      return !isset($html_tags[$lookup_key]);
    }

    /**
     * Gets the list of HTML tags allowed by Xss::filterAdmin().
     *
     * @return array
     *   The list of HTML tags allowed by filterAdmin().
     */
    public static function getAdminTagList() {
      return static::$adminTags;
    }

    /**
     * Gets the standard list of HTML tags allowed by Xss::filter().
     *
     * @return array
     *   The list of HTML tags allowed by Xss::filter().
     */
    public static function getHtmlTagList() {
      return static::$htmlTags;
    }

  }