wysiwyg_filter.pages.inc 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530
  1. <?php
  2. /**
  3. * @file
  4. * User land code for the WYSIWYG Filter module.
  5. */
  /**
   * Filter process callback: reduces HTML input to the accepted whitelist.
   *
   * Follows the approach of Drupal's filter_xss(), with these additions:
   * - HTML elements, attributes and style properties are validated against
   *   whitelists.
   * - rel="nofollow" rules can optionally be applied to links.
   * - The rules for the above come from the filter options built by
   *   wysiwyg_filter_get_filter_options().
   *
   * @param string $text
   *   HTML text to be filtered.
   * @param object $filter
   *   Filter object. Its ->settings property is handed to
   *   wysiwyg_filter_get_filter_options().
   * @param object $format
   *   Text format object. Its ->format property is handed to
   *   wysiwyg_filter_get_filter_options().
   * @param $langcode
   *   Not used by this function.
   * @param $cache
   *   Not used by this function.
   * @param $cache_id
   *   Not used by this function.
   *
   * @return string
   *   Filtered HTML text, or an empty string when $text is not valid UTF-8.
   */
  function wysiwyg_filter_filter_wysiwyg_process($text, $filter, $format, $langcode = NULL, $cache = NULL, $cache_id = NULL) {
    // Refuse anything that is not valid UTF-8. This is necessary to prevent
    // cross site scripting issues on Internet Explorer 6.
    if (!drupal_validate_utf8($text)) {
      return '';
    }

    // Load common functions, then hand the filter options to the tag
    // processor, which keeps them for the callback invocations below.
    module_load_include('inc', 'wysiwyg_filter');
    $filter_options = wysiwyg_filter_get_filter_options($format->format, $filter->settings);
    _wysiwyg_filter_xss_split($filter_options, TRUE);

    // Drop NUL characters (ignored by some browsers).
    $text = str_replace(chr(0), '', $text);
    // Drop Netscape 4 JS entities.
    $text = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $text);
    // Defuse every HTML entity ...
    $text = str_replace('&', '&amp;', $text);

    // ... and then change back only the well-formed ones. The patterns are
    // applied one after the other, in this order: decimal numeric entities,
    // hexadecimal numeric entities, named entities.
    $entity_whitelist = array(
      '/&amp;#([0-9]+;)/' => '&#\1',
      '/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/' => '&#x\1',
      '/&amp;([A-Za-z][A-Za-z0-9]*;)/' => '&\1',
    );
    foreach ($entity_whitelist as $pattern => $replacement) {
      $text = preg_replace($pattern, $replacement, $text);
    }

    // Every piece of markup found is passed to the tag processor.
    $markup_pattern = '%
      (
      <(?=[^a-zA-Z!/]) # a lone <
      | # or
      <!--.*?--> # a comment
      | # or
      <[^>]*(>|$) # a string that starts with a <, up until the > or the end of the string
      | # or
      > # just a >
      )%x';

    return preg_replace_callback($markup_pattern, '_wysiwyg_filter_xss_split', $text);
  }
  /**
   * Processes an HTML tag, or stores the filter options.
   *
   * @param $m
   *   An array whose meaning depends on the value of $store:
   *   - If $store is TRUE, the filter options. They are kept in a static
   *     variable and also handed to _wysiwyg_filter_xss_attributes().
   *   - If $store is FALSE, a match array whose element 1 is the piece of
   *     markup to process.
   * @param $store
   *   Whether to store $m.
   *
   * @return
   *   Nothing when storing. Otherwise an empty string if the element isn't
   *   allowed, or the cleaned up version of the HTML element.
   */
  function _wysiwyg_filter_xss_split($m, $store = FALSE) {
    static $filter_options;

    if ($store) {
      $filter_options = $m;
      _wysiwyg_filter_xss_attributes($filter_options);
      return;
    }

    $fragment = $m[1];

    // Anything not starting with "<" is a lone ">" character.
    if (substr($fragment, 0, 1) != '<') {
      return '&gt;';
    }
    // A single character starting with "<" is a lone "<" character.
    if (strlen($fragment) == 1) {
      return '&lt;';
    }

    $parts = array();
    if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9]+)([^>]*)>?|(<!--.*?-->)$%', $fragment, $parts)) {
      // Seriously malformed.
      return '';
    }

    // Group 4 only exists when the comment alternative matched.
    $comment = isset($parts[4]) ? $parts[4] : NULL;
    if (!empty($comment)) {
      // Allow or disallow HTML comments.
      return empty($filter_options['allow_comments']) ? '' : $comment;
    }

    $tag = strtolower($parts[2]);
    if (!isset($filter_options['valid_elements'][$tag])) {
      // Disallowed HTML element.
      return '';
    }

    // Closing tags are rebuilt without anything else.
    if (trim($parts[1]) != '') {
      return '</' . $tag . '>';
    }

    // Strip a closing XHTML slash from the end of the attributes, and remember
    // whether there was one so it can be put back on the rebuilt tag.
    $slashes_removed = 0;
    $attributes = preg_replace('%(\s?)/\s*$%', '\1', $parts[3], -1, $slashes_removed);
    $self_closing = $slashes_removed > 0 ? ' /' : '';

    // Clean up attributes. FALSE means the element lacks a required attribute.
    $clean = _wysiwyg_filter_xss_attributes($attributes, $tag);
    if ($clean === FALSE) {
      return '';
    }

    $rendered = preg_replace('/[<>]/', '', implode(' ', $clean));
    if (strlen($rendered)) {
      $rendered = ' ' . $rendered;
    }

    return '<' . $tag . $rendered . $self_closing . '>';
  }
  /**
   * Processes a string of HTML attributes.
   *
   * The function has two modes, selected by the type of $attr:
   * - Array: the filter options are stored in a static variable for use by
   *   later calls, and nothing is returned.
   * - String: the attributes list of one HTML element is parsed and validated
   *   against the stored filter options.
   *
   * @param mixed $attr
   *   String with the attributes list to be checked, or array with the filter
   *   options to store. Option keys read here: 'valid_elements',
   *   'style_properties', 'style_urls', 'valid_classes', 'valid_ids',
   *   'nofollow_policy' and 'nofollow_domains'.
   * @param string $element
   *   Current element for the specified attributes list; used to look up its
   *   attribute rules in the 'valid_elements' option.
   *
   * @return
   *   Nothing when storing filter options. Otherwise FALSE when the element
   *   lacks a required attribute, or an array keyed by attribute name holding
   *   the cleaned up attributes, each rebuilt as name, name="value" or
   *   name='value'.
   */
  function _wysiwyg_filter_xss_attributes($attr, $element = '') {
    static $filter_options;

    if (is_array($attr)) {
      $filter_options = $attr;
      return;
    }

    // Shortcuts for filter options. Missing or non-array entries fall back to
    // empty lists so that the loops and emptiness checks below never operate
    // on NULL.
    $allowed_attributes = array();
    if (isset($filter_options['valid_elements'][$element]) && is_array($filter_options['valid_elements'][$element])) {
      $allowed_attributes = $filter_options['valid_elements'][$element];
    }
    $option_lists = array();
    foreach (array('style_properties', 'style_urls', 'valid_classes', 'valid_ids', 'nofollow_domains') as $option_key) {
      $option_lists[$option_key] = (isset($filter_options[$option_key]) && is_array($filter_options[$option_key])) ? $filter_options[$option_key] : array();
    }
    $allowed_properties = $option_lists['style_properties'];
    $allowed_style_urls = $option_lists['style_urls'];
    $allowed_class_names = $option_lists['valid_classes'];
    $allowed_element_ids = $option_lists['valid_ids'];
    $nofollow_domains = $option_lists['nofollow_domains'];
    $nofollow_policy = isset($filter_options['nofollow_policy']) ? $filter_options['nofollow_policy'] : 'disabled';

    // Tokenizer state: 0 expects an attribute name, 1 an equals sign (or the
    // end of a valueless attribute), 2 an attribute value.
    $attrarr = array();
    $mode = 0;
    $attrname = '';
    // TRUE while the attribute being parsed must be dropped.
    $skip = FALSE;

    while (strlen($attr) != 0) {
      // Was the last operation successful?
      $working = 0;

      switch ($mode) {
        case 0:
          // Attribute name, href for instance.
          if (preg_match('/^([-a-zA-Z]+)/', $attr, $match)) {
            $attrname = strtolower($match[1]);
            // Event handlers (on*) are always dropped; other attributes are
            // dropped unless whitelisted by name or through the '*' rule.
            $skip = (substr($attrname, 0, 2) == 'on' || (!isset($allowed_attributes[$attrname]) && !isset($allowed_attributes['*'])));
            $working = $mode = 1;
            $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
          }
          break;

        case 1:
          // Equals sign or valueless ("selected").
          if (preg_match('/^\s*=\s*/', $attr)) {
            $working = 1;
            $mode = 2;
            $attr = preg_replace('/^\s*=\s*/', '', $attr);
            break;
          }
          if (preg_match('/^\s+/', $attr)) {
            $working = 1;
            $mode = 0;
            if (!$skip) {
              $attrarr[$attrname] = array();
            }
            $attr = preg_replace('/^\s+/', '', $attr);
          }
          break;

        case 2:
          // Attribute value, a URL after href= for instance.
          if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match)) {
            if (!$skip) {
              $attrarr[$attrname] = array(
                'value' => $match[1],
                'delimiter' => '"',
              );
            }
            $working = 1;
            $mode = 0;
            $attr = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr);
            break;
          }
          if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match)) {
            if (!$skip) {
              $attrarr[$attrname] = array(
                'value' => $match[1],
                'delimiter' => '\'',
              );
            }
            $working = 1;
            $mode = 0;
            $attr = preg_replace("/^'[^']*'(\s+|$)/", '', $attr);
            break;
          }
          if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match)) {
            if (!$skip) {
              $attrarr[$attrname] = array(
                'value' => $match[1],
                'delimiter' => '"',
              );
            }
            $working = 1;
            $mode = 0;
            $attr = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attr);
          }
          break;
      }

      if ($working == 0) {
        // Not well formed, remove and try again.
        // NOTE(review): the pattern below contains an empty alternative (the
        // doubled "|"). It is kept exactly as it was because the parser
        // depends on how it matches; confirm before simplifying.
        $attr = preg_replace('/
          ^
          (
          "[^"]*("|$) # - a string that starts with a double quote, up until the next double quote or the end of the string
          | # or
          \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
          | # or
          \S # - a non-whitespace character
          )* # any number of the above three
          \s* # any number of whitespaces
          /x', '', $attr);
        $mode = 0;
      }
    }

    // The attribute list ends with a valueless attribute like "selected".
    if ($mode == 1 && !$skip) {
      $attrarr[$attrname] = array();
    }

    // Check the current HTML element for required attributes.
    foreach ($allowed_attributes as $attrname => $attrinfo) {
      if (!empty($attrinfo['required']) && !isset($attrarr[$attrname])) {
        // Ignore the whole element if required attribute is not present.
        return FALSE;
      }
      // When no attribute value has been specified in parsed HTML stream,
      // then supply default value if provided by input format settings. The
      // check looks at the parsed attribute ($attrarr), not at the rule, so a
      // value given in the markup is never replaced by the default.
      if (!isset($attrarr[$attrname]['value']) && isset($attrinfo['default'])) {
        $attrarr[$attrname] = array(
          'value' => $attrinfo['default'],
          'delimiter' => '"',
        );
      }
    }

    // Check the current HTML element for additional attribute rules.
    $parsed_attributes = array();
    $add_nofollow = FALSE;
    foreach ($attrarr as $attrname => $attrinfo) {
      $parsed_attribute = $attrname;
      $attribute_options = (isset($allowed_attributes[$attrname]) ? $allowed_attributes[$attrname] : array());

      if (isset($attrinfo['value'])) {
        // Supply forced attribute value as defined by input format?
        if (isset($attribute_options['forced'])) {
          $attrinfo['value'] = $attribute_options['forced'];
        }
        elseif (isset($attribute_options['values']) && !in_array($attrinfo['value'], $attribute_options['values'])) {
          // Ignore attribute if value is not present in whitelist.
          continue;
        }

        // Additional validation of attribute values.
        if ($attrname == 'style') {
          // Ok, let us validate individual style properties (decode entities now).
          $dirty_properties = array_filter(array_map('trim', explode(';', decode_entities($attrinfo['value']))));
          $sanitized_properties = array();
          foreach ($dirty_properties as $dirty_property) {
            // Separate property name from its value.
            if (!preg_match('#^([a-zA-Z][-a-zA-Z]*)\s*:\s*(.*)$#', $dirty_property, $property_matches)) {
              // Ignore properties that do not match the format "property-name: value".
              continue;
            }
            $property_name = strtolower($property_matches[1]);
            $property_value = $property_matches[2];
            if (!isset($allowed_properties[$property_name])) {
              // Ignore property if not whitelisted in filter settings.
              continue;
            }
            // Check style property syntax.
            if (!preg_match($allowed_properties[$property_name], $property_value)) {
              // Ignore property if value does not match syntax rules.
              continue;
            }
            // If property value comes with url(...), then we want to check if it's allowed or not.
            if (strpos($property_value, 'url(') !== FALSE) {
              if (empty($allowed_style_urls)) {
                // Ignore property if no rules have been specified.
                continue;
              }
              // This is like $regexp_uri in wysiwyg_filter_get_style_property_groups(), but it now contains 2 capturing
              // groups [1] for the URL itself (including delimiters) and [2] the first delimiter (if any).
              if (!preg_match('`url\(\s*(([\'"]?)(?:[^)]|(?<=\\\\)\\))+[\'"]?)\s*\)`', $property_value, $url_matches) || empty($url_matches[1])) {
                // Ignore property if found to be malformed here.
                continue;
              }
              if (!empty($url_matches[2])) {
                if (substr($url_matches[1], -1) != $url_matches[2]) {
                  // Ignore property if start and end delimiters don't match.
                  continue;
                }
                // Remove delimiters.
                $url_matches[1] = substr($url_matches[1], 1, -1);
              }
              // Remove backslashes that could have been used to escape parentheses,
              // commas, whitespace characters, single quotes or double quotes.
              // http://www.w3.org/TR/CSS2/syndata.html#uri
              $style_url = preg_replace('`\\\\([(),\'"\s])`', '\1', $url_matches[1]);
              // Ignore property if URL fails the check for bad protocols.
              if (wysiwyg_filter_xss_bad_protocol($style_url) != $style_url) {
                continue;
              }
              // Check URL against advanced filter rules.
              $match_found = FALSE;
              foreach ($allowed_style_urls as $regexp) {
                if (preg_match($regexp, $style_url)) {
                  $match_found = TRUE;
                  break;
                }
              }
              if (!$match_found) {
                // Ignore property if URL does not match any rule.
                continue;
              }
            }
            else {
              // Filter property value for bad protocols (note that property value has already been decoded).
              $property_value = wysiwyg_filter_xss_bad_protocol($property_value);
            }
            // Sanitized property name and value (check_plain'd here).
            $sanitized_properties[] = $property_name . ':' . check_plain($property_value);
          }
          if (empty($sanitized_properties)) {
            // Ignore the whole style attribute if no property remains.
            continue;
          }
          $attrinfo['value'] = implode('; ', $sanitized_properties);
        }
        elseif ($attrname == 'class') {
          // Validate class names based on advanced rules specified in filter settings panel.
          // Note that property value is decoded now and check_plain'd at end. Since the colon
          // sign is not allowed, there's no need here to check for bad protocols.
          $dirty_names = array_filter(array_map('trim', explode(' ', decode_entities($attrinfo['value']))));
          $valid_names = array();
          foreach ($dirty_names as $dirty_name) {
            foreach ($allowed_class_names as $regexp) {
              if (preg_match($regexp, $dirty_name)) {
                $valid_names[] = $dirty_name;
                // One matching rule is enough; stopping here keeps a name
                // that matches several rules from being emitted repeatedly.
                break;
              }
            }
          }
          if (empty($valid_names)) {
            // Ignore attribute if no class name remains after validation.
            continue;
          }
          $attrinfo['value'] = check_plain(implode(' ', $valid_names));
        }
        elseif ($attrname == 'id') {
          // Validate element IDs based on advanced rules specified in filter settings panel.
          // Note that property value is decoded now and check_plain'd at end. Since the colon
          // sign is not allowed, there's no need here to check for bad protocols.
          if (empty($allowed_element_ids)) {
            // Ignore attribute if no rules have been specified.
            continue;
          }
          // Decode value so we can easily check it.
          $attrinfo['value'] = decode_entities($attrinfo['value']);
          // The ID is accepted as soon as it matches one of the rules.
          $match_found = FALSE;
          foreach ($allowed_element_ids as $regexp) {
            if (preg_match($regexp, $attrinfo['value'])) {
              $match_found = TRUE;
              break;
            }
          }
          if (!$match_found) {
            // Ignore attribute if it contains invalid value.
            continue;
          }
          // Element ID is valid, check_plain result.
          $attrinfo['value'] = check_plain($attrinfo['value']);
        }
        elseif ($attrname == 'media') {
          $attrinfo['value'] = check_plain($attrinfo['value']);
        }
        else {
          // All attribute values are checked for bad protocols. This is the same
          // exact method used by Drupal's filter_xss().
          $attrinfo['value'] = filter_xss_bad_protocol($attrinfo['value']);
          // If this is <a href> element, then check domain name for rel="nofollow" policies in effect.
          if ($element == 'a' && $attrname == 'href' && $nofollow_policy != 'disabled' && !$add_nofollow) {
            $domain_found = FALSE;
            foreach ($nofollow_domains as $domain) {
              // Escape the domain so that dots, and any other character that
              // is special in a regular expression, only match literally.
              $domain = preg_quote($domain, '#');
              if (preg_match('#://.*' . $domain . '([^a-z0-9]|$)#i', $attrinfo['value'])) {
                $domain_found = TRUE;
                break;
              }
            }
            if (($nofollow_policy == 'blacklist' && $domain_found) || ($nofollow_policy == 'whitelist' && !$domain_found)) {
              $add_nofollow = TRUE;
            }
          }
        }

        // Fix for IE8 broken handling of ` character.
        if (strpos($attrinfo['value'], '`') !== FALSE) {
          // IE8 quoting would already be triggered by the presence of any "' <>
          if (!preg_match('/["\' <>]/', $attrinfo['value'])) {
            // Trailing space triggers IE8 to correctly quote the value.
            $attrinfo['value'] .= ' ';
          }
        }

        // Build parsed attribute value.
        $parsed_attribute .= '=' . $attrinfo['delimiter'] . $attrinfo['value'] . $attrinfo['delimiter'];
      }

      $parsed_attributes[$attrname] = $parsed_attribute;
    }

    // Do we have a link where rel="nofollow" should be added?
    if ($add_nofollow) {
      if (empty($parsed_attributes['rel']) || strpos($parsed_attributes['rel'], '=') === FALSE) {
        // No rel attribute, or a valueless one that has no closing delimiter
        // to insert before: emit a complete attribute.
        $parsed_attributes['rel'] = 'rel="nofollow"';
      }
      elseif (strpos($parsed_attributes['rel'], 'nofollow') === FALSE) {
        // The attribute was rebuilt above as rel=<delimiter>value<delimiter>,
        // so the new token goes right before the final delimiter.
        $parsed_attributes['rel'] = substr($parsed_attributes['rel'], 0, -1) . ' nofollow' . substr($parsed_attributes['rel'], -1);
      }
    }

    return $parsed_attributes;
  }
  /**
   * Strips disallowed protocols from the start of a style property value.
   *
   * Only http and https are allowed here. Any other leading "protocol:" prefix
   * is removed, repeatedly, until the string starts with an allowed protocol
   * or no longer starts with something that can be a protocol.
   *
   * This function is based on Drupal's filter_xss_bad_protocol(). Differences:
   * 1) It does not decode the input string; the caller should do that before
   *    calling this function.
   * 2) It does not apply check_plain() to the result; the caller should do
   *    that after calling this function.
   * 3) It allows a lot less protocols.
   *
   * @param $string
   *   The string with the style property value.
   *
   * @return
   *   Cleaned up version of $string.
   */
  function wysiwyg_filter_xss_bad_protocol($string) {
    $safe_schemes = array(
      'http' => 1,
      'https' => 1,
    );

    while (TRUE) {
      $separator = strpos($string, ':');
      if ($separator === FALSE || $separator === 0) {
        // No colon, or nothing in front of it: there is no protocol to check.
        return $string;
      }

      $scheme = substr($string, 0, $separator);

      // If a colon is preceded by a slash, question mark or hash, it cannot
      // possibly be part of the URL scheme. This must be a relative URL,
      // which inherits the (safe) protocol of the base document.
      if (preg_match('![/?#]!', $scheme)) {
        return $string;
      }

      // Per RFC2616, section 3.2.3 (URI Comparison) scheme comparison must be
      // case-insensitive.
      if (isset($safe_schemes[strtolower($scheme)])) {
        return $string;
      }

      // Disallowed protocol: drop it together with its colon, then look at
      // whatever remains.
      $string = substr($string, $separator + 1);
    }
  }