wysiwyg_filter.pages.inc 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576
  1. <?php
  2. /**
  3. * @file
  4. * User land code for the WYSIWYG Filter module.
  5. */
  6. /**
  7. * WYSIWYG Filter. Provides filtering of input into accepted HTML.
  8. *
  9. * This function is based on Drupal's filter_xss() with a few additions:
  10. * - Validates HTML input against whitelists of HTML elements, attributes
  11. * and style properties.
  12. * - Optionally apply rel="nofollow" rules to links.
  13. * - Rules for the above can be specified by site administrators from the
  14. * filter settings form.
  15. *
  16. * @param string $text
  17. * HTML text to be filtered.
  18. * @param int $format
  19. * Input format identifier.
  20. * @return string
  21. * Filtered HTML text.
  22. */
  23. function wysiwyg_filter_filter_wysiwyg_process($text, $filter, $format, $langcode = NULL, $cache = NULL, $cache_id = NULL) {
  24. // Only operate on valid UTF-8 strings. This is necessary to prevent cross
  25. // site scripting issues on Internet Explorer 6.
  26. if (!drupal_validate_utf8($text)) {
  27. return '';
  28. }
  29. // Load common functions.
  30. module_load_include('inc', 'wysiwyg_filter');
  31. // Store input filter options.
  32. _wysiwyg_filter_xss_split(wysiwyg_filter_get_filter_options($format->format, $filter->settings), TRUE);
  33. // Remove NUL characters (ignored by some browsers).
  34. $text = str_replace(chr(0), '', $text);
  35. // Remove Netscape 4 JS entities.
  36. $text = preg_replace('%&\s*\{[^}]*(\}\s*;?|$)%', '', $text);
  37. // Defuse all HTML entities.
  38. $text = str_replace('&', '&amp;', $text);
  39. // Change back only well-formed entities in our whitelist
  40. // Decimal numeric entities.
  41. $text = preg_replace('/&amp;#([0-9]+;)/', '&#\1', $text);
  42. // Hexadecimal numeric entities.
  43. $text = preg_replace('/&amp;#[Xx]0*((?:[0-9A-Fa-f]{2})+;)/', '&#x\1', $text);
  44. // Named entities.
  45. $text = preg_replace('/&amp;([A-Za-z][A-Za-z0-9]*;)/', '&\1', $text);
  46. // Preg modifiers:
  47. // - x=extended (pattern with comments)
  48. // - s=dotall (here for multiline comments)
  49. // - m=multiline (so $ only matches EOF)
  50. // - u=unicode
  51. return preg_replace_callback('%
  52. (
  53. <(?=[^a-zA-Z!/]) # a lone <
  54. | # or
  55. <!--.*?--> # a comment
  56. | # or
  57. < # a string that starts with a <
  58. ( # ...and contains any number of
  59. "[^"]*" # double-quoted strings
  60. |
  61. \'[^\']*\' # single-quoted strings
  62. |
  63. [^"\'>] # any other char
  64. )*
  65. (>|$) # up until the > or the end of the string
  66. | # or
  67. > # just a >
  68. )%xsmu', '_wysiwyg_filter_xss_split', $text);
  69. }
  70. /**
  71. * Processes an HTML tag.
  72. *
  73. * @param $m
  74. * An array with various meaning depending on the value of $store.
  75. * If $store is TRUE then the array contains the allowed tags.
  76. * If $store is FALSE then the array has one element, the HTML tag to process.
  77. * @param $store
  78. * Whether to store $m.
  79. * @return
  80. * If the element isn't allowed, an empty string. Otherwise, the cleaned up
  81. * version of the HTML element.
  82. */
  83. function _wysiwyg_filter_xss_split($m, $store = FALSE) {
  84. static $filter_options;
  85. if ($store) {
  86. _wysiwyg_filter_xss_attributes($filter_options = $m);
  87. return;
  88. }
  89. $string = $m[1];
  90. if (substr($string, 0, 1) != '<') {
  91. // We matched a lone ">" character
  92. return '&gt;';
  93. }
  94. else if (strlen($string) == 1) {
  95. // We matched a lone "<" character
  96. return '&lt;';
  97. }
  98. if (!preg_match('%^<\s*(/\s*)?([a-zA-Z0-9-/]+)([^>]*)>?|(<!--.*?-->)$%', $string, $matches)) {
  99. // Seriously malformed
  100. return '';
  101. }
  102. $slash = trim($matches[1]);
  103. $elem = strtolower($matches[2]);
  104. $attrlist = &$matches[3];
  105. $comment = &$matches[4];
  106. // Convert synonyms to the element they get converted to.
  107. if (!empty($filter_options['valid_elements'][$elem]) && is_string($filter_options['valid_elements'][$elem])) {
  108. $elem = $filter_options['valid_elements'][$elem];
  109. }
  110. if (!empty($comment)) {
  111. // Allow or disallow HTML comments.
  112. return (!empty($filter_options['allow_comments']) ? $comment : '');
  113. }
  114. elseif (!isset($filter_options['valid_elements'][$elem])) {
  115. // Disallowed HTML element.
  116. return '';
  117. }
  118. if ($slash != '') {
  119. return "</$elem>";
  120. }
  121. // Is there a closing XHTML slash at the end of the attributes?
  122. // In PHP 5.1.0+ we could count the changes, currently we need a separate match
  123. $xhtml_slash = preg_match('%\s?/\s*$%', $attrlist) ? ' /' : '';
  124. $attrlist = preg_replace('%(\s?)/\s*$%', '\1', $attrlist);
  125. // Clean up attributes
  126. if (($attr2 = _wysiwyg_filter_xss_attributes($attrlist, $elem)) === FALSE) {
  127. // Disallowed HTML element because it does not contain required attribute.
  128. return '';
  129. }
  130. $attr2 = implode(' ', $attr2);
  131. $attr2 = preg_replace('/[<>]/', '', $attr2);
  132. $attr2 = strlen($attr2) ? ' ' . $attr2 : '';
  133. return "<$elem$attr2$xhtml_slash>";
  134. }
  135. /**
  136. * Processes a string of HTML attributes.
  137. *
  138. * @param mixed $attr
  139. * String with attributes list to be checked.
  140. * Array with whitelist of all HTML elements and their allowed attributes.
  141. * @param string $element
  142. * Current element for specified attributes lists.
  143. * @return
  144. * Cleaned up version of the HTML attributes.
  145. */
  146. function _wysiwyg_filter_xss_attributes($attr, $element = '') {
  147. static $filter_options;
  148. if (is_array($attr)) {
  149. $filter_options = $attr;
  150. return;
  151. }
  152. // Shortcuts for filter options.
  153. $allowed_attributes = &$filter_options['valid_elements'][$element];
  154. $allowed_properties = &$filter_options['style_properties'];
  155. if ($filter_options['rule_bypass_style_urls']) {
  156. $allowed_style_urls = array();
  157. }
  158. else {
  159. $allowed_style_urls = &$filter_options['style_urls'];
  160. }
  161. $bypass_valid_classes = $filter_options['rule_bypass_valid_classes'];
  162. if (!$bypass_valid_classes) {
  163. $allowed_class_names = &$filter_options['valid_classes'];
  164. }
  165. $bypass_valid_ids = $filter_options['rule_bypass_valid_ids'];
  166. if ($bypass_valid_ids) {
  167. $allowed_element_ids = array('/.*/');
  168. }
  169. else {
  170. $allowed_element_ids = &$filter_options['valid_ids'];
  171. }
  172. $nofollow_policy = &$filter_options['nofollow_policy'];
  173. $nofollow_domains = &$filter_options['nofollow_domains'];
  174. $attrarr = array();
  175. $mode = 0;
  176. $attrname = '';
  177. while (strlen($attr) != 0) {
  178. // Was the last operation successful?
  179. $working = 0;
  180. switch ($mode) {
  181. case 0:
  182. // Attribute name, href for instance.
  183. if (preg_match('/^([-a-zA-Z]+)/', $attr, $match)) {
  184. $attrname = strtolower($match[1]);
  185. $skip = (substr($attrname, 0, 2) == 'on' || (!isset($allowed_attributes[$attrname]) && !isset($allowed_attributes['*'])));
  186. $working = $mode = 1;
  187. $attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
  188. }
  189. break;
  190. case 1:
  191. // Equals sign or valueless ("selected").
  192. if (preg_match('/^\s*=\s*/', $attr)) {
  193. $working = 1;
  194. $mode = 2;
  195. $attr = preg_replace('/^\s*=\s*/', '', $attr);
  196. break;
  197. }
  198. if (preg_match('/^\s+/', $attr)) {
  199. $working = 1;
  200. $mode = 0;
  201. if (!$skip) {
  202. $attrarr[$attrname] = array();
  203. }
  204. $attr = preg_replace('/^\s+/', '', $attr);
  205. }
  206. break;
  207. case 2:
  208. // Attribute value, a URL after href= for instance.
  209. if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match)) {
  210. if (!$skip) {
  211. $attrarr[$attrname] = array(
  212. 'value' => $match[1],
  213. 'delimiter' => '"',
  214. );
  215. }
  216. $working = 1;
  217. $mode = 0;
  218. $attr = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr);
  219. break;
  220. }
  221. if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match)) {
  222. if (!$skip) {
  223. $attrarr[$attrname] = array(
  224. 'value' => $match[1],
  225. 'delimiter' => '\'',
  226. );
  227. }
  228. $working = 1;
  229. $mode = 0;
  230. $attr = preg_replace("/^'[^']*'(\s+|$)/", '', $attr);
  231. break;
  232. }
  233. if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match)) {
  234. if (!$skip) {
  235. $attrarr[$attrname] = array(
  236. 'value' => $match[1],
  237. 'delimiter' => '"',
  238. );
  239. }
  240. $working = 1;
  241. $mode = 0;
  242. $attr = preg_replace("%^[^\s\"']+(\s+|$)%", '', $attr);
  243. }
  244. break;
  245. }
  246. if ($working == 0) {
  247. // not well formed, remove and try again.
  248. $attr = preg_replace('/
  249. ^
  250. (
  251. "[^"]*("|$) # - a string that starts with a double quote, up until the next double quote or the end of the string
  252. | # or
  253. \'[^\']*(\'|$)| # - a string that starts with a quote, up until the next quote or the end of the string
  254. | # or
  255. \S # - a non-whitespace character
  256. )* # any number of the above three
  257. \s* # any number of whitespaces
  258. /x', '', $attr);
  259. $mode = 0;
  260. }
  261. }
  262. // The attribute list ends with a valueless attribute like "selected".
  263. // is_array() ensures this isn't run for synonyms.
  264. if ($mode == 1 && !$skip && is_array($attrarr[$attrname])) {
  265. $attrarr[$attrname] = array();
  266. }
  267. // Check the current HTML element for required attributes.
  268. foreach ($allowed_attributes as $attrname => $attrinfo) {
  269. if (!empty($attrinfo['required']) && empty($attrarr[$attrname]['value'])) {
  270. // Ignore the whole element if required attribute is not present.
  271. return FALSE;
  272. }
  273. // When no attribute value has been specified in parsed HTML stream,
  274. // then supply default value if provided by input format settings.
  275. if (!isset($attrinfo['value']) && isset($allowed_attributes[$attrname]['default'])) {
  276. $attrarr[$attrname] = array(
  277. 'value' => $allowed_attributes[$attrname]['default'],
  278. 'delimiter' => '"',
  279. );
  280. }
  281. }
  282. // Check the current HTML element for additional attribute rules.
  283. $parsed_attributes = array();
  284. $add_nofollow = FALSE;
  285. foreach ($attrarr as $attrname => $attrinfo) {
  286. $parsed_attribute = $attrname;
  287. $attribute_options = (isset($allowed_attributes[$attrname]) ? $allowed_attributes[$attrname] : array());
  288. if (isset($attrinfo['value'])) {
  289. // Supply forced attribute value as defined by input format?
  290. if (isset($attribute_options['forced'])) {
  291. $attrinfo['value'] = $attribute_options['forced'];
  292. }
  293. else if (isset($attribute_options['values']) && !in_array($attrinfo['value'], $attribute_options['values'])) {
  294. // Ignore attribute if value is not present in whitelist.
  295. continue;
  296. }
  297. // Additional validation of attribute values.
  298. if ($attrname == 'style') {
  299. // Ok, let us validate individual style properties (decode entities now).
  300. $dirty_properties = array_filter(array_map('trim', explode(';', decode_entities($attrinfo['value']))));
  301. $sanitized_properties = array();
  302. foreach ($dirty_properties as $dirty_property) {
  303. // Separate property name from its value.
  304. if (!preg_match('#^([a-zA-Z][-a-zA-Z]*)\s*:\s*(.*)$#', $dirty_property, $property_matches)) {
  305. // Ignore properties that do not match the format "property-name: value".
  306. continue;
  307. }
  308. $property_name = strtolower($property_matches[1]);
  309. $property_value = &$property_matches[2];
  310. if (!isset($allowed_properties[$property_name])) {
  311. // Ignore property if not whitelisted in filter settings.
  312. continue;
  313. }
  314. // Check style property syntax.
  315. if (!preg_match($allowed_properties[$property_name], $property_value)) {
  316. // Ignore property if value does not match syntax rules.
  317. continue;
  318. }
  319. // If property value comes with url(...), then we want to check if it's allowed or not.
  320. if (strpos($property_value, 'url(') !== FALSE) {
  321. if (count($allowed_style_urls) <= 0) {
  322. // Ignore property if no rules have been specified.
  323. continue;
  324. }
  325. // This is like $regexp_uri in wysiwyg_filter_get_style_property_groups(), but it now contains 2 capturing
  326. // groups [1] for the URL itself (including delimiters) and [2] the first delimiter (if any).
  327. if (!preg_match('`url\(\s*(([\'"]?)(?:[^)]|(?<=\\\\)\\))+[\'"]?)\s*\)`', $property_value, $url) || empty($url[1])) {
  328. // Ignore property if found to be malformed here.
  329. continue;
  330. }
  331. if (!empty($url[2])) {
  332. if (substr($url[1], -1) != $url[2]) {
  333. // Ignore property if start and end delimiters don't match.
  334. continue;
  335. }
  336. // Remove delimiters.
  337. $url[1] = substr($url[1], 1, -1);
  338. }
  339. // Remove backslashes that could have been used to escape parentheses,
  340. // commas, whitespace characters, single quotes or double quotes.
  341. // http://www.w3.org/TR/CSS2/syndata.html#uri
  342. $url = preg_replace('`\\\\([(),\'"\s])`', '\1', $url[1]);
  343. // Ignore property if URL fails the check for bad protocols.
  344. if (wysiwyg_filter_xss_bad_protocol($url) != $url) {
  345. continue;
  346. }
  347. // Check URL against advanced filter rules.
  348. $match_found = FALSE;
  349. foreach ($allowed_style_urls as $regexp) {
  350. if (preg_match($regexp, $url)) {
  351. $match_found = TRUE;
  352. break;
  353. }
  354. }
  355. if (!$match_found) {
  356. // Ignore property if URL does not match any rule.
  357. continue;
  358. }
  359. }
  360. else {
  361. // Filter property value for bad protocols (note that property value has already been decoded).
  362. $property_value = wysiwyg_filter_xss_bad_protocol($property_value);
  363. }
  364. // Sanitized property name and value (check_plain'd here).
  365. $sanitized_properties[] = $property_name . ':' . check_plain($property_value);
  366. }
  367. if (empty($sanitized_properties)) {
  368. // Ignore the whole style attribute if no property remains.
  369. continue;
  370. }
  371. $attrinfo['value'] = implode('; ', $sanitized_properties);
  372. }
  373. else if ($attrname == 'class') {
  374. // Validate class names based on advanced rules specified in filter settings panel.
  375. // Note that property value is decoded now and check_plain'd at end. Since the colon
  376. // sign is not allowed, there's no need here to check for bad protocols.
  377. $dirty_names = array_filter(array_map('trim', explode(' ', decode_entities($attrinfo['value']))));
  378. $valid_names = array();
  379. if ($bypass_valid_classes) {
  380. $valid_names = $dirty_names;
  381. }
  382. else {
  383. foreach ($dirty_names as $dirty_name) {
  384. foreach ($allowed_class_names as $regexp) {
  385. if (preg_match($regexp, $dirty_name)) {
  386. $valid_names[] = $dirty_name;
  387. }
  388. }
  389. }
  390. }
  391. if (empty($valid_names)) {
  392. // Ignore attribute if no class name remains after validation.
  393. continue;
  394. }
  395. $attrinfo['value'] = check_plain(implode(' ', $valid_names));
  396. }
  397. else if ($attrname == 'id') {
  398. // Validate element IDs based on advanced rules specified in filter settings panel.
  399. // Note that property value is decoded now and check_plain'd at end. Since the colon
  400. // sign is not allowed, there's no need here to check for bad protocols.
  401. if (count($allowed_element_ids) <= 0) {
  402. // Ignore attribute if no rules have been specified.
  403. continue;
  404. }
  405. // Decode value so we can easilly check it.
  406. $attrinfo['value'] = decode_entities($attrinfo['value']);
  407. // Pattern starts valid, but it should match all specified rules.
  408. $match_found = FALSE;
  409. foreach ($allowed_element_ids as $regexp) {
  410. if (preg_match($regexp, $attrinfo['value'])) {
  411. $match_found = TRUE;
  412. break;
  413. }
  414. }
  415. if (!$match_found) {
  416. // Ignore attribute if it contains invalid value.
  417. continue;
  418. }
  419. // Element ID is valid, check_plain result.
  420. $attrinfo['value'] = check_plain($attrinfo['value']);
  421. }
  422. elseif ($attrname == 'media') {
  423. $attrinfo['value'] = check_plain($attrinfo['value']);
  424. }
  425. else {
  426. // All attribute values are checked for bad protocols. This is the same
  427. // exact method used by Drupal's filter_xss().
  428. $attrinfo['value'] = filter_xss_bad_protocol($attrinfo['value']);
  429. // If this is <a href> element, then check domain name for rel="nofollow" policies in effect.
  430. if ($element == 'a' && $attrname == 'href' && $nofollow_policy != 'disabled' && !$add_nofollow) {
  431. $domain_found = FALSE;
  432. if ($nofollow_policy == 'whitelist_current') {
  433. global $base_url;
  434. $parts = parse_url($base_url);
  435. $nofollow_domains = array($parts['host']);
  436. }
  437. foreach ($nofollow_domains as $domain) {
  438. $domain = str_replace('.', '\.', $domain); // escape dots
  439. if (preg_match('#://.*' . $domain . '([^a-z0-9]|$)#i', $attrinfo['value'])) {
  440. $domain_found = TRUE;
  441. break;
  442. }
  443. }
  444. $link_is_relative = !parse_url($attrinfo['value'], PHP_URL_HOST);
  445. if (($nofollow_policy == 'blacklist' && $domain_found) || (($nofollow_policy == 'whitelist' || $nofollow_policy == 'whitelist_current') && !$domain_found && !$link_is_relative)) {
  446. $add_nofollow = TRUE;
  447. }
  448. }
  449. }
  450. // Fix for IE8 broken handling of ` character.
  451. if (strpos($attrinfo['value'], '`') !== FALSE) {
  452. // IE8 quoting would already be triggered by the presence of any "' <>
  453. if (!preg_match('/["\' <>]/', $attrinfo['value'])) {
  454. // Trailing space triggers IE8 to correctly quote the value.
  455. $attrinfo['value'] .= ' ';
  456. }
  457. }
  458. // Build parsed attribute value.
  459. $parsed_attribute .= '=' . $attrinfo['delimiter'] . $attrinfo['value'] . $attrinfo['delimiter'];
  460. }
  461. $parsed_attributes[$attrname] = $parsed_attribute;
  462. }
  463. // Do we have a link where rel="nofollow" should be added?
  464. if ($add_nofollow) {
  465. if (empty($parsed_attributes['rel'])) {
  466. $parsed_attributes['rel'] = 'rel="nofollow"';
  467. }
  468. else if (strpos($parsed_attributes['rel'], 'nofollow') === FALSE) {
  469. // Since we know the attribute is well formed, we can use substr(), which is faster than preg_replace().
  470. $parsed_attributes['rel'] = substr($parsed_attributes['rel'], 0, -1) . ' nofollow' . substr($parsed_attributes['rel'], -1);
  471. }
  472. }
  473. return $parsed_attributes;
  474. }
  475. /**
  476. * Processes an style property value and ensures it does not contain an URL
  477. * with a disallowed protocol (only http/https are allowed here).
  478. *
  479. * This function is based on Drupal's filter_xss_bad_protocol(). Differences are:
  480. * 1) It does not decode input string.
  481. * It should be done by the caller before calling us.
  482. * 2) It does not apply check_plain() to result.
  483. * It should be done by the caller after calling us.
  484. * 3) It allows a lot less protocols.
  485. *
  486. * @param $string
  487. * The string with the style property value.
  488. * @return
  489. * Cleaned up version of $string.
  490. */
  491. function wysiwyg_filter_xss_bad_protocol($string) {
  492. $allowed_protocols = array(
  493. 'http' => 1,
  494. 'https' => 1,
  495. );
  496. // Iteratively remove any invalid protocol found.
  497. do {
  498. $before = $string;
  499. $colonpos = strpos($string, ':');
  500. if ($colonpos > 0) {
  501. // We found a colon, possibly a protocol. Verify.
  502. $protocol = substr($string, 0, $colonpos);
  503. // If a colon is preceded by a slash, question mark or hash, it cannot
  504. // possibly be part of the URL scheme. This must be a relative URL,
  505. // which inherits the (safe) protocol of the base document.
  506. if (preg_match('![/?#]!', $protocol)) {
  507. break;
  508. }
  509. // Per RFC2616, section 3.2.3 (URI Comparison) scheme comparison must be case-insensitive
  510. // Check if this is a disallowed protocol.
  511. if (!isset($allowed_protocols[strtolower($protocol)])) {
  512. $string = substr($string, $colonpos + 1);
  513. }
  514. }
  515. } while ($before != $string);
  516. return $string;
  517. }