cleanstring.inc 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. <?php
  2. /**
  3. * @file
  4. * Helper class to clean strings to make them URL safe and translatable.
  5. *
  6. * This was copied directly from pathauto and put here to be made available
  7. * to all, because more things than just pathauto want URL safe strings.
  8. *
  9. * To use, simply:
  10. * @code
  11. * ctools_include('cleanstring');
  12. * $output = ctools_cleanstring($string);
  13. * @endcode
  14. *
  15. * Settings (second argument, an array) cover words to ignore, punctuation
  16. * handling, length limits and more; see ctools_cleanstring() for options.
  17. */
  18. /**
  19. * Character class body matching characters that are NOT letters or numbers.
  20. *
  21. * See: http://www.unicode.org/Public/UNIDATA/UCD.html#General_Category_Values.
  22. *
  23. * The ranges below are intended to leave out these Unicode categories:
  24. * Lu Letter, Uppercase
  25. * Ll Letter, Lowercase
  26. * Lt Letter, Titlecase
  27. * Lo Letter, Other
  28. * Nd Number, Decimal Digit
  29. * No Number, Other
  30. * Copied from search.module's PREG_CLASS_SEARCH_EXCLUDE; negate it, as in
  31. * '[^' . CTOOLS_PREG_CLASS_ALNUM . ']', to match alphanumeric characters.
  32. */
  33. define('CTOOLS_PREG_CLASS_ALNUM',
  34. '\x{0}-\x{2f}\x{3a}-\x{40}\x{5b}-\x{60}\x{7b}-\x{bf}\x{d7}\x{f7}\x{2b0}-' .
  35. '\x{385}\x{387}\x{3f6}\x{482}-\x{489}\x{559}-\x{55f}\x{589}-\x{5c7}\x{5f3}-' .
  36. '\x{61f}\x{640}\x{64b}-\x{65e}\x{66a}-\x{66d}\x{670}\x{6d4}\x{6d6}-\x{6ed}' .
  37. '\x{6fd}\x{6fe}\x{700}-\x{70f}\x{711}\x{730}-\x{74a}\x{7a6}-\x{7b0}\x{901}-' .
  38. '\x{903}\x{93c}\x{93e}-\x{94d}\x{951}-\x{954}\x{962}-\x{965}\x{970}\x{981}-' .
  39. '\x{983}\x{9bc}\x{9be}-\x{9cd}\x{9d7}\x{9e2}\x{9e3}\x{9f2}-\x{a03}\x{a3c}-' .
  40. '\x{a4d}\x{a70}\x{a71}\x{a81}-\x{a83}\x{abc}\x{abe}-\x{acd}\x{ae2}\x{ae3}' .
  41. '\x{af1}-\x{b03}\x{b3c}\x{b3e}-\x{b57}\x{b70}\x{b82}\x{bbe}-\x{bd7}\x{bf0}-' .
  42. '\x{c03}\x{c3e}-\x{c56}\x{c82}\x{c83}\x{cbc}\x{cbe}-\x{cd6}\x{d02}\x{d03}' .
  43. '\x{d3e}-\x{d57}\x{d82}\x{d83}\x{dca}-\x{df4}\x{e31}\x{e34}-\x{e3f}\x{e46}-' .
  44. '\x{e4f}\x{e5a}\x{e5b}\x{eb1}\x{eb4}-\x{ebc}\x{ec6}-\x{ecd}\x{f01}-\x{f1f}' .
  45. '\x{f2a}-\x{f3f}\x{f71}-\x{f87}\x{f90}-\x{fd1}\x{102c}-\x{1039}\x{104a}-' .
  46. '\x{104f}\x{1056}-\x{1059}\x{10fb}\x{10fc}\x{135f}-\x{137c}\x{1390}-\x{1399}' .
  47. '\x{166d}\x{166e}\x{1680}\x{169b}\x{169c}\x{16eb}-\x{16f0}\x{1712}-\x{1714}' .
  48. '\x{1732}-\x{1736}\x{1752}\x{1753}\x{1772}\x{1773}\x{17b4}-\x{17db}\x{17dd}' .
  49. '\x{17f0}-\x{180e}\x{1843}\x{18a9}\x{1920}-\x{1945}\x{19b0}-\x{19c0}\x{19c8}' .
  50. '\x{19c9}\x{19de}-\x{19ff}\x{1a17}-\x{1a1f}\x{1d2c}-\x{1d61}\x{1d78}\x{1d9b}-' .
  51. '\x{1dc3}\x{1fbd}\x{1fbf}-\x{1fc1}\x{1fcd}-\x{1fcf}\x{1fdd}-\x{1fdf}\x{1fed}-' .
  52. '\x{1fef}\x{1ffd}-\x{2070}\x{2074}-\x{207e}\x{2080}-\x{2101}\x{2103}-\x{2106}' .
  53. '\x{2108}\x{2109}\x{2114}\x{2116}-\x{2118}\x{211e}-\x{2123}\x{2125}\x{2127}' .
  54. '\x{2129}\x{212e}\x{2132}\x{213a}\x{213b}\x{2140}-\x{2144}\x{214a}-\x{2b13}' .
  55. '\x{2ce5}-\x{2cff}\x{2d6f}\x{2e00}-\x{3005}\x{3007}-\x{303b}\x{303d}-\x{303f}' .
  56. '\x{3099}-\x{309e}\x{30a0}\x{30fb}-\x{30fe}\x{3190}-\x{319f}\x{31c0}-\x{31cf}' .
  57. '\x{3200}-\x{33ff}\x{4dc0}-\x{4dff}\x{a015}\x{a490}-\x{a716}\x{a802}\x{a806}' .
  58. '\x{a80b}\x{a823}-\x{a82b}\x{e000}-\x{f8ff}\x{fb1e}\x{fb29}\x{fd3e}\x{fd3f}' .
  59. '\x{fdfc}-\x{fe6b}\x{feff}-\x{ff0f}\x{ff1a}-\x{ff20}\x{ff3b}-\x{ff40}\x{ff5b}-' .
  60. '\x{ff65}\x{ff70}\x{ff9e}\x{ff9f}\x{ffe0}-\x{fffd}');
  /**
   * Clean up a string value provided by a module.
   *
   * The resulting string contains only alphanumerics and separators when the
   * default settings are used. Processing happens in this order: direct
   * replacements, slash removal, transliteration, ASCII reduction, ignored
   * word removal, whitespace replacement, separator trimming and collapsing,
   * truncation, lower-casing.
   *
   * @param $string
   *   A string to clean.
   * @param $settings
   *   An optional array of settings to use.
   *   - 'clean id': If set, modules additionally get a chance to alter the
   *     settings through drupal_alter('ctools_cleanstring_' . CLEAN_ID).
   *   - 'clean slash': If set, slashes will be removed. Defaults to TRUE, so
   *     you have to explicitly set this to FALSE to keep slashes.
   *   - 'ignore words': Words that will be removed rather than made safe.
   *     Either an array of words or a comma-separated string. Matching is
   *     case-insensitive and on whole words only. Defaults to an empty array.
   *   - 'separator': Change spaces and untranslatable characters to this
   *     string. Defaults to '-'. An empty separator simply removes them.
   *   - 'replacements': An array of direct replacements to be made that will
   *     be implemented via strtr(). Defaults to an empty array.
   *   - 'transliterate': If set and the Transliteration module is enabled,
   *     the string is passed through transliteration_get(). A custom array
   *     given here is not applied as replacements; use 'replacements' for
   *     that. Defaults to FALSE.
   *   - 'reduce ascii': If set to TRUE, every run of characters other than
   *     a-z, A-Z, 0-9 and '/' is replaced by the separator. Defaults to TRUE.
   *   - 'max length': If set to a number, reduce the resulting string to this
   *     maximum length. Defaults to no maximum length.
   *   - 'lower case': If set to TRUE, convert the result to lower case.
   *     Defaults to FALSE.
   *   These settings will be passed through drupal_alter.
   *
   * @return
   *   The cleaned string.
   */
  function ctools_cleanstring($string, $settings = array()) {
    $settings += array(
      'clean slash' => TRUE,
      'ignore words' => array(),
      'separator' => '-',
      'replacements' => array(),
      'transliterate' => FALSE,
      'reduce ascii' => TRUE,
      'max length' => FALSE,
      'lower case' => FALSE,
    );

    // Allow modules to make other changes to the settings.
    if (isset($settings['clean id'])) {
      drupal_alter('ctools_cleanstring_' . $settings['clean id'], $settings);
    }
    drupal_alter('ctools_cleanstring', $settings);

    // An alter hook may have removed the separator; treat that like an empty
    // separator instead of passing NULL around.
    $separator = isset($settings['separator']) ? (string) $settings['separator'] : '';

    $output = $string;

    // Do any replacements the user selected up front.
    if (!empty($settings['replacements'])) {
      $output = strtr($output, $settings['replacements']);
    }

    // Remove slashes if instructed to do so.
    if (!empty($settings['clean slash'])) {
      $output = str_replace('/', '', $output);
    }

    if (!empty($settings['transliterate']) && module_exists('transliteration')) {
      $output = transliteration_get($output);
    }

    // Reduce to the subset of ASCII96 letters and numbers. Slashes are allowed
    // through here so that they survive when 'clean slash' is turned off.
    if (!empty($settings['reduce ascii'])) {
      $output = preg_replace('/[^a-zA-Z0-9\/]+/', $separator, $output);
    }

    // Get rid of words that are on the ignore list. The list may be an array
    // (as documented) or a legacy comma-separated string.
    if (!empty($settings['ignore words'])) {
      $ignore_words = $settings['ignore words'];
      if (!is_array($ignore_words)) {
        $ignore_words = explode(',', (string) $ignore_words);
      }

      // Quote every word so that regex metacharacters in it match literally.
      $alternatives = array();
      foreach ($ignore_words as $word) {
        $word = trim((string) $word);
        if ($word !== '') {
          $alternatives[] = preg_quote($word, '/');
        }
      }

      if (!empty($alternatives)) {
        $ignore_re = '\b(?:' . implode('|', $alternatives) . ')\b';
        // Prefer the multibyte-aware engine when it is available.
        if (function_exists('mb_eregi_replace')) {
          $output = mb_eregi_replace($ignore_re, '', $output);
        }
        else {
          $output = preg_replace('/' . $ignore_re . '/i', '', $output);
        }
      }
    }

    // Always replace whitespace with the separator.
    $output = preg_replace('/\s+/', $separator, $output);

    // Trim and collapse separators. The separator is quoted and grouped so
    // that multi-character separators and regex metacharacters are matched as
    // one literal unit. With an empty separator there is nothing to do.
    if ($separator !== '') {
      $seppattern = '(?:' . preg_quote($separator, '/') . ')+';
      // Trim any leading or trailing separators.
      $output = preg_replace('/^' . $seppattern . '|' . $seppattern . '$/', '', $output);
      // Replace multiple separators with a single one.
      $output = preg_replace('/' . $seppattern . '/', $separator, $output);
    }

    // Enforce the maximum component length.
    if (!empty($settings['max length'])) {
      $output = ctools_cleanstring_truncate($output, $settings['max length'], $separator);
    }

    if (!empty($settings['lower case'])) {
      $output = drupal_strtolower($output);
    }

    return $output;
  }
  /**
   * A friendly version of truncate_utf8.
   *
   * Shortens the string to at most $length characters, preferring to cut at
   * the last separator so that the final word is not chopped in half. When no
   * usable separator is found (none at all, or only at position 0), the string
   * is cut hard at $length characters.
   *
   * @param $string
   *   The string to be truncated.
   * @param $length
   *   An integer for the maximum desired length.
   * @param $separator
   *   A string which contains the word boundary such as - or _. May be empty,
   *   in which case the string is always cut hard at $length.
   *
   * @return
   *   The string truncated below the maxlength.
   */
  function ctools_cleanstring_truncate($string, $length, $separator) {
    if (drupal_strlen($string) <= $length) {
      return $string;
    }

    // Leave one more character, so that a separator sitting exactly at the
    // limit still counts as a word boundary.
    $candidate = drupal_substr($string, 0, $length + 1);

    // strrpos() must not be called with an empty needle: it warns on PHP < 8
    // and reports the end of the haystack on PHP >= 8, which would return
    // $length + 1 characters.
    $separator = (string) $separator;
    $last_break = ($separator === '') ? FALSE : strrpos($candidate, $separator);

    // Separator exists AND is not on position 0. strrpos() and substr() both
    // work on byte offsets, so they agree with each other for UTF-8 input.
    if ($last_break) {
      return substr($candidate, 0, $last_break);
    }

    return drupal_substr($string, 0, $length);
  }