transliteration.inc 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. <?php
  2. /**
  3. * @file
  4. * Transliteration processing functions.
  5. */
  /**
   * Transliterates UTF-8 encoded text to US-ASCII.
   *
   * Based on Mediawiki's UtfNormal::quickIsNFCVerify().
   *
   * Only well-formed UTF-8 is looked up in the transliteration tables. Each of
   * the following is replaced by a single $unknown instead:
   * - a stray tail byte (0x80-0xbf) or an 0xfe/0xff byte,
   * - a multi-byte sequence cut short by the end of the text or interrupted by
   *   a byte that is not a tail byte (the interrupting byte is then processed
   *   on its own),
   * - an overlong encoding (e.g. "\xc0\xaf" for "/"), so that malformed input
   *   can never decode to an ASCII character,
   * - a UTF-16 surrogate (U+D800..U+DFFF), a value above U+10FFFF, or any
   *   5- or 6-byte form.
   *
   * @param $string
   *   UTF-8 encoded text input.
   * @param $unknown
   *   Replacement string for characters that do not have a suitable ASCII
   *   equivalent, and for invalid byte sequences.
   * @param $source_langcode
   *   Optional ISO 639 language code that denotes the language of the input and
   *   is used to apply language-specific variations. If the source language is
   *   not known at the time of transliteration, it is recommended to set this
   *   argument to the site default language to produce consistent results.
   *   Otherwise the current display language will be used.
   * @return
   *   Transliterated text.
   */
  function _transliteration_process($string, $unknown = '?', $source_langcode = NULL) {
    // Plain ASCII needs no work at all; return it untouched and skip the table
    // setup below.
    if (!preg_match('/[\x80-\xff]/', $string)) {
      return $string;
    }

    // Number of tail bytes announced by each byte value. Zero means the byte
    // cannot start a multi-byte sequence (ASCII, tail bytes, 0xfe and 0xff).
    static $tail_bytes;
    if (!isset($tail_bytes)) {
      $tail_bytes = array();
      for ($n = 0; $n < 256; $n++) {
        if ($n < 0xc0 || $n >= 0xfe) {
          $tail_bytes[$n] = 0;
        }
        elseif ($n < 0xe0) {
          $tail_bytes[$n] = 1;
        }
        elseif ($n < 0xf0) {
          $tail_bytes[$n] = 2;
        }
        elseif ($n < 0xf8) {
          $tail_bytes[$n] = 3;
        }
        elseif ($n < 0xfc) {
          $tail_bytes[$n] = 4;
        }
        else {
          $tail_bytes[$n] = 5;
        }
      }
    }

    // Smallest code point that legitimately requires a sequence of the given
    // total length. Anything below it is an overlong encoding. Lengths 5 and 6
    // are deliberately absent: they can only encode values above U+10FFFF.
    static $min_ord = array(2 => 0x80, 3 => 0x800, 4 => 0x10000);

    // Chop the text into pure-ASCII and non-ASCII areas; large ASCII parts can
    // be copied wholesale. A non-ASCII area also swallows punctuation, digits
    // and control characters so that it is not split needlessly.
    preg_match_all('/[\x00-\x7f]+|[\x80-\xff][\x00-\x40\x5b-\x5f\x7b-\xff]*/', $string, $matches);

    $result = '';
    foreach ($matches[0] as $str) {
      if (ord($str[0]) < 0x80) {
        // ASCII chunk: copy as is.
        $result .= $str;
        continue;
      }

      // Mixed chunk: walk it byte by byte.
      $size = strlen($str);
      $i = 0;
      while ($i < $size) {
        $c = $str[$i];
        $n = ord($c);
        $remaining = $tail_bytes[$n];

        if (!$remaining) {
          // Either an ASCII byte (kept) or a byte that may not appear here:
          // a tail byte without a head, or 0xfe/0xff.
          $result .= ($n < 0x80) ? $c : $unknown;
          $i++;
          continue;
        }

        // Head byte: its payload bits are whatever is left after the length
        // marker (110xxxxx, 1110xxxx, 11110xxx, ...).
        $length = $remaining + 1;
        $ord = $n & (0xff >> ($remaining + 2));
        $i++;

        // Collect the announced tail bytes (10xxxxxx), six payload bits each.
        while ($remaining && $i < $size && (ord($str[$i]) & 0xc0) === 0x80) {
          $ord = ($ord << 6) | (ord($str[$i]) & 0x3f);
          $i++;
          $remaining--;
        }

        if ($remaining) {
          // The sequence was cut short by the end of the chunk or by a byte
          // that is not a tail byte. Emit one replacement for it. $i is left
          // on the interrupting byte so the next iteration processes it on its
          // own; it may itself be ASCII or a new head byte.
          $result .= $unknown;
          continue;
        }

        // Refuse to look up anything that is not a Unicode scalar value in its
        // shortest form. In particular this stops overlong encodings of ASCII
        // characters from being turned into those characters.
        if (!isset($min_ord[$length])
          || $ord < $min_ord[$length]
          || $ord > 0x10ffff
          || ($ord >= 0xd800 && $ord <= 0xdfff)) {
          $result .= $unknown;
        }
        else {
          $result .= _transliteration_replace($ord, $unknown, $source_langcode);
        }
      }
    }

    return $result;
  }
  /**
   * Looks up the ASCII replacement for a single Unicode code point.
   *
   * Code points are grouped into banks of 256 (the code point shifted right by
   * eight bits). Each bank is read at most once per language from the data file
   * "data/xNN.php" inside the transliteration module directory and is kept in a
   * static cache for the remainder of the request.
   *
   * @param $ord
   *   An ordinal Unicode character code.
   * @param $unknown
   *   Replacement string for characters that do not have a suitable ASCII
   *   equivalent.
   * @param $langcode
   *   Optional ISO 639 language code that denotes the language of the input and
   *   is used to apply language-specific variations. Defaults to the current
   *   display language.
   * @return
   *   ASCII replacement character.
   */
  function _transliteration_replace($ord, $unknown = '?', $langcode = NULL) {
    // Replacement tables, keyed by bank number and then by language code.
    static $tables = array();

    // Fall back to the language of the current page.
    if (!isset($langcode)) {
      global $language;
      $langcode = $language->language;
    }

    $bank = $ord >> 8;

    if (!isset($tables[$bank][$langcode])) {
      $file = drupal_get_path('module', 'transliteration') . '/data/' . sprintf('x%02x', $bank) . '.php';

      // A bank without a data file gets an empty table, so every character in
      // it resolves to $unknown.
      $loaded = array();
      if (file_exists($file)) {
        // The data file is expected to define $base and may define $variant
        // in this function's scope.
        include $file;
        $use_variant = $langcode != 'en' && isset($variant[$langcode]);
        // Language-specific entries take precedence over the base table.
        $loaded = $use_variant ? $variant[$langcode] + $base : $base;
      }
      $tables[$bank][$langcode] = $loaded;
    }

    // Position of the character within its bank.
    $offset = $ord & 255;
    if (isset($tables[$bank][$langcode][$offset])) {
      return $tables[$bank][$langcode][$offset];
    }
    return $unknown;
  }