PhpTransliteration.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. <?php
  2. namespace Drupal\Component\Transliteration;
  3. /**
  4. * Implements transliteration without using the PECL extensions.
  5. *
  6. * Transliterations are done character-by-character, by looking up non-US-ASCII
  7. * characters in a transliteration database.
  8. *
  9. * The database comes from two types of files, both of which are searched for in
  10. * the PhpTransliteration::$dataDirectory directory. First, language-specific
  11. * overrides are searched (see PhpTransliteration::readLanguageOverrides()). If
  12. * there is no language-specific override for a character, the generic
  13. * transliteration character tables are searched (see
  14. * PhpTransliteration::readGenericData()). If looking up the character in the
  15. * generic table results in a NULL value, or an illegal character is
  16. * encountered, then a substitute character is returned.
  17. *
  18. * Some parts of this code were derived from the MediaWiki project's UtfNormal
  19. * class, Copyright © 2004 Brion Vibber <brion@pobox.com>,
  20. * http://www.mediawiki.org/
  21. */
  22. class PhpTransliteration implements TransliterationInterface {
  23. /**
  24. * Directory where data for transliteration resides.
  25. *
  26. * The constructor sets this (by default) to subdirectory 'data' underneath
  27. * the directory where the class's PHP file resides.
  28. *
  29. * @var string
  30. */
  31. protected $dataDirectory;
  32. /**
  33. * Associative array of language-specific character transliteration tables.
  34. *
  35. * The outermost array keys are language codes. For each language code key,
  36. * the value is an array whose keys are Unicode character codes, and whose
  37. * values are the transliterations of those characters to US-ASCII. This is
  38. * set up as needed in PhpTransliteration::replace() by calling
  39. * PhpTransliteration::readLanguageOverrides().
  40. *
  41. * @var array
  42. */
  43. protected $languageOverrides = [];
  44. /**
  45. * Non-language-specific transliteration tables.
  46. *
  47. * Array whose keys are the upper two bytes of the Unicode character, and
  48. * whose values are an array of transliterations for each lower-two bytes
  49. * character code. This is set up as needed in PhpTransliteration::replace()
  50. * by calling PhpTransliteration::readGenericData().
  51. *
  52. * @var array
  53. */
  54. protected $genericMap = [];
  55. /**
  56. * Special characters for ::removeDiacritics().
  57. *
  58. * Characters which have accented variants but their base character
  59. * transliterates to more than one ASCII character require special
  60. * treatment: we want to remove their accent and use the un-
  61. * transliterated base character.
  62. */
  63. protected $fixTransliterateForRemoveDiacritics = [
  64. 'AE' => 'Æ',
  65. 'ae' => 'æ',
  66. 'ZH' => 'Ʒ',
  67. 'zh' => 'ʒ',
  68. ];
  69. /**
  70. * Constructs a transliteration object.
  71. *
  72. * @param string $data_directory
  73. * (optional) The directory where data files reside. If omitted, defaults
  74. * to subdirectory 'data' underneath the directory where the class's PHP
  75. * file resides.
  76. */
  77. public function __construct($data_directory = NULL) {
  78. $this->dataDirectory = (isset($data_directory)) ? $data_directory : __DIR__ . '/data';
  79. }
  80. /**
  81. * {@inheritdoc}
  82. */
  83. public function removeDiacritics($string) {
  84. $result = '';
  85. foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
  86. $code = self::ordUTF8($character);
  87. // These two Unicode ranges include the accented US-ASCII letters, with a
  88. // few characters that aren't accented letters mixed in. So define the
  89. // ranges and the excluded characters.
  90. $range1 = $code > 0x00bf && $code < 0x017f;
  91. $exclusions_range1 = [0x00d0, 0x00d7, 0x00f0, 0x00f7, 0x0138, 0x014a, 0x014b];
  92. $range2 = $code > 0x01cc && $code < 0x0250;
  93. $exclusions_range2 = [0x01DD, 0x01f7, 0x021c, 0x021d, 0x0220, 0x0221, 0x0241, 0x0242, 0x0245];
  94. $replacement = $character;
  95. if (($range1 && !in_array($code, $exclusions_range1)) || ($range2 && !in_array($code, $exclusions_range2))) {
  96. $to_add = $this->lookupReplacement($code, 'xyz');
  97. if (strlen($to_add) === 1) {
  98. $replacement = $to_add;
  99. }
  100. elseif (isset($this->fixTransliterateForRemoveDiacritics[$to_add])) {
  101. $replacement = $this->fixTransliterateForRemoveDiacritics[$to_add];
  102. }
  103. }
  104. $result .= $replacement;
  105. }
  106. return $result;
  107. }
  108. /**
  109. * {@inheritdoc}
  110. */
  111. public function transliterate($string, $langcode = 'en', $unknown_character = '?', $max_length = NULL) {
  112. $result = '';
  113. $length = 0;
  114. $hash = FALSE;
  115. // Replace question marks with a unique hash if necessary. This because
  116. // mb_convert_encoding() replaces all invalid characters with a question
  117. // mark.
  118. if ($unknown_character != '?' && strpos($string, '?') !== FALSE) {
  119. $hash = hash('sha256', $string);
  120. $string = str_replace('?', $hash, $string);
  121. }
  122. // Ensure the string is valid UTF8 for preg_split(). Unknown characters will
  123. // be replaced by a question mark.
  124. $string = mb_convert_encoding($string, 'UTF-8', 'UTF-8');
  125. // Use the provided unknown character instead of a question mark.
  126. if ($unknown_character != '?') {
  127. $string = str_replace('?', $unknown_character, $string);
  128. // Restore original question marks if necessary.
  129. if ($hash !== FALSE) {
  130. $string = str_replace($hash, '?', $string);
  131. }
  132. }
  133. // Split into Unicode characters and transliterate each one.
  134. foreach (preg_split('//u', $string, 0, PREG_SPLIT_NO_EMPTY) as $character) {
  135. $code = self::ordUTF8($character);
  136. if ($code == -1) {
  137. $to_add = $unknown_character;
  138. }
  139. else {
  140. $to_add = $this->replace($code, $langcode, $unknown_character);
  141. }
  142. // Check if this exceeds the maximum allowed length.
  143. if (isset($max_length)) {
  144. $length += strlen($to_add);
  145. if ($length > $max_length) {
  146. // There is no more space.
  147. return $result;
  148. }
  149. }
  150. $result .= $to_add;
  151. }
  152. return $result;
  153. }
  154. /**
  155. * Finds the character code for a UTF-8 character: like ord() but for UTF-8.
  156. *
  157. * @param string $character
  158. * A single UTF-8 character.
  159. *
  160. * @return int
  161. * The character code, or -1 if an illegal character is found.
  162. */
  163. protected static function ordUTF8($character) {
  164. $first_byte = ord($character[0]);
  165. if (($first_byte & 0x80) == 0) {
  166. // Single-byte form: 0xxxxxxxx.
  167. return $first_byte;
  168. }
  169. if (($first_byte & 0xe0) == 0xc0) {
  170. // Two-byte form: 110xxxxx 10xxxxxx.
  171. return (($first_byte & 0x1f) << 6) + (ord($character[1]) & 0x3f);
  172. }
  173. if (($first_byte & 0xf0) == 0xe0) {
  174. // Three-byte form: 1110xxxx 10xxxxxx 10xxxxxx.
  175. return (($first_byte & 0x0f) << 12) + ((ord($character[1]) & 0x3f) << 6) + (ord($character[2]) & 0x3f);
  176. }
  177. if (($first_byte & 0xf8) == 0xf0) {
  178. // Four-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
  179. return (($first_byte & 0x07) << 18) + ((ord($character[1]) & 0x3f) << 12) + ((ord($character[2]) & 0x3f) << 6) + (ord($character[3]) & 0x3f);
  180. }
  181. // Other forms are not legal.
  182. return -1;
  183. }
  184. /**
  185. * Replaces a single Unicode character using the transliteration database.
  186. *
  187. * @param int $code
  188. * The character code of a Unicode character.
  189. * @param string $langcode
  190. * The language code of the language the character is in.
  191. * @param string $unknown_character
  192. * The character to substitute for characters without transliterated
  193. * equivalents.
  194. *
  195. * @return string
  196. * US-ASCII replacement character. If it has a mapping, it is returned;
  197. * otherwise, $unknown_character is returned. The replacement can contain
  198. * multiple characters.
  199. */
  200. protected function replace($code, $langcode, $unknown_character) {
  201. if ($code < 0x80) {
  202. // Already lower ASCII.
  203. return chr($code);
  204. }
  205. // See if there is a language-specific override for this character.
  206. if (!isset($this->languageOverrides[$langcode])) {
  207. $this->readLanguageOverrides($langcode);
  208. }
  209. if (isset($this->languageOverrides[$langcode][$code])) {
  210. return $this->languageOverrides[$langcode][$code];
  211. }
  212. return $this->lookupReplacement($code, $unknown_character);
  213. }
  214. /**
  215. * Look up the generic replacement for a UTF-8 character code.
  216. *
  217. * @param $code
  218. * The UTF-8 character code.
  219. * @param string $unknown_character
  220. * (optional) The character to substitute for characters without entries in
  221. * the replacement tables.
  222. *
  223. * @return string
  224. * US-ASCII replacement characters. If it has a mapping, it is returned;
  225. * otherwise, $unknown_character is returned. The replacement can contain
  226. * multiple characters.
  227. */
  228. protected function lookupReplacement($code, $unknown_character = '?') {
  229. // See if there is a generic mapping for this character.
  230. $bank = $code >> 8;
  231. if (!isset($this->genericMap[$bank])) {
  232. $this->readGenericData($bank);
  233. }
  234. $code = $code & 0xff;
  235. return isset($this->genericMap[$bank][$code]) ? $this->genericMap[$bank][$code] : $unknown_character;
  236. }
  237. /**
  238. * Reads in language overrides for a language code.
  239. *
  240. * The data is read from files named "$langcode.php" in
  241. * PhpTransliteration::$dataDirectory. These files should set up an array
  242. * variable $overrides with an element whose key is $langcode and whose value
  243. * is an array whose keys are character codes, and whose values are their
  244. * transliterations in this language. The character codes can be for any valid
  245. * Unicode character, independent of the number of bytes.
  246. *
  247. * @param $langcode
  248. * Code for the language to read.
  249. */
  250. protected function readLanguageOverrides($langcode) {
  251. // Figure out the file name to use by sanitizing the language code,
  252. // just in case.
  253. $file = $this->dataDirectory . '/' . preg_replace('/[^a-zA-Z\-]/', '', $langcode) . '.php';
  254. // Read in this file, which should set up a variable called $overrides,
  255. // which will be local to this function.
  256. if (is_file($file)) {
  257. include $file;
  258. }
  259. if (!isset($overrides) || !is_array($overrides)) {
  260. $overrides = [$langcode => []];
  261. }
  262. $this->languageOverrides[$langcode] = $overrides[$langcode];
  263. }
  264. /**
  265. * Reads in generic transliteration data for a bank of characters.
  266. *
  267. * The data is read in from a file named "x$bank.php" (with $bank in
  268. * hexadecimal notation) in PhpTransliteration::$dataDirectory. These files
  269. * should set up a variable $bank containing an array whose numerical indices
  270. * are the remaining two bytes of the character code, and whose values are the
  271. * transliterations of these characters into US-ASCII. Note that the maximum
  272. * Unicode character that can be encoded in this way is 4 bytes.
  273. *
  274. * @param $bank
  275. * First two bytes of the Unicode character, or 0 for the ASCII range.
  276. */
  277. protected function readGenericData($bank) {
  278. // Figure out the file name.
  279. $file = $this->dataDirectory . '/x' . sprintf('%02x', $bank) . '.php';
  280. // Read in this file, which should set up a variable called $base, which
  281. // will be local to this function.
  282. if (is_file($file)) {
  283. include $file;
  284. }
  285. if (!isset($base) || !is_array($base)) {
  286. $base = [];
  287. }
  288. // Save this data.
  289. $this->genericMap[$bank] = $base;
  290. }
  291. }