transliteration_data.php.txt 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664
  1. <?php
  2. /**
  3. * @file
  4. * Unifies formats of transliteration data from various sources.
  5. *
  6. * A few notes about this script:
  7. * - The functions in this file are NOT SECURE, because they use PHP functions
  8. * like eval(). Absolutely do not run this script unless you trust the data
  9. * files used for input.
  10. * - You will need to change the name of this file to remove the .txt extension
  11. * before running it (it has been given this name so that you cannot run it
  12. * by mistake). When you do that, move it out of your web root as well so
  13. * that it cannot be run via a URL, and run the script via the PHP command
  14. * at a command prompt.
  15. * - This script, depending on which portions of it you run, depends on having
  16. * input data from various sources in sub-directories below where this file
  17. * is located. The data inputs are as follows:
  18. * - Existing Drupal Core transliteration data: Sub-directory 'data'; comes
  19. * from core/lib/Drupal/Component/Transliteration/data
  20. * - Midgardmvc data: Sub-directory 'utf8_to_ascii_db'; download from
  21. * https://github.com/bergie/midgardmvc_helper_urlize/downloads
  22. * - CPAN Text-Unidecode data: Sub-directory 'Unidecode'; download from
  23. * http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm
  24. * - Node.js project: Sub-directory 'unidecoder_data'; download from
  25. * https://github.com/bitwalker/stringex/downloads
  26. * - JUnidecode project: Sub-directory 'junidecode'; download source from
  27. * http://www.ippatsuman.com/projects/junidecode/index.html
  28. * - You will also need to make directory 'outdata' to hold output.
  29. * - If you plan to use the 'intl' data, you will also need to have the PECL
  30. * packages 'yaml' and 'intl' installed. See
  31. * http://php.net/manual/install.pecl.downloads.php for generic PECL
  32. * package installation instructions. The following commands on Ubuntu Linux
  33. * will install yaml and intl packages:
  34. * @code
  35. * sudo apt-get install libyaml-dev
  36. * sudo pecl install yaml
  37. * sudo apt-get install php5-intl
  38. * sudo apt-get install libicu-dev
  39. * sudo pecl install intl
  40. * @endcode
  41. * After running these commands, you will need to make sure
  42. * 'extension=intl.so' and 'extension=yaml.so' are added to the php.ini file
  43. * that is in use for the PHP command-line command.
  44. * - When you have collected all of the data and installed the required
  45. * packages, you will need to find the specific commands below that you want
  46. * to use and un-comment them. The preferred data source for Drupal Core is
  47. * the PECL 'intl' package, and the line that needs to be un-commented in
  48. * order to make a Drupal Core patch is:
  49. * @code
  50. * patch_drupal('outdata');
  51. * @endcode
  52. * - The functions are documented in more detail in their headers where they
  53. * are defined. Many have parameters that you can use to change the output.
  54. */
  55. // Commands to read various data sources:
  56. // $data = read_drupal_data();
  57. // $data = read_midgard_data();
  58. // $data = read_cpan_data();
  59. // $data = read_nodejs_data();
  60. // $data = read_intl_data();
  61. // $data = read_junidecode_data();
  62. // After running a read_*_data() function, you can print out the data
  63. // (it will make a LOT of output):
  64. // print_r($data);
  65. // Command to read in all of data sources and output in CSV format, explaining
  66. // the differences:
  67. // read_all_to_csv();
  68. // Command to patch Drupal Core data, using the intl data set, and put the
  69. // resulting changed data files in the 'outdata' directory:
  70. patch_drupal('outdata');
  71. /**
  72. * Reads in all transliteration data and outputs differences in CSV format.
  73. *
  74. * Each data set is compared to the Drupal Core reference data set, and the
  75. * differences are noted. The data must be in the locations noted in the
  76. * file header above. The CSV output has several columns. The first one is the
  77. * Unicode character code. The next columns contain the transliteration of
  78. * that character in each of the data sets. The last column, tells what the
  79. * differences are between the Drupal Core reference set and the other data
  80. * sets:
  81. * - missing: The target set is missing data that the Drupal set has.
  82. * - provided: The target set has provided data that Drupal does not have.
  83. * - case: The target and Drupal set output differ only in upper/lower case.
  84. * - different: The target and Drupal set output differ in more than just case.
  85. *
  86. * @param bool $print_all
  87. * TRUE to print all data; FALSE (default) to print just data where there
  88. * are differences between the Drupal set and other data sources.
  89. * @param bool $print_missing
  90. * TRUE to print cases where one of the non-Drupal sets is missing information
  91. * and that is the only difference; FALSE (default) to include these rows.
  92. */
  93. function read_all_to_csv($print_all = FALSE, $print_missing = FALSE) {
  94. $data = array();
  95. $types = array('drupal', 'midgard', 'cpan', 'nodejs', 'junidecode', 'intl');
  96. // Alternatively, if you just want to compare a couple of data sets, you can
  97. // uncomment and edit the following line:
  98. // $types = array('drupal', 'intl');
  99. // Read in all the data.
  100. foreach ($types as $type) {
  101. $data[$type] = call_user_func('read_' . $type . '_data');
  102. }
  103. // Print CSV header row.
  104. print "character,";
  105. print implode(',', $types);
  106. print ",why\n";
  107. // Go through all the banks of character data.
  108. for ($bank = 0; $bank < 256; $bank++) {
  109. // Go through characters in bank; skip pure ASCII characters.
  110. $start = ($bank == 0) ? 0x80 : 0;
  111. for ($chr = $start; $chr < 256; $chr++) {
  112. // Gather the data together for this character.
  113. $row = array();
  114. foreach ($types as $type) {
  115. $row[$type] = (isset($data[$type][$bank][$chr]) && is_string($data[$type][$bank][$chr])) ? $data[$type][$bank][$chr] : '';
  116. }
  117. // Only print if there are differences or we are printing all data.
  118. $print = $print_all;
  119. $ref = $row['drupal'];
  120. $why = array();
  121. foreach ($types as $type) {
  122. // Try to characterize what the differences are.
  123. if ($row[$type] != $ref) {
  124. if ($row[$type] == '') {
  125. $why['missing'] = 'missing';
  126. if ($print_missing) {
  127. $print = TRUE;
  128. }
  129. }
  130. elseif ($ref == '') {
  131. $why['provided'] = 'provided';
  132. $print = TRUE;
  133. }
  134. elseif ($row[$type] == strtolower($ref) || $row[$type] == strtoupper($ref)) {
  135. $why['case'] = 'case';
  136. $print = TRUE;
  137. }
  138. else {
  139. $why['different'] = 'different';
  140. $print = TRUE;
  141. }
  142. }
  143. }
  144. // Print the data line.
  145. if ($print) {
  146. print '0x' . sprintf('%04x', 256 * $bank + $chr) . ',';
  147. foreach ($row as $out) {
  148. print '"' . addcslashes($out, '"') . '", ';
  149. }
  150. print implode(':', $why);
  151. print "\n";
  152. }
  153. }
  154. }
  155. }
  156. /**
  157. * Reads in 'intl' transliteration data and writes out changed Drupal files.
  158. *
  159. * Writes out the Drupal data files that would have to change to make our data
  160. * match the intl data set.
  161. *
  162. * @param string $outdir
  163. * Directory to put the patched data files in (under where the script is
  164. * being run).
  165. */
  166. function patch_drupal($outdir) {
  167. $data = array();
  168. // Note that this is hard-wired below. Changing this line will have no
  169. // effect except to break this function.
  170. $types = array('drupal', 'intl');
  171. // Read in all the data.
  172. foreach ($types as $type) {
  173. $data[$type] = call_user_func('read_' . $type . '_data');
  174. }
  175. // Go through all the banks of character data.
  176. for ($bank = 0; $bank < 256; $bank++) {
  177. $print_bank = FALSE;
  178. // Go through characters in bank; skip pure ASCII characters.
  179. $start = ($bank == 0) ? 0x80 : 0;
  180. $newdata = array();
  181. for ($chr = 0; $chr < 256; $chr++) {
  182. // Fill up the start of the ASCII range.
  183. if ($chr < $start) {
  184. $newdata[$chr] = chr($chr);
  185. continue;
  186. }
  187. // Figure out what characters we actually have.
  188. $drupal = isset($data['drupal'][$bank][$chr]) ? $data['drupal'][$bank][$chr] : NULL;
  189. // Note that for intl, we only want to keep the transliteration if it
  190. // has something other than '' in it.
  191. $intl = isset($data['intl'][$bank][$chr]) && $data['intl'][$bank][$chr] != '' ? $data['intl'][$bank][$chr] : NULL;
  192. // Make sure we have something in the Drupal data set, in case we need
  193. // to print.
  194. $newdata[$chr] = $drupal;
  195. if (!isset($intl)) {
  196. continue;
  197. }
  198. if (!isset($drupal) || $drupal != $intl) {
  199. $print_bank = TRUE;
  200. $newdata[$chr] = $intl;
  201. }
  202. }
  203. // If we found a difference, output a data file.
  204. if ($print_bank) {
  205. write_data_file($newdata, $bank, $outdir);
  206. }
  207. }
  208. }
  209. /**
  210. * Reads in the Drupal Core generic transliteration data set.
  211. *
  212. * The data is expected to be in files xNN.php in directory 'data' under
  213. * this file's directory.
  214. *
  215. * @return array
  216. * Nested array of transliteration data. Outer keys are the first two
  217. * bytes of Unicode characters (or 0 for base ASCII characters). The next
  218. * level is the other two bytes, and the values are the transliterations.
  219. *
  220. * @see PhpTransliteration::readGenericData()
  221. */
  222. function read_drupal_data() {
  223. $dir = __DIR__ . '/data';
  224. $out = array();
  225. // Read data files.
  226. for ($bank = 0; $bank < 256; $bank++) {
  227. $base = array();
  228. $file = $dir . '/x' . sprintf('%02x', $bank) . '.php';
  229. if (is_file($file)) {
  230. include($file);
  231. }
  232. $out[$bank] = $base;
  233. }
  234. return $out;
  235. }
  236. /**
  237. * Reads in the MidgardMVC transliteration data.
  238. *
  239. * The data is expected to be in files xNN.php in directory utf8_to_ascii_db
  240. * under the directory where this file resides. It can be downloaded from
  241. * https://github.com/bergie/midgardmvc_helper_urlize/downloads.
  242. *
  243. * @return array
  244. * Nested array of transliteration data. Outer keys are the first two
  245. * bytes of Unicode characters (or 0 for base ASCII characters). The next
  246. * level is the other two bytes, and the values are the transliterations.
  247. */
  248. function read_midgard_data() {
  249. $dir = __DIR__ . '/utf8_to_ascii_db';
  250. $out = array();
  251. // Read data files.
  252. for ($bank = 0; $bank < 256; $bank++) {
  253. $UTF8_TO_ASCII = array($bank => array());
  254. $file = $dir . '/x' . sprintf('%02x', $bank) . '.php';
  255. if (is_file($file)) {
  256. include($file);
  257. }
  258. $base = $UTF8_TO_ASCII[$bank];
  259. // For unknown characters, these files have '[?]' in them. Replace with
  260. // NULL for compatibility with our data.
  261. $base = array_map('_replace_question_with_null', $base);
  262. $out[$bank] = $base;
  263. }
  264. return $out;
  265. }
  266. /**
  267. * Reads in the CPAN Text::Unidecode data set.
  268. *
  269. * The data is expected to be in files xNN.pm in directory 'Unidecode' under
  270. * this file's directory. It can be downloaded from
  271. * http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm.
  272. *
  273. * @return array
  274. * Nested array of transliteration data. Outer keys are the first two
  275. * bytes of Unicode characters (or 0 for base ASCII characters). The next
  276. * level is the other two bytes, and the values are the transliterations.
  277. */
  278. function read_cpan_data() {
  279. $dir = __DIR__ . '/Unidecode';
  280. $out = array();
  281. // Read data files.
  282. for ($bank = 0; $bank < 256; $bank++) {
  283. $base = array();
  284. $file = $dir . '/x' . sprintf('%02x', $bank) . '.pm';
  285. if (is_file($file)) {
  286. $base = _cpan_read_file($file);
  287. }
  288. $out[$bank] = $base;
  289. }
  290. return $out;
  291. }
  292. /**
  293. * Reads in the data in a single file from the Text::Unidecode CPAN project.
  294. *
  295. * @param string $file
  296. * File to read from.
  297. *
  298. * @return array
  299. * Data read from the file.
  300. *
  301. * @see read_cpan_data()
  302. */
  303. function _cpan_read_file($file) {
  304. $contents = file($file);
  305. $save = '';
  306. foreach ($contents as $line) {
  307. // Discard lines starting with # or $. The first line seems to have a
  308. // comment starting with #, the second has a Perl line like
  309. // $Text::Unidecode::Char[0x04] = [, -- and we do not want either.
  310. if (preg_match('|^\s*[#\$]|', $line)) {
  311. continue;
  312. }
  313. // Discard lines ending with semi-colons, which we also don't want
  314. // (there seem to be two of these lines at the end of the files).
  315. if (preg_match('|;\s*$|', $line)) {
  316. continue;
  317. }
  318. // Replace '[?]' with nothing (that means "don't know how to
  319. // transliterate"). In some files, this is encoded as qq{[?]} or
  320. // qq{[?] } instead.
  321. $line = str_replace('qq{[?]}', 'NULL', $line);
  322. $line = str_replace('qq{[?] }', 'NULL', $line);
  323. $line = str_replace("'[?]'", 'NULL', $line);
  324. // Replace qq{} with either "" or '' or nothing, depending on what is
  325. // inside it.
  326. $line = str_replace('qq{\{}', "'{'", $line);
  327. $line = str_replace('qq{\}}', "'}'", $line);
  328. $line = str_replace('qq{\} }', "'} '", $line);
  329. $line = str_replace("qq{\\\\}", '"\\\\"', $line);
  330. $line = str_replace("qq{\\", "qq{'", $line);
  331. $line = str_replace("qq{\"'}", "\"\\\"'\"", $line);
  332. $line = preg_replace('|qq\{([^\'\}]+)\}|', "'$1'", $line);
  333. $line = preg_replace('|qq\{([^\}]+)\}|', '"$1"', $line);
  334. $save .= $line;
  335. }
  336. // Now we should have a string that looks like:
  337. // 'a', 'b', ...
  338. // Evaluate as an array.
  339. $save = 'return array(' . $save . ');';
  340. $data = @eval($save);
  341. if (isset($data) && is_array($data)) {
  342. $data = array_map('_replace_hex_with_character', $data);
  343. }
  344. else {
  345. // There was a problem, so throw an error and exit.
  346. print "Problem in evaluating $file\n";
  347. print $save;
  348. eval($save);
  349. exit();
  350. }
  351. // For unknown characters, these files may still have '[?]' in them. Replace
  352. // with NULL for compatibility with our data.
  353. $data = array_map('_replace_question_with_null', $data);
  354. return $data;
  355. }
  356. /**
  357. * Reads in the Node.js transliteration data.
  358. *
  359. * The data is expected to be in files xNN.yml in directory unidecoder_data
  360. * under the directory where this file resides. It can be downloaded from
  361. * https://github.com/bitwalker/stringex/downloads. You also need the PECL
  362. * 'yaml' extension installed for this function to work.
  363. *
  364. * @return array
  365. * Nested array of transliteration data. Outer keys are the first two
  366. * bytes of Unicode characters (or 0 for base ASCII characters). The next
  367. * level is the other two bytes, and the values are the transliterations.
  368. */
  369. function read_nodejs_data() {
  370. $dir = __DIR__ . '/unidecoder_data';
  371. $out = array();
  372. // Read data files.
  373. for ($bank = 0; $bank < 256; $bank++) {
  374. $base = array();
  375. $file = $dir . '/x' . sprintf('%02x', $bank) . '.yml';
  376. if (is_file($file)) {
  377. $base = yaml_parse_file($file);
  378. // For unknown characters, these files have '[?]' in them. Replace with
  379. // NULL for compatibility with our data.
  380. $base = array_map('_replace_question_with_null', $base);
  381. }
  382. $out[$bank] = $base;
  383. }
  384. return $out;
  385. }
  386. /**
  387. * Loads the PECL 'intl' Transliterator class's transliteration data.
  388. *
  389. * You need to have the PECL 'intl' package installed for this to work.
  390. *
  391. * @return array
  392. * Nested array of transliteration data. Outer keys are the first two
  393. * bytes of Unicode characters (or 0 for base ASCII characters). The next
  394. * level is the other two bytes, and the values are the transliterations.
  395. */
  396. function read_intl_data() {
  397. // In order to transliterate, you first have to create a transliterator
  398. // object. This needs a list of transliteration operations. You can get a
  399. // list of available operations with:
  400. // print_r(Transliterator::listIDs()); exit();
  401. // And a few of these are documented on
  402. // http://userguide.icu-project.org/transforms/general and
  403. // http://www.unicode.org/reports/tr15/ (for normalizations).
  404. // There are also maps to the Unicode characters at:
  405. // http://www.unicode.org/roadmaps/bmp/
  406. // http://www.unicode.org/charts/nameslist/
  407. $ops = '';
  408. // The first step in any transform: separate out accents and remove them.
  409. $ops .= 'NFD; [:Nonspacing Mark:] Remove; NFC;';
  410. // Then you need to do a bunch of language-specific or script-specific
  411. // transliterations. Here is hopefully a representative set. There are
  412. // quite a few scripts that don't appear to have rules currently, such
  413. // as Etheopian.
  414. $ops .= 'Greek-Latin; ';
  415. $ops .= 'Cyrillic-Latin; ';
  416. $ops .= 'Armenian-Latin; ';
  417. $ops .= 'Hebrew-Latin; ';
  418. $ops .= 'Arabic-Latin; ';
  419. $ops .= 'Syriac-Latin; ';
  420. $ops .= 'Thaana-Latin; ';
  421. $ops .= 'Devanagari-Latin; ';
  422. $ops .= 'Bengali-Latin; ';
  423. $ops .= 'Gurmukhi-Latin; ';
  424. $ops .= 'Gujarati-Latin; ';
  425. $ops .= 'Oriya-Latin; ';
  426. $ops .= 'Tamil-Latin; ';
  427. $ops .= 'Telugu-Latin; ';
  428. $ops .= 'Kannada-Latin; ';
  429. $ops .= 'Malayalam-Latin; ';
  430. $ops .= 'Thai-Latin; ';
  431. $ops .= 'Georgian-Latin; ';
  432. $ops .= 'Hangul-Latin; ';
  433. $ops .= 'Mongolian-Latin/BGN; ';
  434. $ops .= 'Jamo-Latin; ';
  435. $ops .= 'Katakana-Latin; ';
  436. $ops .= 'Any-Latin; ';
  437. // Finally, after transforming to Latin, transform to ASCII.
  438. $ops .= 'Latin-ASCII; ';
  439. // Remove any remaining accents and recompose.
  440. $ops .= 'NFD; [:Nonspacing Mark:] Remove; NFC;';
  441. $trans = Transliterator::create($ops);
  442. $out = array();
  443. // Transliterate all possible characters.
  444. for ($bank = 0; $bank < 256; $bank++) {
  445. $data = array();
  446. for ($chr = 0; $chr < 256; $chr++) {
  447. // Skip the UTF-16 and "private use" ranges completely.
  448. $OK = ($bank <= 0xd8 || $bank > 0xf8);
  449. $result = $OK ? $trans->transliterate(mb_convert_encoding(pack('n', 256 * $bank + $chr), 'UTF-8', 'UTF-16BE')) : '';
  450. // See if we have managed to transliterate this to ASCII or not. If not,
  451. // return NULL instead of this character.
  452. $max = chr(127);
  453. foreach (preg_split('//u', $result, 0, PREG_SPLIT_NO_EMPTY) as $character) {
  454. if ($character > $max) {
  455. $OK = $OK && FALSE;
  456. break;
  457. }
  458. }
  459. $data[$chr] = ($OK) ? $result : NULL;
  460. }
  461. $out[$bank] = $data;
  462. }
  463. return $out;
  464. }
  465. /**
  466. * Reads in the JUnidecode data set.
  467. *
  468. * The data is expected to be in files XNN.java in directory 'junidecode' under
  469. * this file's directory. It can be downloaded from
  470. * http://www.ippatsuman.com/projects/junidecode/index.html
  471. *
  472. * @return array
  473. * Nested array of transliteration data. Outer keys are the first two
  474. * bytes of Unicode characters (or 0 for base ASCII characters). The next
  475. * level is the other two bytes, and the values are the transliterations.
  476. */
  477. function read_junidecode_data() {
  478. $dir = __DIR__ . '/junidecode';
  479. $out = array();
  480. // Read data files.
  481. for ($bank = 0; $bank < 256; $bank++) {
  482. $base = array();
  483. $file = $dir . '/X' . sprintf('%02x', $bank) . '.java';
  484. if (is_file($file)) {
  485. $base = _junidecode_read_file($file);
  486. }
  487. $out[$bank] = $base;
  488. }
  489. return $out;
  490. }
  491. /**
  492. * Reads in the data in a single file from the JUnidecode project.
  493. *
  494. * @param string $file
  495. * File to read from.
  496. *
  497. * @return array
  498. * Data read from the file.
  499. *
  500. * @see read_junidecode_data()
  501. */
  502. function _junidecode_read_file($file) {
  503. $contents = file($file);
  504. $save = '';
  505. foreach ($contents as $line) {
  506. // Discard lines starting with * or / or package or class or public or },
  507. // to get rid of comments and Java code.
  508. if (preg_match('|^\s*[\*/\}]|', $line)) {
  509. continue;
  510. }
  511. if (preg_match('/^\s*package|public|class/', $line)) {
  512. continue;
  513. }
  514. // Some of the lines look like this:
  515. // new String("" + (char) 0x00), // 0x00
  516. // Transform to be '0x00,'
  517. $line = preg_replace('|^\s*new\s+String\s*\(\s*""\s*\+\s*\(char\)\s+0x([0-9]+).*$|', '0x$1,', $line);
  518. // Strings are in double quotes, yet many have \' in them.
  519. $line = str_replace("\'", "'", $line);
  520. // Everything else should probably be OK -- the lines are like:
  521. // "Ie", // 0x00
  522. $save .= $line;
  523. }
  524. // Evaluate as an array.
  525. $save = 'return array(' . $save . ');';
  526. $data = @eval($save);
  527. if (isset($data) && is_array($data)) {
  528. $data = array_map('_replace_hex_with_character', $data);
  529. $data = array_map('_replace_question_with_null', $data);
  530. }
  531. else {
  532. // There was a problem, so throw an error and exit.
  533. print "Problem in evaluating $file\n";
  534. print $save;
  535. eval($save);
  536. exit();
  537. }
  538. return $data;
  539. }
  540. /**
  541. * Callback for array_map(): Returns $data, with '[?]' replaced with NULL.
  542. */
  543. function _replace_question_with_null($data) {
  544. return ($data == '[?]' || $data == '[?] ') ? NULL : $data;
  545. }
  546. /**
  547. * Callback for array_map(): Replaces '\xNN' with the actual character.
  548. */
  549. function _replace_hex_with_character($item) {
  550. if (strpos($item, '\x') === 0) {
  551. $item = eval($item);
  552. }
  553. return $item;
  554. }
  555. /**
  556. * Writes a data file out in the standard Drupal Core data format.
  557. *
  558. * @param array $data
  559. * Array of data to write out.
  560. * @param string $bank
  561. * Bank of characters it belongs to.
  562. * @param string $dir
  563. * Output directory.
  564. */
  565. function write_data_file($data, $bank, $outdir) {
  566. $dir = __DIR__ . '/' . $outdir;
  567. $file = $dir . '/x' . sprintf('%02x', $bank) . '.php';
  568. $out = '';
  569. $out .= "<?php\n\n/**\n * @file\n * Generic transliteration data for the PhpTransliteration class.\n */\n\n\$base = array(\n";
  570. // The 00 file skips the ASCII range
  571. $start = 0;
  572. if ($bank == 0) {
  573. $start = 0x80;
  574. $out .= " // Note: to save memory plain ASCII mappings have been left out.\n";
  575. }
  576. for ($line = $start; $line <= 0xf0; $line += 0x10) {
  577. $out .= ' 0x' . sprintf('%02X', $line) . ' =>';
  578. $elems = array_values(array_slice($data, $line, 16));
  579. for ($i = 0; $i < 16; $i++ ) {
  580. if (isset($elems[$i])) {
  581. $out .= " '" . addcslashes($elems[$i], "'\\") . "',";
  582. }
  583. else {
  584. $out .= ' NULL,';
  585. }
  586. }
  587. $out .= "\n";
  588. }
  589. $out .= ");\n";
  590. file_put_contents($file, $out);
  591. }