tex2hpb.js 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711
  1. /* eslint-disable complexity */
  2. /* eslint-disable security/detect-non-literal-fs-filename */
  3. /* eslint-disable no-console, no-sync */
  4. /* eslint-env node */
  5. /*
  6. * Convert TeX-patterns to hpb: tex2hpb – Version 1.0
  7. *
  8. * This tool converts hyphenation patterns from TeX
  9. * (https://ctan.org/tex-archive/language/hyph-utf8)
  10. * to the binary format used in Hyphenopoly.js
  11. * (https://github.com/mnater/Hyphenopoly)
  12. *
  13. * Usage:
  14. * # node tex2hpb.js lic.txt chars.txt pat.txt [exc.txt | null] outname
  15. *
  16. * This creates a new file called outname.hpb in pwd
  17. *
  18. * All input files must be utf-8 encoded files.
  19. *
  20. * license.txt
  21. * Some licenses require to be included in every distribution of the work.
  22. * If not empty the license.txt file must contain the license of the patterns.
  23. * newline (0x0a).
  24. *
  25. * characters.txt
  26. * When creating the pattern trie, characters of the languages alphabet are
  27. * mapped to internal small integers. The program thus needs to know which
  28. * characters are used in the language. Each line holds a character as used
  29. * in the patterns followed by other representations of the same
  30. * character, if any (e.g. its uppercase form):
  31. * Example:
  32. * aA
  33. * äÄ
  34. * sSſ
  35. * ß
  36. * Character groups are not supported, yet (e.g. german ßSS is invalid).
  37. *
  38. * patterns.txt must be a utf-8 encoded file of two parts:
  39. * 1: A first line of two digits: leftmin followed by rightmin. Typically: 22
  40. * For many patterns you'll find these numbers in the license text or on
  41. * http://www.hyphenation.org/#languages
  42. * If this information is missing, 22 is assumed.
  43. * 2: TeX hyphenation patterns where each pattern is separated by a
  44. * newline (0x0a).
  45. *
  46. * The optional exceptions.txt contains exceptional hyphenations, one word
  47. * per line, with hyphens (-) marking the hyphenation points.
  48. * Example:
  49. * ta-ble
  50. * project
  51. * Internally exceptions are converted to special patterns. The examples above
  52. * will become:
  53. * _10t10a11b10l10e10_
  54. * _10p10r10o10j10e10c10t10_
  55. * So "tables" will not be hyphenated by this pattern.
  56. *
  57. * If there's no exceptions file, use "null" as a placeholder.
  58. *
  59. * outname
  60. * outname (typically the language code) is the filename where patterns will
  61. * be stored. The .hpb ending is added automatically.
  62. */
  63. /*
  64. * Binary format: .hpb (hyphenopoly patterns binary)
  65. * The hyphenopoly patterns binary stores hyphenation patterns
  66. * for one language in a tight format shaped for fast loading
  67. * and execution by Hyphenopoly.js
  68. * Unlike in other hyphenation binaries (like e.g. .hyb files) the
  69. * patterns are not stored as a trie. The trie is (even when packed)
  70. * slightly larger than the raw patterns.
  71. * The trie has to be built by the consumer of the patterns.
  72. * The binary file consists of four parts: HEADER, LICENSE, TRANSLATE and
  73. * PATTERNS. All data is little endian.
  74. *
  75. * HEADER
  76. * Uint32Array of length 8
  77. * [0]: magic number 0x01627068 (\hpb1, 1 is the version)
  78. * [1]: TRANSLATE offset (to skip LICENSE)
  79. * [2]: PATTERNS offset (skip LICENSE + TRANSLATE)
  80. * [3]: patternlength (bytes)
  81. * [4]: leftmin
  82. * [5]: rightmin
  83. * [6]: Trie Array Size (needed to preallocate memory)
  84. * [7]: Values Size (needed to preallocate memory)
  85. *
  86. * LICENSE
  87. * UTF-8 encoded license text, padded to 4 bytes
  88. *
  89. * TRANSLATE
  90. * When creating a trie, the characters of the alphabet have to be
  91. * mapped to uInts from 1 to the length of the alphabet.
  92. * (Since wasm memory is initialized with zeros, 0 is reserved here).
  93. * Also upon creating the trie, the number of characters has to be
  94. * known.
  95. * The mapping from utf16 characters to these internal uInts is stored
  96. * in the TRANSLATE Table which is an Uint16Array of variable length.
  97. * The characters of the patterns are stored on locations with odd
  98. * addresses (starting at 1) while their other cases are stored on even
  99. * addresses. Address 0 denotes the length of the alphabet.
  100. * By using 16bits the characters are restricted to the BMP.
  101. * Characters in the TRANSLATE table are sorted by their Unicode code
  102. * point in increasing order (except substitutions, see below)
  103. * Characters that don't have a upperCase are followed by 0.
  104. * Characters that are a SUBSTITUTION for an other character in the
  105. * alphabet are stored at the end of the list, preceded by their
  106. * substituted character.
  107. * The underline character (_) is reserved to mark the beginning and
  108. * the end of the word (TeX patterns use the dot (.) for this purpose)
  109. * and is always the first character in the TRANSLATE.
  110. * Example:
  111. * For the characters '_rst' and 'ſ' (LATIN SMALL LETTER LONG S) the
  112. * TRANSLATE is '4_\0rRsStTſs' (substitutes don't count for the length)
  113. * The TRANSLATE Table has several impacts on the behaviour of the
  114. * hyphenation algorithm:
  115. * - words containing a character that is not in the translate are
  116. * not hyphenated
  117. * - words don't need to be lowerCase'd before hyphenation
  118. * - substitute characters are not needed to be substituted before
  119. * hyphenation
  120. *
  121. * PATTERNS
  122. * A Uint8Array of variable length.
  123. * Patterns from the input file are mapped by the TRANSLATE Table.
  124. * TeX patterns contain (a) numbers from 1 to 9 to indicate hyphenation points
  125. * and (b) the characters of the alphabet.
  126. * The numbers are directly stored with their value. The word boundary
  127. * marker (_) has always the value 12 (0xC). The other characters are
  128. * stored with values >= 13 (0xD). Thus the maximum alphabet length is
  129. * 255 - 12 = 243 which should be enough for most use cases.
  130. * Example 3:
  131. * Given the TRANSLATE '_\0aAbBcC'
  132. * (0x0400 0x5f00 0x0000 0x6100 0x4100 0x6200 0x4200 0x6300 0x4300)
  133. * the characters for the pattern '1ba' are stored as
  134. * '0x01 0x0e 0x0d' = '01 14 13'
  135. * Individual patterns are not separated. Instead patterns of the same
  136. * length and the same prefix are grouped. 0 (zero) marks the beginning of a new
  137. * group with same length, 255 marks a new prefix group.
  138. * Example 4:
  139. * The patterns '1ba 1be 1abd 1abf 1bba' are grouped as
  140. * follows
  141. * '0 3 255 1 b a e 0 4 255 1 a b d b f 255 1 b b a'
  142. */
  143. "use strict";
  144. const fs = require("fs");
  145. const VERSION = 2;
  146. const licenseFileName = process.argv[2];
  147. const charactersFileName = process.argv[3];
  148. const patternsFileName = process.argv[4];
  149. const exceptionsFileName = process.argv[5];
  150. const saveFileName = process.argv[6];
  151. let leftmin = 2;
  152. let rightmin = 2;
  153. const logger = (function createLogger() {
  154. let msgNr = 1;
  155. /**
  156. * Logs text to console.
  157. * @param {string} text - The text to be logged
  158. * @param {boolean} indent - If true, indents log by 4 spaces
  159. */
  160. function log(text, indent) {
  161. if (indent) {
  162. console.log(` \x1b[34m${text}\x1b[0m`);
  163. } else {
  164. console.log(`\x1b[33m(${msgNr.toString(16)})\x1b[0m: \x1b[34m${text}\x1b[0m`);
  165. msgNr += 1;
  166. }
  167. }
  168. return {
  169. "log": log
  170. };
  171. }());
  172. /**
  173. * Create the magic number (the first 32Bits of the .hpb-file).
  174. * The MagicNumber contains the utf8 code for the letters "hpb" and
  175. * the version as digit (e.g. hpb1 -> 104,112,98,1 -> 68,70,62,01 in hex)
  176. */
  177. function createMagicNumber() {
  178. const mnstring = "hpb";
  179. const mnarray = [];
  180. let i = 0;
  181. while (i < mnstring.length) {
  182. mnarray.push(mnstring.codePointAt(i));
  183. i += 1;
  184. }
  185. mnarray.push(VERSION);
  186. const mnui8 = Uint8Array.from(mnarray);
  187. const mnui32 = new Uint32Array(mnui8.buffer);
  188. return mnui32[0];
  189. }
  190. /**
  191. * Create the header of the hpb file. The header contains 8 32bit values:
  192. * [0]: magic number 0x01627068 (\hpb1, 1 is the version)
  193. * [1]: TRANSLATE offset (to skip LICENSE)
  194. * [2]: PATTERNS offset (skip LICENSE + TRANSLATE)
  195. * [3]: patternlength (bytes)
  196. * [4]: leftmin
  197. * [5]: rightmin
  198. * [6]: Trie Array Size (needed to preallocate memory)
  199. * [7]: Values Size (needed to preallocate memory)
  200. * @param {number} licenseLength - Length of the licence
  201. * @param {number} translateLength - Length of the translate
  202. * @param {number} trieLength - Length of the pattern trie
  203. * @param {number} valueLength - Length of the value list
  204. * @param {number} patternLength - Length of the raw patterns
  205. */
  206. function createHeader(
  207. licenseLength,
  208. translateLength,
  209. trieLength,
  210. valueLength,
  211. patternLength
  212. ) {
  213. const headerui32 = new Uint32Array(8);
  214. const translateByteOffset = headerui32.byteLength + licenseLength;
  215. const patternByteOffset = translateByteOffset + translateLength;
  216. headerui32[0] = createMagicNumber();
  217. headerui32[1] = translateByteOffset;
  218. headerui32[2] = patternByteOffset;
  219. headerui32[3] = patternLength;
  220. headerui32[4] = leftmin;
  221. headerui32[5] = rightmin;
  222. headerui32[6] = trieLength;
  223. headerui32[7] = valueLength;
  224. return headerui32;
  225. }
  226. /**
  227. * Read .lic.txt File
  228. */
  229. function getLicenseFileBuffer() {
  230. logger.log(`read license file: ${licenseFileName} (${fs.statSync(licenseFileName).size} Bytes)`);
  231. const licensefile = fs.readFileSync("./" + licenseFileName);
  232. return licensefile;
  233. }
  234. /**
  235. * Read .chr.txt File
  236. */
  237. function getCharactersFile() {
  238. logger.log(`read characters file: ${charactersFileName} (${fs.statSync(charactersFileName).size} Bytes)`);
  239. let charactersfile = fs.readFileSync("./" + charactersFileName, "utf8");
  240. charactersfile = charactersfile.trim();
  241. return charactersfile;
  242. }
  243. /**
  244. * Read .pat.txt File
  245. */
  246. function getPatternsFile() {
  247. logger.log(`read patterns file: ${patternsFileName} (${fs.statSync(patternsFileName).size} Bytes)`);
  248. let patternsfile = fs.readFileSync("./" + patternsFileName, "utf8");
  249. patternsfile = patternsfile.trim();
  250. // eslint-disable-next-line prefer-named-capture-group
  251. patternsfile = patternsfile.replace(/(\d{2})\n/, function repl(ignore, p1) {
  252. const digits = p1.split("");
  253. leftmin = parseInt(digits[0], 10);
  254. rightmin = parseInt(digits[1], 10);
  255. logger.log(`set leftmin: ${leftmin}, rightmin: ${rightmin}`);
  256. return "";
  257. });
  258. patternsfile = patternsfile.replace(/\./g, "_");
  259. patternsfile = patternsfile.replace(/\n/g, " ");
  260. return patternsfile;
  261. }
  262. /**
  263. * Read .hyp.txt File
  264. */
  265. function getExceptionsFile() {
  266. if (exceptionsFileName && exceptionsFileName !== "null") {
  267. logger.log(`read exceptions file: ${exceptionsFileName} (${fs.statSync(exceptionsFileName).size} Bytes)`);
  268. const exceptionsfile = fs.readFileSync("./" + exceptionsFileName, "utf8");
  269. return exceptionsfile;
  270. }
  271. logger.log("no exceptions");
  272. return null;
  273. }
  274. /**
  275. * Create translateTable
  276. * @param {string} characters - List of chars
  277. */
  278. function createTranslate(characters) {
  279. const lines = characters.split("\n");
  280. // At index 0: alphabet length
  281. const translateTable = [0];
  282. const substitutions = [];
  283. const logalpha = [];
  284. const logsubst = [];
  285. const wordDelim = "_";
  286. translateTable.push(wordDelim.charCodeAt(0));
  287. translateTable.push(0);
  288. translateTable[0] += 1;
  289. lines.forEach(function eachLine(value) {
  290. translateTable[0] += 1;
  291. if (value.length === 2) {
  292. translateTable.push(value.charCodeAt(0));
  293. translateTable.push(value.charCodeAt(1));
  294. logalpha.push(value.charAt(0));
  295. logalpha.push(value.charAt(1));
  296. } else if (value.length === 1) {
  297. translateTable.push(value.charCodeAt(0));
  298. translateTable.push(0);
  299. logalpha.push(value.charAt(0));
  300. logalpha.push("⎵");
  301. } else if (value.length > 2) {
  302. // Substitutions
  303. translateTable.push(value.charCodeAt(0));
  304. translateTable.push(value.charCodeAt(1));
  305. logalpha.push(value.charAt(0));
  306. logalpha.push(value.charAt(1));
  307. let i = 2;
  308. while (i < value.length) {
  309. substitutions.push(value.charCodeAt(i));
  310. substitutions.push(value.charCodeAt(0));
  311. logsubst.push(value.charAt(i));
  312. logsubst.push(value.charAt(0));
  313. i += 1;
  314. }
  315. }
  316. });
  317. logger.log(`collected alphabet of length ${logalpha.length} (${logalpha.length / 2}):`);
  318. logger.log(`${logalpha.join("")}`, true);
  319. logger.log(`collected substitutions: ${logsubst.join("")}`);
  320. const ui16 = Uint16Array.from(translateTable.concat(substitutions));
  321. return ui16;
  322. }
  323. /**
  324. * Create a lookup table from the translate table
  325. * @param {Uint16Array} translate - Translate table
  326. */
  327. function createTranslateLookUpTable(translate) {
  328. // eslint-disable-next-line no-bitwise
  329. const lookuptable = new Uint16Array(2 << 15);
  330. let i = 1;
  331. let k = 12;
  332. while (i < translate.length) {
  333. if (lookuptable[translate[i + 1]] === 0) {
  334. // eslint-disable-next-line security/detect-object-injection
  335. lookuptable[translate[i]] = k;
  336. if (translate[i + 1] !== 0) {
  337. lookuptable[translate[i + 1]] = k;
  338. }
  339. k += 1;
  340. } else {
  341. // Substitute
  342. // eslint-disable-next-line security/detect-object-injection
  343. lookuptable[translate[i]] = lookuptable[translate[i + 1]];
  344. }
  345. i += 2;
  346. }
  347. logger.log(`mapped chars of alphabet to internal numbers in range [12, ${k})`);
  348. return lookuptable;
  349. }
  350. /**
  351. * Create special patterns from exceptions
  352. * @param {string} exceptions - List of exceptions
  353. */
  354. function createExceptionPatterns(exceptions) {
  355. const lines = exceptions.split("\n");
  356. const ret = [];
  357. lines.forEach(function eachLine(value, index) {
  358. if (value !== "") {
  359. // eslint-disable-next-line security/detect-object-injection
  360. ret[index] = "_" + value.split("").map(function mapper(c) {
  361. if (c === "-") {
  362. return "11";
  363. }
  364. return "10" + c;
  365. }).
  366. join("").
  367. replace(/1110/gi, "11") + "10_";
  368. }
  369. });
  370. return ret.join(" ");
  371. }
  372. /**
  373. * Convert TeX-patterns to hpb-patterns
  374. * @param {Uint16Array} translate - The translate table
  375. * @param {string} patterns - The TeX-patterns
  376. * @param {string} exceptionsfile - The content of the .hyp.txt
  377. */
  378. function createPatterns(translate, patterns, exceptionsfile) {
  379. /* eslint-disable security/detect-object-injection */
  380. const lookuptable = createTranslateLookUpTable(translate);
  381. const allExceptions = createExceptionPatterns(exceptionsfile);
  382. if (allExceptions !== "") {
  383. patterns = patterns + " " + allExceptions;
  384. }
  385. const allPatterns = patterns.split(" ");
  386. const exceptions = [];
  387. // eslint-disable-next-line complexity
  388. const translatedPatterns = allPatterns.map(function mapper(pat) {
  389. let i = 0;
  390. let cP1 = 0;
  391. let cP2 = 0;
  392. const ret = [];
  393. let isException = false;
  394. while (i < pat.length) {
  395. cP1 = pat.codePointAt(i);
  396. if (cP1 > 57 || cP1 < 49) {
  397. ret.push(lookuptable[cP1]);
  398. } else {
  399. cP2 = pat.codePointAt(i + 1);
  400. if (cP2 && (cP2 < 57 && cP2 > 47)) {
  401. isException = true;
  402. ret.push((10 * (cP1 - 48)) + (cP2 - 48));
  403. i += 1;
  404. } else {
  405. ret.push(cP1 - 48);
  406. }
  407. }
  408. i += 1;
  409. }
  410. if (isException) {
  411. exceptions.push(pat);
  412. }
  413. return ret;
  414. });
  415. logger.log(`found ${exceptions.length} pattern exceptions`);
  416. const groupedPatterns = {};
  417. let patternLength = 0;
  418. let i = 0;
  419. let longestP = 0;
  420. let shortestP = Number.MAX_SAFE_INTEGER;
  421. while (i < translatedPatterns.length) {
  422. patternLength = translatedPatterns[i].length;
  423. // eslint-disable-next-line no-prototype-builtins
  424. if (groupedPatterns.hasOwnProperty(patternLength)) {
  425. groupedPatterns[patternLength].push(translatedPatterns[i]);
  426. } else {
  427. groupedPatterns[patternLength] = [translatedPatterns[i]];
  428. }
  429. i += 1;
  430. }
  431. const outPatterns = [];
  432. Object.keys(groupedPatterns).forEach(function eachPatternLength(k) {
  433. groupedPatterns[k].sort();
  434. let l = 0;
  435. let j = 0;
  436. const currentLength = parseInt(k, 10);
  437. let currentFirst = 0;
  438. let currentSecond = 0;
  439. longestP = Math.max(longestP, parseInt(k, 10));
  440. shortestP = Math.min(shortestP, parseInt(k, 10));
  441. outPatterns.push(0);
  442. outPatterns.push(currentLength);
  443. while (l < groupedPatterns[k].length) {
  444. j = 2;
  445. if (currentFirst !== groupedPatterns[k][l][0] ||
  446. currentSecond !== groupedPatterns[k][l][1]
  447. ) {
  448. currentFirst = groupedPatterns[k][l][0];
  449. currentSecond = groupedPatterns[k][l][1];
  450. outPatterns.push(255);
  451. outPatterns.push(currentFirst);
  452. outPatterns.push(currentSecond);
  453. }
  454. while (j < groupedPatterns[k][l].length) {
  455. outPatterns.push(groupedPatterns[k][l][j]);
  456. j += 1;
  457. }
  458. l += 1;
  459. }
  460. });
  461. /*
  462. * Object.keys(groupedPatterns).forEach(function eachPatternLength(k) {
  463. * groupedPatterns[k].sort();
  464. * outPatterns.push(58);
  465. * outPatterns.push(parseInt(k, 10));
  466. * outPatterns.push(58);
  467. * let l = 0;
  468. * let j = 0;
  469. * longestP = Math.max(longestP, parseInt(k, 10));
  470. * shortestP = Math.min(shortestP, parseInt(k, 10));
  471. * while (l < groupedPatterns[k].length) {
  472. * j = 0;
  473. * while (j < groupedPatterns[k][l].length) {
  474. * outPatterns.push(groupedPatterns[k][l][j]);
  475. * j += 1;
  476. * }
  477. * l += 1;
  478. * }
  479. * });
  480. */
  481. logger.log(`grouped and sorted patterns: shortest: ${shortestP}, longest: ${longestP}`);
  482. return Uint8Array.from(outPatterns);
  483. // eslint-enable security/detect-object-injection
  484. }
  485. /**
  486. * Create the patterns trie
  487. * @param {Uint8Array} patterns - hpb-patterns
  488. * @param {number} trieRowLength - number of characters
  489. */
  490. function TrieCreator(patterns, trieRowLength) {
  491. let i = 0;
  492. let patternlength = 0;
  493. let count = 0;
  494. let rowStart = 0;
  495. let nextRowStart = 0;
  496. let prevWasDigit = false;
  497. let trieNextEmptyRow = 0;
  498. let rowOffset = 0;
  499. let valueStoreNextStartIndex = 0;
  500. let valueStoreCurrentIdx = 0;
  501. let valueStorePrevIdx = 0;
  502. const patternTrie = [];
  503. const valueStore = [];
  504. /**
  505. * Add 0 to value store
  506. */
  507. function add0ToValueStore() {
  508. valueStore[valueStoreCurrentIdx] = 0;
  509. valueStoreCurrentIdx += 1;
  510. }
  511. /**
  512. * Add a value to value store
  513. * @param {number} p - Value to be added
  514. */
  515. function addToValueStore(p) {
  516. valueStore[valueStoreCurrentIdx] = p;
  517. valueStorePrevIdx = valueStoreCurrentIdx;
  518. valueStoreCurrentIdx += 1;
  519. }
  520. /**
  521. * Get link to value store index
  522. */
  523. function getLinkToValueStore() {
  524. const start = valueStoreNextStartIndex;
  525. // Mark end of pattern:
  526. valueStore[valueStorePrevIdx + 1] = 255;
  527. valueStoreNextStartIndex = valueStorePrevIdx + 2;
  528. valueStoreCurrentIdx = valueStoreNextStartIndex;
  529. return start;
  530. }
  531. /**
  532. * Add a new Row filled with 0
  533. * @param {number} startIndex - From this index
  534. */
  535. function makeRow(startIndex) {
  536. let s = startIndex;
  537. while (s < (trieRowLength + startIndex)) {
  538. patternTrie[s] = 0;
  539. s += 1;
  540. }
  541. return startIndex;
  542. }
  543. /**
  544. * Add a codePoint to the Trie
  545. * @param {number} codePoint - Translated code Point
  546. */
  547. function addToTrie(codePoint) {
  548. if (codePoint > 11) {
  549. // It's a char
  550. if (!prevWasDigit) {
  551. add0ToValueStore();
  552. }
  553. prevWasDigit = false;
  554. if (nextRowStart === -1) {
  555. // Start a new row
  556. trieNextEmptyRow = trieNextEmptyRow + trieRowLength + 1;
  557. nextRowStart = trieNextEmptyRow;
  558. patternTrie[rowStart + rowOffset] = makeRow(nextRowStart);
  559. }
  560. rowOffset = (codePoint - 12) * 2;
  561. rowStart = nextRowStart;
  562. nextRowStart = patternTrie[rowStart + rowOffset];
  563. if (nextRowStart === 0) {
  564. patternTrie[rowStart + rowOffset] = -1;
  565. nextRowStart = -1;
  566. }
  567. } else {
  568. // It's a digit
  569. addToValueStore(codePoint);
  570. prevWasDigit = true;
  571. }
  572. }
  573. /**
  574. * Add last codePoint of a pattern to the Trie
  575. * @param {number} codePoint - Translated code Point
  576. */
  577. function terminateTrie() {
  578. patternTrie[rowStart + rowOffset + 1] = getLinkToValueStore();
  579. }
  580. makeRow(0);
  581. let first = 0;
  582. let second = 0;
  583. while (i < patterns.length) {
  584. if (patterns[i] === 0) {
  585. patternlength = patterns[i + 1];
  586. i += 2;
  587. } else {
  588. if (patterns[i] === 255) {
  589. first = patterns[i + 1];
  590. second = patterns[i + 2];
  591. i += 3;
  592. }
  593. while (count < patternlength) {
  594. switch (count) {
  595. case 0:
  596. addToTrie(first);
  597. count += 1;
  598. break;
  599. case 1:
  600. addToTrie(second);
  601. count += 1;
  602. break;
  603. default:
  604. addToTrie(patterns[i]);
  605. count += 1;
  606. i += 1;
  607. }
  608. }
  609. terminateTrie();
  610. // Reset indizes
  611. count = 0;
  612. rowStart = 0;
  613. nextRowStart = 0;
  614. prevWasDigit = 0;
  615. }
  616. }
  617. logger.log("created Trie.");
  618. logger.log(`trieLength: ${patternTrie.length}`, true);
  619. logger.log(`valueStoreLength: ${valueStore.length}`, true);
  620. return {
  621. "trieLength": patternTrie.length,
  622. "valueStoreLength": valueStore.length
  623. };
  624. }
  625. /**
  626. * The one function to rule them all...
  627. */
  628. function main() {
  629. const start = process.hrtime();
  630. console.log(`\x1b[35mRunning tex2hbp.js (v${VERSION}) on node.js (${process.version})\x1b[0m`);
  631. const licenseBuf = getLicenseFileBuffer();
  632. const paddedLicenseBuf = licenseBuf.byteLength + 4 -
  633. (licenseBuf.byteLength % 4);
  634. const charactersfile = getCharactersFile();
  635. const patternsfile = getPatternsFile();
  636. const exceptionsfile = getExceptionsFile();
  637. const translate = createTranslate(charactersfile);
  638. const patterns = createPatterns(translate, patternsfile, exceptionsfile);
  639. const dummyTrie = new TrieCreator(patterns, translate[0] * 2);
  640. const header = createHeader(
  641. paddedLicenseBuf,
  642. translate.byteLength,
  643. dummyTrie.trieLength,
  644. dummyTrie.valueStoreLength,
  645. patterns.byteLength
  646. );
  647. let fileBufferSize = header.byteLength + paddedLicenseBuf +
  648. translate.byteLength + patterns.byteLength;
  649. const pad = 4 - (fileBufferSize % 4);
  650. fileBufferSize += pad;
  651. const fileBuffer = new ArrayBuffer(fileBufferSize);
  652. const fileBufferui32 = new Uint32Array(fileBuffer);
  653. const fileBufferui16 = new Uint16Array(fileBuffer);
  654. const fileBufferui8 = new Uint8Array(fileBuffer);
  655. fileBufferui32.set(header, 0);
  656. fileBufferui8.set(licenseBuf, header.byteLength);
  657. // eslint-disable-next-line no-bitwise
  658. fileBufferui16.set(translate, (header.byteLength + paddedLicenseBuf) >> 1);
  659. fileBufferui8.set(
  660. patterns,
  661. header.byteLength + paddedLicenseBuf + translate.byteLength
  662. );
  663. fs.writeFile(saveFileName + ".hpb", fileBufferui8, function cb(err) {
  664. if (err) {
  665. console.log(err);
  666. } else {
  667. logger.log(`Finish: file saved to '${saveFileName + ".hpb"}' (${fileBufferSize} Bytes)`);
  668. console.log(`\x1b[35mtook ${process.hrtime(start)} seconds\x1b[0m`);
  669. }
  670. });
  671. }
// Script entry point: run the conversion as soon as the file is executed.
main();