// rewrite-pattern.js (4.9 KB)
  // Third-party helpers: `parse` turns a pattern string into a tree,
  // `generate` turns a tree back into pattern source, and `regenerate` builds
  // sets of code points.
  var generate = require('regjsgen').generate;
  var parse = require('regjsparser').parse;
  var regenerate = require('regenerate');
  // Local data: case-folding lookups keyed by code point, and the sets used
  // for character class escapes (grouped as REGULAR / UNICODE /
  // UNICODE_IGNORE_CASE).
  var iuMappings = require('./data/iu-mappings.json');
  var ESCAPE_SETS = require('./data/character-class-escape-sets.js');
  /**
   * Look up the set registered for a character class escape, taking the
   * module-level `unicode` and `ignoreCase` flags into account.
   *
   * The table is chosen as follows:
   *   - `unicode` and `ignoreCase` both set -> ESCAPE_SETS.UNICODE_IGNORE_CASE
   *   - only `unicode` set                  -> ESCAPE_SETS.UNICODE
   *   - otherwise                           -> ESCAPE_SETS.REGULAR
   *
   * @param {string} character Key into the chosen table (callers in this file
   *   pass the `value` of a `characterClassEscape` node).
   * @returns {*} Whatever the chosen table stores under `character`
   *   (`undefined` when the key is absent).
   */
  function getCharacterClassEscapeSet(character) {
    if (unicode) {
      if (ignoreCase) {
        return ESCAPE_SETS.UNICODE_IGNORE_CASE[character];
      }
      return ESCAPE_SETS.UNICODE[character];
    }
    return ESCAPE_SETS.REGULAR[character];
  }
  // Keep a direct reference to `Object.prototype.hasOwnProperty` so the check
  // below still works for objects that shadow or lack that method.
  var object = {};
  var hasOwnProperty = object.hasOwnProperty;

  /**
   * Tell whether `property` is an own (non-inherited) property of `object`.
   *
   * @param {Object} object The object to inspect.
   * @param {string|number} property The property key to look for.
   * @returns {boolean} `true` when `object` itself defines `property`.
   */
  function has(object, property) {
    return hasOwnProperty.call(object, property);
  }
  // Prepare a Regenerate set containing all code points, used for negative
  // character classes (if any).
  var UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);

  // Without the `u` flag, the range stops at 0xFFFF.
  // https://mths.be/es6#sec-pattern-semantics
  var BMP_SET = regenerate().addRange(0x0, 0xFFFF);

  // Prepare a Regenerate set containing all code points that are supposed to be
  // matched by `/./u`. https://mths.be/es6#sec-atom
  var DOT_SET_UNICODE = UNICODE_SET.clone() // all Unicode code points
    .remove(
      // minus `LineTerminator`s (https://mths.be/es6#sec-line-terminators):
      0x000A, // Line Feed <LF>
      0x000D, // Carriage Return <CR>
      0x2028, // Line Separator <LS>
      0x2029 // Paragraph Separator <PS>
    );

  // Prepare a Regenerate set containing all code points that are supposed to be
  // matched by `/./` (only BMP code points).
  var DOT_SET = DOT_SET_UNICODE.clone()
    .intersection(BMP_SET);
  // Add any case-folded code points for the range `min`..`max` (inclusive) to
  // a set. Only the folded counterparts are added here; callers add the range
  // itself separately via `addRange`.
  //
  // Note: the loop body always runs at least once, so `min` is looked up even
  // if `min > max`.
  regenerate.prototype.iuAddRange = function(min, max) {
    var $this = this;
    // Walk a local cursor instead of mutating the `min` argument.
    var codePoint = min;
    do {
      var folded = caseFold(codePoint);
      if (folded) {
        $this.add(folded);
      }
    } while (++codePoint <= max);
    return $this;
  };
  /**
   * Copy every enumerable property of `source` onto `target`, overwriting
   * properties that already exist. `target` is mutated in place and nothing is
   * returned.
   *
   * @param {Object} target Object that receives the properties.
   * @param {Object} source Object whose properties are copied.
   */
  function assign(target, source) {
    for (var key in source) {
      // Note: `hasOwnProperty` is not needed here, so inherited enumerable
      // properties are copied as well.
      target[key] = source[key];
    }
  }
  /**
   * Replace the contents of the tree node `item` with the parse tree of
   * `pattern`. `item` is mutated in place, so references held by its parent
   * stay valid.
   *
   * `pattern` is parsed with no flags. If the resulting root node is not a
   * `characterClass`, `group`, or `value`, it is wrapped in a non-capturing
   * group first.
   *
   * @param {Object} item Tree node to overwrite.
   * @param {string} pattern Pattern source to parse and copy onto `item`.
   */
  function update(item, pattern) {
    // TODO: Test if memoizing `pattern` here is worth the effort.
    var tree = parse(pattern, '');
    switch (tree.type) {
      case 'characterClass':
      case 'group':
      case 'value':
        // No wrapping needed.
        break;
      default:
        // Wrap the pattern in a non-capturing group.
        tree = wrap(tree, pattern);
    }
    assign(item, tree);
  }
  /**
   * Build a non-capturing group node (`(?:...)`) around an already parsed tree.
   *
   * @param {Object} tree Parsed tree that becomes the group's only body entry.
   * @param {string} pattern Source text of `tree`, used to build the `raw`
   *   string.
   * @returns {Object} A new `group` node with `behavior: 'ignore'`.
   */
  function wrap(tree, pattern) {
    // Wrap the pattern in a non-capturing group.
    return {
      'type': 'group',
      'behavior': 'ignore',
      'body': [tree],
      'raw': '(?:' + pattern + ')'
    };
  }
  /**
   * Look up the case-folding entry for a code point in `iuMappings`.
   *
   * @param {number} codePoint Code point to look up.
   * @returns {*} The entry stored for `codePoint`, or `false` when
   *   `iuMappings` has no own entry for it.
   */
  function caseFold(codePoint) {
    return has(iuMappings, codePoint) ? iuMappings[codePoint] : false;
  }
  // Flags of the pattern currently being rewritten. They are module-level
  // state: the exported function sets them on every call, and the helpers in
  // this file read them.
  var ignoreCase = false;
  var unicode = false;
  /**
   * Rewrite a `characterClass` node in place.
   *
   * All members of the class are collected into one set. When both
   * `ignoreCase` and `unicode` are set, the case-folded counterparts of values
   * and ranges are added as well. A negative class is turned into its
   * complement within the full Unicode range (`unicode` set) or the BMP
   * (otherwise). The node is then overwritten with the parsed form of that
   * set.
   *
   * @param {Object} characterClassItem The `characterClass` node; its `body`
   *   entries must be `value`, `characterClassRange`, or
   *   `characterClassEscape` nodes.
   * @returns {Object} The same node, after being updated in place.
   * @throws {Error} If a body entry has an unknown `type`.
   */
  function processCharacterClass(characterClassItem) {
    var set = regenerate();
    // `forEach` returns nothing, so its result is not stored.
    characterClassItem.body.forEach(function(item) {
      switch (item.type) {
        case 'value':
          set.add(item.codePoint);
          if (ignoreCase && unicode) {
            var folded = caseFold(item.codePoint);
            if (folded) {
              set.add(folded);
            }
          }
          break;
        case 'characterClassRange':
          var min = item.min.codePoint;
          var max = item.max.codePoint;
          set.addRange(min, max);
          if (ignoreCase && unicode) {
            set.iuAddRange(min, max);
          }
          break;
        case 'characterClassEscape':
          set.add(getCharacterClassEscapeSet(item.value));
          break;
        // The `default` clause is only here as a safeguard; it should never be
        // reached. Code coverage tools should ignore it.
        /* istanbul ignore next */
        default:
          throw new Error('Unknown term type: ' + item.type);
      }
    });
    if (characterClassItem.negative) {
      // Clone first so the shared module-level sets are never mutated.
      set = (unicode ? UNICODE_SET : BMP_SET).clone().remove(set);
    }
    update(characterClassItem, set.toString());
    return characterClassItem;
  }
  /**
   * Recursively rewrite a tree node (and its descendants) in place.
   *
   * - `dot`, `characterClass`, `characterClassEscape`, and `value` nodes are
   *   replaced by the parsed form of an explicit code point set, chosen
   *   according to the module-level `unicode` / `ignoreCase` flags.
   * - `alternative`, `disjunction`, `group`, and `quantifier` nodes have each
   *   entry of their `body` processed.
   * - `anchor`, `empty`, and `reference` nodes are left untouched.
   *
   * @param {Object} item The tree node to process.
   * @returns {Object} The processed node (the same object that was passed in).
   * @throws {Error} If `item.type` is not one of the types listed above.
   */
  function processTerm(item) {
    switch (item.type) {
      case 'dot':
        update(
          item,
          (unicode ? DOT_SET_UNICODE : DOT_SET).toString()
        );
        break;
      case 'characterClass':
        item = processCharacterClass(item);
        break;
      case 'characterClassEscape':
        update(
          item,
          getCharacterClassEscapeSet(item.value).toString()
        );
        break;
      case 'alternative':
      case 'disjunction':
      case 'group':
      case 'quantifier':
        item.body = item.body.map(processTerm);
        break;
      case 'value':
        var codePoint = item.codePoint;
        var set = regenerate(codePoint);
        if (ignoreCase && unicode) {
          var folded = caseFold(codePoint);
          if (folded) {
            set.add(folded);
          }
        }
        update(item, set.toString());
        break;
      case 'anchor':
      case 'empty':
      case 'reference':
        // Nothing to do here. (`group` is handled by the recursive case
        // above, so it is not repeated in this list.)
        break;
      // The `default` clause is only here as a safeguard; it should never be
      // reached. Code coverage tools should ignore it.
      /* istanbul ignore next */
      default:
        throw new Error('Unknown term type: ' + item.type);
    }
    return item;
  }
  172. module.exports = function(pattern, flags) {
  173. var tree = parse(pattern, flags);
  174. ignoreCase = flags ? flags.indexOf('i') > -1 : false;
  175. unicode = flags ? flags.indexOf('u') > -1 : false;
  176. assign(tree, processTerm(tree));
  177. return generate(tree);
  178. };