// tokenizer.js
  1. module.exports = function(css) {
  2. var TokenType = require('../token-types');
  3. var tokens = [],
  4. urlMode = false,
  5. blockMode = 0,
  6. c, // current character
  7. cn, // next character
  8. pos = 0,
  9. tn = 0,
  10. ln = 1,
  11. col = 1;
  12. var Punctuation = {
  13. ' ': TokenType.Space,
  14. '\n': TokenType.Newline,
  15. '\r': TokenType.Newline,
  16. '\t': TokenType.Tab,
  17. '!': TokenType.ExclamationMark,
  18. '"': TokenType.QuotationMark,
  19. '#': TokenType.NumberSign,
  20. '$': TokenType.DollarSign,
  21. '%': TokenType.PercentSign,
  22. '&': TokenType.Ampersand,
  23. '\'': TokenType.Apostrophe,
  24. '(': TokenType.LeftParenthesis,
  25. ')': TokenType.RightParenthesis,
  26. '*': TokenType.Asterisk,
  27. '+': TokenType.PlusSign,
  28. ',': TokenType.Comma,
  29. '-': TokenType.HyphenMinus,
  30. '.': TokenType.FullStop,
  31. '/': TokenType.Solidus,
  32. ':': TokenType.Colon,
  33. ';': TokenType.Semicolon,
  34. '<': TokenType.LessThanSign,
  35. '=': TokenType.EqualsSign,
  36. '>': TokenType.GreaterThanSign,
  37. '?': TokenType.QuestionMark,
  38. '@': TokenType.CommercialAt,
  39. '[': TokenType.LeftSquareBracket,
  40. ']': TokenType.RightSquareBracket,
  41. '^': TokenType.CircumflexAccent,
  42. '_': TokenType.LowLine,
  43. '{': TokenType.LeftCurlyBracket,
  44. '|': TokenType.VerticalLine,
  45. '}': TokenType.RightCurlyBracket,
  46. '~': TokenType.Tilde
  47. };
  48. /**
  49. * Add a token to the token list
  50. * @param {string} type
  51. * @param {string} value
  52. */
  53. function pushToken(type, value, column) {
  54. tokens.push({
  55. tn: tn++,
  56. ln: ln,
  57. col: column,
  58. type: type,
  59. value: value
  60. });
  61. }
  62. /**
  63. * Check if a character is a decimal digit
  64. * @param {string} c Character
  65. * @returns {boolean}
  66. */
  67. function isDecimalDigit(c) {
  68. return '0123456789'.indexOf(c) >= 0;
  69. }
  70. /**
  71. * Parse spaces
  72. * @param {string} css Unparsed part of CSS string
  73. */
  74. function parseSpaces(css) {
  75. var start = pos;
  76. // Read the string until we meet a non-space character:
  77. for (; pos < css.length; pos++) {
  78. if (css.charAt(pos) !== ' ') break;
  79. }
  80. // Add a substring containing only spaces to tokens:
  81. pushToken(TokenType.Space, css.substring(start, pos--), col);
  82. col += (pos - start);
  83. }
  84. /**
  85. * Parse a string within quotes
  86. * @param {string} css Unparsed part of CSS string
  87. * @param {string} q Quote (either `'` or `"`)
  88. */
  89. function parseString(css, q) {
  90. var start = pos;
  91. // Read the string until we meet a matching quote:
  92. for (pos++; pos < css.length; pos++) {
  93. // Skip escaped quotes:
  94. if (css.charAt(pos) === '\\') pos++;
  95. else if (css.charAt(pos) === q) break;
  96. }
  97. // Add the string (including quotes) to tokens:
  98. pushToken(q === '"' ? TokenType.StringDQ : TokenType.StringSQ, css.substring(start, pos + 1), col);
  99. col += (pos - start);
  100. }
  101. /**
  102. * Parse numbers
  103. * @param {string} css Unparsed part of CSS string
  104. */
  105. function parseDecimalNumber(css) {
  106. var start = pos;
  107. // Read the string until we meet a character that's not a digit:
  108. for (; pos < css.length; pos++) {
  109. if (!isDecimalDigit(css.charAt(pos))) break;
  110. }
  111. // Add the number to tokens:
  112. pushToken(TokenType.DecimalNumber, css.substring(start, pos--), col);
  113. col += (pos - start);
  114. }
  115. /**
  116. * Parse identifier
  117. * @param {string} css Unparsed part of CSS string
  118. */
  119. function parseIdentifier(css) {
  120. var start = pos;
  121. // Skip all opening slashes:
  122. while (css.charAt(pos) === '/') pos++;
  123. // Read the string until we meet a punctuation mark:
  124. for (; pos < css.length; pos++) {
  125. // Skip all '\':
  126. if (css.charAt(pos) === '\\') pos++;
  127. else if (css.charAt(pos) in Punctuation) break;
  128. }
  129. var ident = css.substring(start, pos--);
  130. // Enter url mode if parsed substring is `url`:
  131. urlMode = urlMode || ident === 'url';
  132. // Add identifier to tokens:
  133. pushToken(TokenType.Identifier, ident, col);
  134. col += (pos - start);
  135. }
  136. /**
  137. * Parse a multiline comment
  138. * @param {string} css Unparsed part of CSS string
  139. */
  140. function parseMLComment(css) {
  141. var start = pos;
  142. // Read the string until we meet `*/`.
  143. // Since we already know first 2 characters (`/*`), start reading
  144. // from `pos + 2`:
  145. for (pos = pos + 2; pos < css.length; pos++) {
  146. if (css.charAt(pos) === '*' && css.charAt(pos + 1) === '/') {
  147. pos++;
  148. break;
  149. }
  150. }
  151. // Add full comment (including `/*` and `*/`) to the list of tokens:
  152. var comment = css.substring(start, pos + 1);
  153. pushToken(TokenType.CommentML, comment, col);
  154. var newlines = comment.split('\n');
  155. if (newlines.length > 1) {
  156. ln += newlines.length - 1;
  157. col = newlines[newlines.length - 1].length;
  158. } else {
  159. col += (pos - start);
  160. }
  161. }
  162. /**
  163. * Parse a single line comment
  164. * @param {string} css Unparsed part of CSS string
  165. */
  166. function parseSLComment(css) {
  167. var start = pos;
  168. // Read the string until we meet line break.
  169. // Since we already know first 2 characters (`//`), start reading
  170. // from `pos + 2`:
  171. for (pos+=2; pos < css.length; pos++) {
  172. if (css.charAt(pos) === '\n' || css.charAt(pos) === '\r') {
  173. break;
  174. }
  175. }
  176. // Add comment (including `//` and line break) to the list of tokens:
  177. pushToken(TokenType.CommentSL, css.substring(start, pos--), col);
  178. col += pos - start;
  179. }
  180. /**
  181. * Convert a CSS string to a list of tokens
  182. * @param {string} css CSS string
  183. * @returns {Array} List of tokens
  184. * @private
  185. */
  186. function getTokens(css) {
  187. // Parse string, character by character:
  188. for (pos = 0; pos < css.length; col++, pos++) {
  189. c = css.charAt(pos);
  190. cn = css.charAt(pos + 1);
  191. // If we meet `/*`, it's a start of a multiline comment.
  192. // Parse following characters as a multiline comment:
  193. if (c === '/' && cn === '*') {
  194. parseMLComment(css);
  195. }
  196. // If we meet `//` and it is not a part of url:
  197. else if (!urlMode && c === '/' && cn === '/') {
  198. // If we're currently inside a block, treat `//` as a start
  199. // of identifier. Else treat `//` as a start of a single-line
  200. // comment:
  201. parseSLComment(css);
  202. }
  203. // If current character is a double or single quote, it's a start
  204. // of a string:
  205. else if (c === '"' || c === "'") {
  206. parseString(css, c);
  207. }
  208. // If current character is a space:
  209. else if (c === ' ') {
  210. parseSpaces(css);
  211. }
  212. // If current character is a punctuation mark:
  213. else if (c in Punctuation) {
  214. // Add it to the list of tokens:
  215. pushToken(Punctuation[c], c, col);
  216. if (c === '\n' || c === '\r') {
  217. ln++;
  218. col = 0;
  219. } // Go to next line
  220. if (c === ')') urlMode = false; // exit url mode
  221. if (c === '{') blockMode++; // enter a block
  222. if (c === '}') blockMode--; // exit a block
  223. }
  224. // If current character is a decimal digit:
  225. else if (isDecimalDigit(c)) {
  226. parseDecimalNumber(css);
  227. }
  228. // If current character is anything else:
  229. else {
  230. parseIdentifier(css);
  231. }
  232. }
  233. return tokens;
  234. }
  235. return getTokens(css);
  236. };