tokenizer.js 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. module.exports = function(css) {
  2. var TokenType = require('../token-types');
  // Tokenizer state shared by every helper function in this module.
  // List of tokens produced so far (returned by `getTokens`):
  var tokens = [];
  // True after an `url` identifier has been read and until the next `)`:
  var urlMode = false;
  // Counter raised on `{` and lowered on `}`:
  var blockMode = 0;
  // Current character:
  var c;
  // Next character:
  var cn;
  // Index of the current character in the input string:
  var pos = 0;
  // Sequential number given to the next token:
  var tn = 0;
  // Current line number (1-based):
  var ln = 1;
  // Current column number (1-based):
  var col = 1;
  // Lookup table mapping a punctuation character (or a two-character
  // comparison operator) to its token type. Built from an ordered list of
  // [text, type] pairs so that key order matches the listing below.
  var Punctuation = (function () {
    var entries = [
      [' ', TokenType.Space],
      ['\n', TokenType.Newline],
      ['\r', TokenType.Newline],
      ['\t', TokenType.Tab],
      ['!', TokenType.ExclamationMark],
      ['"', TokenType.QuotationMark],
      ['#', TokenType.NumberSign],
      ['$', TokenType.DollarSign],
      ['%', TokenType.PercentSign],
      ['&', TokenType.Ampersand],
      ['\'', TokenType.Apostrophe],
      ['(', TokenType.LeftParenthesis],
      [')', TokenType.RightParenthesis],
      ['*', TokenType.Asterisk],
      ['+', TokenType.PlusSign],
      [',', TokenType.Comma],
      ['-', TokenType.HyphenMinus],
      ['.', TokenType.FullStop],
      ['/', TokenType.Solidus],
      [':', TokenType.Colon],
      [';', TokenType.Semicolon],
      ['<', TokenType.LessThanSign],
      ['=', TokenType.EqualsSign],
      ['==', TokenType.EqualitySign],
      ['!=', TokenType.InequalitySign],
      ['>', TokenType.GreaterThanSign],
      ['?', TokenType.QuestionMark],
      ['@', TokenType.CommercialAt],
      ['[', TokenType.LeftSquareBracket],
      [']', TokenType.RightSquareBracket],
      ['^', TokenType.CircumflexAccent],
      ['_', TokenType.LowLine],
      ['{', TokenType.LeftCurlyBracket],
      ['|', TokenType.VerticalLine],
      ['}', TokenType.RightCurlyBracket],
      ['~', TokenType.Tilde]
    ];
    var table = {};
    for (var i = 0; i < entries.length; i++) {
      table[entries[i][0]] = entries[i][1];
    }
    return table;
  })();
  /**
   * Append a token to the shared token list. The token records the next
   * sequential token number and the current line number.
   * @param {string} type Token type
   * @param {string} value Token text
   * @param {number} column Column at which the token starts
   */
  function pushToken(type, value, column) {
    var token = {};
    token.tn = tn;
    token.ln = ln;
    token.col = column;
    token.type = type;
    token.value = value;
    // Advance the running token number for the next token:
    tn += 1;
    tokens.push(token);
  }
  /**
   * Check if a value is exactly one decimal digit (`0` through `9`).
   *
   * The empty string and multi-character strings are rejected: a substring
   * search against '0123456789' would accept both `''` and `'12'`.
   * @param {string} c Character
   * @returns {boolean}
   */
  function isDecimalDigit(c) {
    var ch = String(c);
    return ch.length === 1 && ch >= '0' && ch <= '9';
  }
  /**
   * Consume a run of space characters starting at `pos` and emit it as a
   * single Space token. On return `pos` points at the last space consumed.
   * @param {string} css Unparsed part of CSS string
   */
  function parseSpaces(css) {
    var first = pos;
    var end = first;

    // Find the first character that is not a space:
    while (end < css.length && css.charAt(end) === ' ') end++;

    var spaces = css.substring(first, end);
    // Step back onto the last consumed character; the caller's loop
    // advances past it.
    pos = end - 1;

    pushToken(TokenType.Space, spaces, col);
    col += pos - first;
  }
  /**
   * Consume a quoted string starting at `pos` and emit it, quotes included,
   * as a StringDQ or StringSQ token. A backslash skips the character that
   * follows it, so escaped quotes do not end the string.
   * @param {string} css Unparsed part of CSS string
   * @param {string} q Quote (either `'` or `"`)
   */
  function parseString(css, q) {
    var open = pos;
    var ch;

    // Walk forward from the character after the opening quote:
    pos++;
    while (pos < css.length) {
      ch = css.charAt(pos);
      if (ch === '\\') {
        // Skip the backslash and the escaped character:
        pos += 2;
        continue;
      }
      if (ch === q) break;
      pos++;
    }

    var type = q === '"' ? TokenType.StringDQ : TokenType.StringSQ;
    pushToken(type, css.substring(open, pos + 1), col);
    col += pos - open;
  }
  /**
   * Consume a run of decimal digits starting at `pos` and emit it as a
   * DecimalNumber token. On return `pos` points at the last digit consumed.
   * @param {string} css Unparsed part of CSS string
   */
  function parseDecimalNumber(css) {
    var begin = pos;
    var cursor = begin;

    // Find the first character that is not a digit:
    while (cursor < css.length && isDecimalDigit(css.charAt(cursor))) cursor++;

    var digits = css.substring(begin, cursor);
    // Step back onto the last consumed character; the caller's loop
    // advances past it.
    pos = cursor - 1;

    pushToken(TokenType.DecimalNumber, digits, col);
    col += pos - begin;
  }
  /**
   * Consume an identifier starting at `pos` and emit it as an Identifier
   * token. Leading slashes are part of the identifier, a backslash skips
   * the character that follows it, and any punctuation mark ends it.
   * Reading the identifier `url` switches the tokenizer into url mode.
   * @param {string} css Unparsed part of CSS string
   */
  function parseIdentifier(css) {
    var begin = pos;
    var ch;

    // Leading slashes belong to the identifier:
    while (css.charAt(pos) === '/') pos++;

    // Advance to the first unescaped punctuation mark:
    while (pos < css.length) {
      ch = css.charAt(pos);
      if (ch === '\\') {
        // Skip the backslash and the escaped character:
        pos += 2;
        continue;
      }
      if (ch in Punctuation) break;
      pos++;
    }

    var name = css.substring(begin, pos);
    // Step back onto the last consumed character; the caller's loop
    // advances past it.
    pos--;

    // Once in url mode, stay there until the caller resets it:
    urlMode = urlMode || name === 'url';

    pushToken(TokenType.Identifier, name, col);
    col += pos - begin;
  }
  /**
   * Emit an EqualitySign token for `==` and step over its first character
   * (the caller's loop steps over the second).
   * @param {string} css Unparsed part of CSS string (not read here)
   */
  function parseEquality(css) {
    var column = col;
    pos += 1;
    col += 1;
    pushToken(TokenType.EqualitySign, '==', column);
  }
  /**
   * Emit an InequalitySign token for `!=` and step over its first character
   * (the caller's loop steps over the second).
   * @param {string} css Unparsed part of CSS string (not read here)
   */
  function parseInequality(css) {
    var column = col;
    pos += 1;
    col += 1;
    pushToken(TokenType.InequalitySign, '!=', column);
  }
  /**
   * Parse a multiline comment (one that starts with `/*`).
   *
   * The comment is delimited by indentation: it continues over every
   * following line that is indented by more spaces than the comment
   * itself, and ends at the first line break followed by a line that is
   * not (that line break is included in the token) or at the end of input.
   * The token is reported at the line and column where the comment starts;
   * afterwards `ln` and `col` describe the last character consumed.
   * @param {string} css Unparsed part of CSS string
   */
  function parseMLComment(css) {
    var start = pos;
    // Remember the starting column: `col` is recomputed below, but the
    // token must carry the column where the comment begins.
    var startCol = col;
    var scan;

    // Get current indent level (spaces directly before the comment):
    var il = 0;
    for (scan = pos - 1; scan > -1; scan--) {
      // TODO: Can be tabs:
      if (css.charAt(scan) === ' ') il++;
      else break;
    }

    // Skip `/*`, then look at every line break:
    for (pos += 2; pos < css.length; pos++) {
      if (css.charAt(pos) !== '\n') continue;

      // Get new line's indent level:
      var _il = 0;
      for (scan = pos + 1; scan < css.length; scan++) {
        if (css.charAt(scan) === ' ') _il++;
        else break;
      }

      // A line that is not indented deeper ends the comment:
      if (_il <= il) break;

      // Continuation line: jump over its indentation. Stop one short of
      // the first non-space character so the loop increment lands on it;
      // if that character is itself a line break it gets indent-checked
      // too instead of being skipped.
      pos = scan - 1;
    }

    // Add full comment (including `/*`) to the list of tokens:
    var comment = css.substring(start, pos + 1);
    pushToken(TokenType.CommentML, comment, startCol);

    var lines = comment.split('\n');
    if (lines.length > 1) {
      ln += lines.length - 1;
      col = lines[lines.length - 1].length;
    } else {
      col = startCol + (pos - start);
    }
  }
  /**
   * Parse a single line comment (one that starts with `//`).
   *
   * If other content precedes the comment on its line, the comment ends at
   * the next line break. If the comment is the only token on its line, it
   * also continues over every following line that is indented by more
   * spaces than the comment itself. The terminating line break is not part
   * of the token. The token is reported at the line and column where the
   * comment starts; afterwards `ln` and `col` describe the last character
   * consumed.
   * @param {string} css Unparsed part of CSS string
   */
  function parseSLComment(css) {
    var start = pos;
    // Remember the starting column: `col` is recomputed below, but the
    // token must carry the column where the comment begins.
    var startCol = col;
    var scan;

    // Check if comment is the only token on the line, and if so,
    // get current indent level:
    var il = 0;
    var onlyToken = false;
    for (scan = pos - 1; scan > -1; scan--) {
      var before = css.charAt(scan);
      // TODO: Can be tabs:
      if (before === ' ') {
        il++;
      } else {
        onlyToken = before === '\n';
        break;
      }
    }
    // Reached the start of input with nothing but spaces before `//`:
    if (scan === -1) onlyToken = true;

    // Read the string until we meet comment end.
    // Since we already know first 2 characters (`//`), start reading
    // from `pos + 2`:
    if (!onlyToken) {
      for (pos += 2; pos < css.length; pos++) {
        if (css.charAt(pos) === '\n' || css.charAt(pos) === '\r') break;
      }
    } else {
      for (pos += 2; pos < css.length; pos++) {
        if (css.charAt(pos) !== '\n') continue;

        // Get new line's indent level:
        var _il = 0;
        for (scan = pos + 1; scan < css.length; scan++) {
          if (css.charAt(scan) === ' ') _il++;
          else break;
        }

        // A line that is not indented deeper ends the comment:
        if (_il <= il) break;

        // Continuation line: jump over its indentation. Stop one short of
        // the first non-space character so the loop increment lands on it;
        // if that character is itself a line break it gets indent-checked
        // too instead of being skipped.
        pos = scan - 1;
      }
    }

    // Add comment (including `//`, excluding the final line break) to the
    // list of tokens, then step back onto the last consumed character:
    var comment = css.substring(start, pos);
    pos--;
    pushToken(TokenType.CommentSL, comment, startCol);

    // Account for line breaks swallowed by a multi-line comment:
    var lines = comment.split('\n');
    if (lines.length > 1) {
      ln += lines.length - 1;
      col = lines[lines.length - 1].length;
    } else {
      col = startCol + (pos - start);
    }
  }
  /**
   * Convert a CSS string to a list of tokens.
   *
   * Walks the input character by character, dispatching to the matching
   * parse helper. Each helper leaves `pos` on the last character it
   * consumed; the loop increment then moves on to the next one.
   * @param {string} css CSS string
   * @returns {Array} List of tokens
   * @private
   */
  function getTokens(css) {
    // Parse string, character by character:
    for (pos = 0; pos < css.length; col++, pos++) {
      c = css.charAt(pos);
      cn = css.charAt(pos + 1);

      // If we meet `/*`, it's a start of a multiline comment.
      // Parse following characters as a multiline comment:
      if (c === '/' && cn === '*') {
        parseMLComment(css);
      }

      // If we meet `//` and it is not a part of url, it's a start of a
      // single-line comment:
      else if (!urlMode && c === '/' && cn === '/') {
        parseSLComment(css);
      }

      // If current character is a double or single quote, it's a start
      // of a string:
      else if (c === '"' || c === "'") {
        parseString(css, c);
      }

      // If current character is a space:
      else if (c === ' ') {
        parseSpaces(css);
      }

      // If current character is `=`, it must be combined with next `=`
      else if (c === '=' && cn === '=') {
        parseEquality(css);
      }

      // If we meet `!=`, this must be inequality
      else if (c === '!' && cn === '=') {
        parseInequality(css);
      }

      // If current character is a punctuation mark:
      else if (c in Punctuation) {
        // Add it to the list of tokens:
        pushToken(Punctuation[c], c, col);

        // Go to next line. A `\r\n` pair is a single line break: only the
        // `\n` advances the line counter, so CRLF input is not counted
        // as two lines.
        if (c === '\n' || (c === '\r' && cn !== '\n')) {
          ln++;
          col = 0;
        }

        if (c === ')') urlMode = false; // exit url mode
        if (c === '{') blockMode++; // enter a block
        if (c === '}') blockMode--; // exit a block
      }

      // If current character is a decimal digit:
      else if (isDecimalDigit(c)) {
        parseDecimalNumber(css);
      }

      // If current character is anything else:
      else {
        parseIdentifier(css);
      }
    }

    return tokens;
  }
  305. return getTokens(css);
  306. };