| /* Tokenizer for JavaScript code */ |
| |
| var tokenizeJavaScript = (function() { |
| // Advance the stream until the given character (not preceded by a |
| // backslash) is encountered, or the end of the line is reached. |
| function nextUntilUnescaped(source, end) { |
| var escaped = false; |
| while (!source.endOfLine()) { |
| var next = source.next(); |
| if (next == end && !escaped) |
| return false; |
| escaped = !escaped && next == "\\"; |
| } |
| return escaped; |
| } |
| |
| // A map of JavaScript's keywords. The a/b/c keyword distinction is |
| // very rough, but it gives the parser enough information to parse |
| // correct code correctly (we don't care that much how we parse |
| // incorrect code). The style information included in these objects |
| // is used by the highlighter to pick the correct CSS style for a |
| // token. |
| var keywords = function(){ |
| function result(type, style){ |
| return {type: type, style: "js-" + style}; |
| } |
| // keywords that take a parenthised expression, and then a |
| // statement (if) |
| var keywordA = result("keyword a", "keyword"); |
| // keywords that take just a statement (else) |
| var keywordB = result("keyword b", "keyword"); |
| // keywords that optionally take an expression, and form a |
| // statement (return) |
| var keywordC = result("keyword c", "keyword"); |
| var operator = result("operator", "keyword"); |
| var atom = result("atom", "atom"); |
| return { |
| "if": keywordA, "while": keywordA, "with": keywordA, |
| "else": keywordB, "do": keywordB, "try": keywordB, "finally": keywordB, |
| "return": keywordC, "break": keywordC, "continue": keywordC, "new": keywordC, "delete": keywordC, "throw": keywordC, |
| "in": operator, "typeof": operator, "instanceof": operator, |
| "var": result("var", "keyword"), "function": result("function", "keyword"), "catch": result("catch", "keyword"), |
| "for": result("for", "keyword"), "switch": result("switch", "keyword"), |
| "case": result("case", "keyword"), "default": result("default", "keyword"), |
| "true": atom, "false": atom, "null": atom, "undefined": atom, "NaN": atom, "Infinity": atom |
| }; |
| }(); |
| |
| // Some helper regexps |
| var isOperatorChar = /[+\-*&%=<>!?|]/; |
| var isHexDigit = /[0-9A-Fa-f]/; |
| var isWordChar = /[\w\$_]/; |
| |
| // Wrapper around jsToken that helps maintain parser state (whether |
| // we are inside of a multi-line comment and whether the next token |
| // could be a regular expression). |
| function jsTokenState(inside, regexp) { |
| return function(source, setState) { |
| var newInside = inside; |
| var type = jsToken(inside, regexp, source, function(c) {newInside = c;}); |
| var newRegexp = type.type == "operator" || type.type == "keyword c" || type.type.match(/^[\[{}\(,;:]$/); |
| if (newRegexp != regexp || newInside != inside) |
| setState(jsTokenState(newInside, newRegexp)); |
| return type; |
| }; |
| } |
| |
| // The token reader, intended to be used by the tokenizer from |
| // tokenize.js (through jsTokenState). Advances the source stream |
| // over a token, and returns an object containing the type and style |
| // of that token. |
| function jsToken(inside, regexp, source, setInside) { |
| function readHexNumber(){ |
| source.next(); // skip the 'x' |
| source.nextWhileMatches(isHexDigit); |
| return {type: "number", style: "js-atom"}; |
| } |
| |
| function readNumber() { |
| source.nextWhileMatches(/[0-9]/); |
| if (source.equals(".")){ |
| source.next(); |
| source.nextWhileMatches(/[0-9]/); |
| } |
| if (source.equals("e") || source.equals("E")){ |
| source.next(); |
| if (source.equals("-")) |
| source.next(); |
| source.nextWhileMatches(/[0-9]/); |
| } |
| return {type: "number", style: "js-atom"}; |
| } |
| // Read a word, look it up in keywords. If not found, it is a |
| // variable, otherwise it is a keyword of the type found. |
| function readWord() { |
| source.nextWhileMatches(isWordChar); |
| var word = source.get(); |
| var known = keywords.hasOwnProperty(word) && keywords.propertyIsEnumerable(word) && keywords[word]; |
| return known ? {type: known.type, style: known.style, content: word} : |
| {type: "variable", style: "js-variable", content: word}; |
| } |
| function readRegexp() { |
| nextUntilUnescaped(source, "/"); |
| source.nextWhileMatches(/[gi]/); |
| return {type: "regexp", style: "js-string"}; |
| } |
| // Mutli-line comments are tricky. We want to return the newlines |
| // embedded in them as regular newline tokens, and then continue |
| // returning a comment token for every line of the comment. So |
| // some state has to be saved (inside) to indicate whether we are |
| // inside a /* */ sequence. |
| function readMultilineComment(start){ |
| var newInside = "/*"; |
| var maybeEnd = (start == "*"); |
| while (true) { |
| if (source.endOfLine()) |
| break; |
| var next = source.next(); |
| if (next == "/" && maybeEnd){ |
| newInside = null; |
| break; |
| } |
| maybeEnd = (next == "*"); |
| } |
| setInside(newInside); |
| return {type: "comment", style: "js-comment"}; |
| } |
| function readOperator() { |
| source.nextWhileMatches(isOperatorChar); |
| return {type: "operator", style: "js-operator"}; |
| } |
| function readString(quote) { |
| var endBackSlash = nextUntilUnescaped(source, quote); |
| setInside(endBackSlash ? quote : null); |
| return {type: "string", style: "js-string"}; |
| } |
| |
| // Fetch the next token. Dispatches on first character in the |
| // stream, or first two characters when the first is a slash. |
| if (inside == "\"" || inside == "'") |
| return readString(inside); |
| var ch = source.next(); |
| if (inside == "/*") |
| return readMultilineComment(ch); |
| else if (ch == "\"" || ch == "'") |
| return readString(ch); |
| // with punctuation, the type of the token is the symbol itself |
| else if (/[\[\]{}\(\),;\:\.]/.test(ch)) |
| return {type: ch, style: "js-punctuation"}; |
| else if (ch == "0" && (source.equals("x") || source.equals("X"))) |
| return readHexNumber(); |
| else if (/[0-9]/.test(ch)) |
| return readNumber(); |
| else if (ch == "/"){ |
| if (source.equals("*")) |
| { source.next(); return readMultilineComment(ch); } |
| else if (source.equals("/")) |
| { nextUntilUnescaped(source, null); return {type: "comment", style: "js-comment"};} |
| else if (regexp) |
| return readRegexp(); |
| else |
| return readOperator(); |
| } |
| else if (isOperatorChar.test(ch)) |
| return readOperator(); |
| else |
| return readWord(); |
| } |
| |
| // The external interface to the tokenizer. |
| return function(source, startState) { |
| return tokenizer(source, startState || jsTokenState(false, true)); |
| }; |
| })(); |