content/editor/CodeMirror-0.8/js/tokenizejavascript.js - shindig - Git at Google

 /* Tokenizer for JavaScript code */

 var tokenizeJavaScript = (function() {
   // Advance the stream until the given character (not preceded by a
   // backslash) is encountered, or the end of the line is reached.
   function nextUntilUnescaped(source, end) {
     // Consumes characters from `source` up to and including the first
     // `end` character that is not escaped by a backslash, never reading
     // past the end of the current line.
     // Returns false once an unescaped `end` has been consumed. If the
     // line runs out first, returns true only when the last character
     // consumed was an unescaped backslash.
     var afterBackslash = false;
     for (;;) {
       if (source.endOfLine()) {
         return afterBackslash;
       }
       var ch = source.next();
       if (!afterBackslash && ch == end) {
         return false;
       }
       // A backslash escapes exactly one following character, so two
       // backslashes in a row cancel each other out.
       afterBackslash = afterBackslash ? false : ch == "\\";
     }
   }

   // A map of JavaScript's keywords. The a/b/c keyword distinction is
   // very rough, but it gives the parser enough information to parse
   // correct code correctly (we don't care that much how we parse
   // incorrect code). The style information included in these objects
   // is used by the highlighter to pick the correct CSS style for a
   // token.
   var keywords = function(){
     // Builds the lookup table once; the immediate call yields a plain
     // object mapping each reserved word to a {type, style} descriptor.
     function describe(type, style){
       return {type: type, style: "js-" + style};
     }
     var table = {};
     // Point every word in `words` at one shared descriptor object.
     function share(words, descriptor){
       for (var i = 0; i < words.length; i++)
         table[words[i]] = descriptor;
     }

     // keywords that take a parenthesized expression, and then a
     // statement (if)
     share(["if", "while", "with"], describe("keyword a", "keyword"));
     // keywords that take just a statement (else)
     share(["else", "do", "try", "finally"], describe("keyword b", "keyword"));
     // keywords that optionally take an expression, and form a
     // statement (return)
     share(["return", "break", "continue", "new", "delete", "throw"],
           describe("keyword c", "keyword"));
     // word-shaped operators
     share(["in", "typeof", "instanceof"], describe("operator", "keyword"));

     // Words whose token type is the word itself; each one gets its own
     // descriptor object.
     var selfTyped = ["var", "function", "catch", "for", "switch", "case", "default"];
     for (var j = 0; j < selfTyped.length; j++)
       table[selfTyped[j]] = describe(selfTyped[j], "keyword");

     // literal-like names, styled as atoms
     share(["true", "false", "null", "undefined", "NaN", "Infinity"],
           describe("atom", "atom"));
     return table;
   }();

   // Some helper regexps
   // Characters that may start or continue an operator token. "~" and "^"
   // are listed so that bitwise NOT / XOR are read as operators instead of
   // falling through to the word reader. "/" is deliberately absent: the
   // token reader handles it separately (comment, regexp or division).
   var isOperatorChar = /[+\-*&%=<>!?|~^]/;
   // A single hexadecimal digit, used after a "0x" / "0X" prefix.
   var isHexDigit = /[0-9A-Fa-f]/;
   // Identifier characters: \w already covers letters, digits and "_",
   // so "$" is the only addition needed.
   var isWordChar = /[\w$]/;

   // Wrapper around jsToken that helps maintain parser state (whether
   // we are inside of a multi-line comment and whether the next token
   // could be a regular expression).
   function jsTokenState(inside, regexp) {
     // Returns a state function taking (source, setState). Each call reads
     // one token through jsToken and, when the tracked `inside` or
     // `regexp` value changes, hands a successor state to setState.
     return function(source, setState) {
       var newInside = inside;
       var token = jsToken(inside, regexp, source, function(c) {newInside = c;});
       var newRegexp;
       if (token.type == "comment") {
         // Comments are invisible to the grammar: whether a following "/"
         // starts a regexp depends on the token before the comment, so
         // the flag is carried over unchanged.
         newRegexp = regexp;
       } else {
         // A regexp may follow an operator, a "keyword c" word (return,
         // throw, ...) or opening/separating punctuation. Using test()
         // keeps the flag a real boolean, so the comparison below only
         // fires on an actual change.
         newRegexp = token.type == "operator" || token.type == "keyword c" ||
           /^[\[{}\(,;:]$/.test(token.type);
       }
       if (newRegexp != regexp || newInside != inside)
         setState(jsTokenState(newInside, newRegexp));
       return token;
     };
   }

   // The token reader, intended to be used by the tokenizer from
   // tokenize.js (through jsTokenState). Advances the source stream
   // over a token, and returns an object containing the type and style
   // of that token.
   function jsToken(inside, regexp, source, setInside) {
     // Parameters:
     //   inside    - "\"" or "'" while continuing a string whose previous
     //               line ended in a backslash, "/*" while inside a block
     //               comment; any other value means nothing is pending.
     //   regexp    - truthy when a "/" at this position may open a
     //               regular expression literal rather than an operator.
     //   source    - character stream; this function uses its next,
     //               equals, endOfLine, nextWhileMatches and get methods.
     //   setInside - callback that receives the new `inside` value from
     //               the string and block-comment readers.
     // Returns an object with `type` and `style`; word tokens also carry
     // their text in `content`.

     function readHexNumber(){
       source.next(); // skip the 'x'
       source.nextWhileMatches(isHexDigit);
       return {type: "number", style: "js-atom"};
     }

     // Reads the remainder of a decimal literal (the first digit has
     // already been consumed): digits, optional fraction, optional
     // exponent.
     function readNumber() {
       source.nextWhileMatches(/[0-9]/);
       if (source.equals(".")){
         source.next();
         source.nextWhileMatches(/[0-9]/);
       }
       if (source.equals("e") || source.equals("E")){
         source.next();
         // The exponent may carry an explicit sign: "1e-5" or "1e+5".
         if (source.equals("-") || source.equals("+"))
           source.next();
         source.nextWhileMatches(/[0-9]/);
       }
       return {type: "number", style: "js-atom"};
     }
     // Read a word, look it up in keywords. If not found, it is a
     // variable, otherwise it is a keyword of the type found.
     function readWord() {
       source.nextWhileMatches(isWordChar);
       var word = source.get();
       // Own-and-enumerable check so inherited names such as "toString"
       // are not mistaken for keywords.
       var known = keywords.hasOwnProperty(word) && keywords.propertyIsEnumerable(word) && keywords[word];
       return known ? {type: known.type, style: known.style, content: word} :
       {type: "variable", style: "js-variable", content: word};
     }
     // Reads a regexp literal body up to the closing unescaped "/" and
     // then any trailing flag letters.
     function readRegexp() {
       nextUntilUnescaped(source, "/");
       // Accept every standard flag letter, not just "g" and "i", so a
       // literal such as /x/m stays a single token.
       source.nextWhileMatches(/[dgimsuvy]/);
       return {type: "regexp", style: "js-string"};
     }
     // Multi-line comments are tricky. We want to return the newlines
     // embedded in them as regular newline tokens, and then continue
     // returning a comment token for every line of the comment. So
     // some state has to be saved (inside) to indicate whether we are
     // inside a /* */ sequence.
     function readMultilineComment(start){
       var newInside = "/*";
       // `start` is the character consumed just before this call; only a
       // "*" there can pair with a following "/" to close the comment.
       var maybeEnd = (start == "*");
       while (true) {
         if (source.endOfLine())
           break;
         var next = source.next();
         if (next == "/" && maybeEnd){
           newInside = null;
           break;
         }
         maybeEnd = (next == "*");
       }
       setInside(newInside);
       return {type: "comment", style: "js-comment"};
     }
     function readOperator() {
       source.nextWhileMatches(isOperatorChar);
       return {type: "operator", style: "js-operator"};
     }
     // Reads to the closing quote or the end of the line. When the line
     // ends in an unescaped backslash the string continues on the next
     // line, which is recorded through setInside.
     function readString(quote) {
       var endBackSlash = nextUntilUnescaped(source, quote);
       setInside(endBackSlash ? quote : null);
       return {type: "string", style: "js-string"};
     }

     // Fetch the next token. Dispatches on first character in the
     // stream, or first two characters when the first is a slash.
     if (inside == "\"" || inside == "'")
       return readString(inside);
     var ch = source.next();
     if (inside == "/*")
       return readMultilineComment(ch);
     else if (ch == "\"" || ch == "'")
       return readString(ch);
     // with punctuation, the type of the token is the symbol itself
     else if (/[\[\]{}\(\),;\:\.]/.test(ch))
       return {type: ch, style: "js-punctuation"};
     else if (ch == "0" && (source.equals("x") || source.equals("X")))
       return readHexNumber();
     else if (/[0-9]/.test(ch))
       return readNumber();
     else if (ch == "/"){
       if (source.equals("*"))
       { source.next(); return readMultilineComment(ch); }
       else if (source.equals("/"))
       // `null` never equals a character, so this runs to the end of line
       { nextUntilUnescaped(source, null); return {type: "comment", style: "js-comment"};}
       else if (regexp)
         return readRegexp();
       else
         return readOperator();
     }
     else if (isOperatorChar.test(ch))
       return readOperator();
     else
       return readWord();
   }

   // The external interface to the tokenizer.
   return function(source, startState) {
     return tokenizer(source, startState || jsTokenState(false, true));
   };
 })();
	/* Tokenizer for JavaScript code */

	var tokenizeJavaScript = (function() {
		// Everything below is private to this closure; only the function
		// returned at the bottom is exposed as tokenizeJavaScript.

		// Advance the stream until the given character (not preceded by a
		// backslash) is encountered, or the end of the line is reached.
		// Returns false when the character was found, otherwise whether the
		// line ended in an unescaped backslash.
		function nextUntilUnescaped(source, end) {
			var escaped = false;
			while (!source.endOfLine()) {
				var next = source.next();
				if (next == end && !escaped)
					return false;
				escaped = !escaped && next == "\\";
			}
			return escaped;
		}

		// A map of JavaScript's keywords. The a/b/c keyword distinction is
		// very rough, but it gives the parser enough information to parse
		// correct code correctly (we don't care that much how we parse
		// incorrect code). The style information included in these objects
		// is used by the highlighter to pick the correct CSS style for a
		// token.
		var keywords = function(){
			function result(type, style){
				return {type: type, style: "js-" + style};
			}
			// keywords that take a parenthesized expression, and then a
			// statement (if)
			var keywordA = result("keyword a", "keyword");
			// keywords that take just a statement (else)
			var keywordB = result("keyword b", "keyword");
			// keywords that optionally take an expression, and form a
			// statement (return)
			var keywordC = result("keyword c", "keyword");
			var operator = result("operator", "keyword");
			var atom = result("atom", "atom");
			return {
				"if": keywordA, "while": keywordA, "with": keywordA,
				"else": keywordB, "do": keywordB, "try": keywordB, "finally": keywordB,
				"return": keywordC, "break": keywordC, "continue": keywordC, "new": keywordC, "delete": keywordC, "throw": keywordC,
				"in": operator, "typeof": operator, "instanceof": operator,
				"var": result("var", "keyword"), "function": result("function", "keyword"), "catch": result("catch", "keyword"),
				"for": result("for", "keyword"), "switch": result("switch", "keyword"),
				"case": result("case", "keyword"), "default": result("default", "keyword"),
				"true": atom, "false": atom, "null": atom, "undefined": atom, "NaN": atom, "Infinity": atom
			};
		}();

		// Some helper regexps. "~" and "^" are operator characters too, so
		// bitwise NOT / XOR are not swallowed by the word reader. "/" is
		// handled separately by the token reader.
		var isOperatorChar = /[+\-*&%=<>!?|~^]/;
		var isHexDigit = /[0-9A-Fa-f]/;
		// \w already covers "_"; "$" is the only extra identifier character.
		var isWordChar = /[\w$]/;

		// Wrapper around jsToken that helps maintain parser state (whether
		// we are inside of a multi-line comment and whether the next token
		// could be a regular expression).
		function jsTokenState(inside, regexp) {
			return function(source, setState) {
				var newInside = inside;
				var token = jsToken(inside, regexp, source, function(c) {newInside = c;});
				var newRegexp;
				if (token.type == "comment") {
					// Comments are invisible to the grammar, so the flag
					// set by the token before the comment is kept.
					newRegexp = regexp;
				} else {
					// test() keeps the flag a real boolean, so the
					// comparison below only fires on an actual change.
					newRegexp = token.type == "operator" || token.type == "keyword c" ||
						/^[\[{}\(,;:]$/.test(token.type);
				}
				if (newRegexp != regexp || newInside != inside)
					setState(jsTokenState(newInside, newRegexp));
				return token;
			};
		}

		// The token reader, intended to be used by the tokenizer from
		// tokenize.js (through jsTokenState). Advances the source stream
		// over a token, and returns an object containing the type and style
		// of that token.
		function jsToken(inside, regexp, source, setInside) {
			function readHexNumber(){
				source.next(); // skip the 'x'
				source.nextWhileMatches(isHexDigit);
				return {type: "number", style: "js-atom"};
			}

			function readNumber() {
				source.nextWhileMatches(/[0-9]/);
				if (source.equals(".")){
					source.next();
					source.nextWhileMatches(/[0-9]/);
				}
				if (source.equals("e") || source.equals("E")){
					source.next();
					// The exponent may carry an explicit sign: "1e-5" or "1e+5".
					if (source.equals("-") || source.equals("+"))
						source.next();
					source.nextWhileMatches(/[0-9]/);
				}
				return {type: "number", style: "js-atom"};
			}
			// Read a word, look it up in keywords. If not found, it is a
			// variable, otherwise it is a keyword of the type found.
			function readWord() {
				source.nextWhileMatches(isWordChar);
				var word = source.get();
				var known = keywords.hasOwnProperty(word) && keywords.propertyIsEnumerable(word) && keywords[word];
				return known ? {type: known.type, style: known.style, content: word} :
					{type: "variable", style: "js-variable", content: word};
			}
			function readRegexp() {
				nextUntilUnescaped(source, "/");
				// Accept every standard flag letter, not just "g" and "i".
				source.nextWhileMatches(/[dgimsuvy]/);
				return {type: "regexp", style: "js-string"};
			}
			// Multi-line comments are tricky. We want to return the newlines
			// embedded in them as regular newline tokens, and then continue
			// returning a comment token for every line of the comment. So
			// some state has to be saved (inside) to indicate whether we are
			// inside a /* */ sequence.
			function readMultilineComment(start){
				var newInside = "/*";
				// Only a "*" consumed just before can pair with "/" to close.
				var maybeEnd = (start == "*");
				while (true) {
					if (source.endOfLine())
						break;
					var next = source.next();
					if (next == "/" && maybeEnd){
						newInside = null;
						break;
					}
					maybeEnd = (next == "*");
				}
				setInside(newInside);
				return {type: "comment", style: "js-comment"};
			}
			function readOperator() {
				source.nextWhileMatches(isOperatorChar);
				return {type: "operator", style: "js-operator"};
			}
			// A line ending in an unescaped backslash continues the string
			// on the next line; that is recorded through setInside.
			function readString(quote) {
				var endBackSlash = nextUntilUnescaped(source, quote);
				setInside(endBackSlash ? quote : null);
				return {type: "string", style: "js-string"};
			}

			// Fetch the next token. Dispatches on first character in the
			// stream, or first two characters when the first is a slash.
			if (inside == "\"" || inside == "'")
				return readString(inside);
			var ch = source.next();
			if (inside == "/*")
				return readMultilineComment(ch);
			else if (ch == "\"" || ch == "'")
				return readString(ch);
			// with punctuation, the type of the token is the symbol itself
			else if (/[\[\]{}\(\),;\:\.]/.test(ch))
				return {type: ch, style: "js-punctuation"};
			else if (ch == "0" && (source.equals("x") || source.equals("X")))
				return readHexNumber();
			else if (/[0-9]/.test(ch))
				return readNumber();
			else if (ch == "/"){
				if (source.equals("*"))
				{ source.next(); return readMultilineComment(ch); }
				else if (source.equals("/"))
				// `null` never equals a character, so this runs to end of line
				{ nextUntilUnescaped(source, null); return {type: "comment", style: "js-comment"};}
				else if (regexp)
					return readRegexp();
				else
					return readOperator();
			}
			else if (isOperatorChar.test(ch))
				return readOperator();
			else
				return readWord();
		}

		// The external interface to the tokenizer. Without a startState,
		// tokenizing begins with nothing pending and a regexp allowed.
		return function(source, startState) {
			return tokenizer(source, startState || jsTokenState(false, true));
		};
	})();