// content/editor/CodeMirror-0.8/js/tokenize.js - shindig - Git at Google

 // A framework for simple tokenizers. Takes care of newlines and
 // white-space, and of getting the text from the source stream into
 // the token object. A state is a function of two arguments -- a
 // string stream and a setState function. The second can be used to
 // change the tokenizer's state, and can be ignored for stateless
 // tokenizers. This function should advance the stream over a token
 // and return a string or object containing information about the next
 // token, or null to pass and have the (new) state be called to finish
 // the token. When a string is given, it is wrapped in a {style, type}
 // object. In the resulting object, the characters consumed are stored
 // under the content property. Any whitespace following them is also
 // automatically consumed, and added to the value property. (Thus,
 // content is the actual meaningful part of the token, while value
 // contains all the text it spans.)

 function tokenizer(source, state) {
   // Tells whether a character is skippable whitespace. A newline never
   // qualifies, because newlines are always emitted as tokens of their own.
   function isWhiteSpace(ch) {
     if (ch == "\n") return false;
     // \u00a0 is spelled out because IE's regexp matcher does not
     // count non-breaking spaces as \s.
     return /^[\s\u00a0]*$/.test(ch);
   }

   // Passed to state functions so they can install a different state.
   function setState(newState) {
     tokenizer.state = newState;
   }

   var tokenizer = {
     // The state function that will be asked for the next token.
     state: state,

     // Completes a token. A string is wrapped in a {style, type} object;
     // an object is filled in directly. The text consumed so far goes
     // into content (appended to any content already present), trailing
     // whitespace is swallowed unless the token ends in a newline, and
     // value receives content plus that whitespace.
     take: function(type) {
       var token = typeof(type) == "string" ? {style: type, type: type} : type;

       var content = (token.content || "") + source.get();
       token.content = content;
       if (!/\n$/.test(content))
         source.nextWhile(isWhiteSpace);
       token.value = content + source.get();
       return token;
     },

     // Produces the next token, or throws StopIteration once the source
     // has nothing left.
     next: function () {
       if (!source.more()) throw StopIteration;

       // A newline is consumed here and reported as a whitespace token.
       if (source.equals("\n")) {
         source.next();
         return this.take("whitespace");
       }

       // Leading whitespace is not consumed here; take() picks it up,
       // so it lands in value rather than content.
       if (source.applies(isWhiteSpace))
         return this.take("whitespace");

       // Keep asking the (possibly replaced) state until it yields a
       // token description instead of passing with a falsy value.
       var found;
       do {
         found = this.state(source, setState);
       } while (!found);

       return this.take(found);
     }
   };
   return tokenizer;
 }
	// A framework for simple tokenizers. Takes care of newlines and
	// white-space, and of getting the text from the source stream into
	// the token object. A state is a function of two arguments -- a
	// string stream and a setState function. The second can be used to
	// change the tokenizer's state, and can be ignored for stateless
	// tokenizers. This function should advance the stream over a token
	// and return a string or object containing information about the next
	// token, or null to pass and have the (new) state be called to finish
	// the token. When a string is given, it is wrapped in a {style, type}
	// object. In the resulting object, the characters consumed are stored
	// under the content property. Any whitespace following them is also
	// automatically consumed, and added to the value property. (Thus,
	// content is the actual meaningful part of the token, while value
	// contains all the text it spans.)

	// Builds a tokenizer over `source`, starting in the state function
	// `state`. The returned object exposes `state` (the current state
	// function), `take(type)` and `next()`.
	function tokenizer(source, state) {
	  // Newlines are always a separate token, so they never count as
	  // skippable whitespace.
	  function isWhiteSpace(ch) {
	    // The messy regexp is because IE's regexp matcher is of the
	    // opinion that non-breaking spaces are no whitespace.
	    return ch != "\n" && /^[\s\u00a0]*$/.test(ch);
	  }

	  var tokenizer = {
	    state: state,

	    // Turns `type` (a style string, or a token object) into a finished
	    // token: a string is wrapped as {style, type}; the text consumed
	    // from the source is appended to `content`; whitespace that follows
	    // is consumed as well and included in `value`.
	    take: function(type) {
	      if (typeof(type) == "string") {
	        type = {style: type, type: type};
	      }

	      // A state may have pre-filled content; otherwise start from "".
	      type.content = (type.content || "") + source.get();
	      // Do not skip whitespace past a newline: the next line's leading
	      // whitespace must become its own token.
	      if (!/\n$/.test(type.content)) {
	        source.nextWhile(isWhiteSpace);
	      }
	      type.value = type.content + source.get();
	      return type;
	    },

	    // Returns the next token object; throws StopIteration when the
	    // source reports that nothing more is available.
	    next: function () {
	      if (!source.more()) throw StopIteration;

	      var type;
	      if (source.equals("\n")) {
	        source.next();
	        return this.take("whitespace");
	      }

	      if (source.applies(isWhiteSpace)) {
	        type = "whitespace";
	      } else {
	        // A state may return a falsy value to pass; the (possibly
	        // replaced) state is then called again to finish the token.
	        while (!type) {
	          type = this.state(source, function(s) {tokenizer.state = s;});
	        }
	      }

	      return this.take(type);
	    }
	  };
	  return tokenizer;
	}