// A framework for simple tokenizers. Takes care of newlines and
// white-space, and of getting the text from the source stream into
// the token object. A state is a function of two arguments -- a
// string stream and a setState function. The second can be used to
// change the tokenizer's state, and can be ignored for stateless
// tokenizers. This function should advance the stream over a token
// and return a string or object containing information about the next
// token, or null to pass and have the (new) state be called to finish
// the token. When a string is given, it is wrapped in a {style, type}
// object. In the resulting object, the characters consumed are stored
// under the content property. Any whitespace following them is also
// automatically consumed, and added to the value property. (Thus,
// content is the actual meaningful part of the token, while value
// contains all the text it spans.)

// Build a tokenizer over `source`, starting in the state function `state`.
//
// source: stream object; this code calls its more(), equals(), next(),
//         applies(), nextWhile() and get() methods.
// state:  function(source, setState) that advances the stream over a
//         token and returns a style string, a token object, or a falsy
//         value to "pass" to whatever state is current afterwards.
//
// Returns an object with:
//   state - the current state function (replaced through setState),
//   take  - finishes a token by filling in its content and value,
//   next  - produces the next token, or throws StopIteration once
//           source.more() is false.
function tokenizer(source, state) {
  // Whitespace test handed to the stream. Newlines are excluded here
  // because they are always emitted as a separate token.
  function isWhiteSpace(ch) {
    // The messy regexp is because IE's regexp matcher is of the
    // opinion that non-breaking spaces are no whitespace.
    return ch !== "\n" && /^[\s\u00a0]*$/.test(ch);
  }

  // Callback given to state functions so they can switch the
  // tokenizer to another state. Created once instead of once per
  // loop iteration.
  function setState(newState) {
    self.state = newState;
  }

  var self = {
    state: state,

    // Finish a token. A string `type` is wrapped as {style, type}; an
    // object is augmented in place and returned. Text obtained from
    // source.get() is appended to any content the state already put on
    // the object, trailing whitespace is skipped unless the content
    // ends in a newline, and value becomes content plus whatever
    // source.get() yields after that skip.
    take: function(type) {
      if (typeof type === "string") {
        type = {style: type, type: type};
      }

      type.content = (type.content || "") + source.get();
      if (!/\n$/.test(type.content)) {
        source.nextWhile(isWhiteSpace);
      }
      type.value = type.content + source.get();
      return type;
    },

    // Produce the next token object, or throw StopIteration when the
    // stream reports that nothing more is available.
    next: function() {
      if (!source.more()) throw StopIteration;

      // A newline is its own "whitespace" token.
      if (source.equals("\n")) {
        source.next();
        return this.take("whitespace");
      }

      var type;
      if (source.applies(isWhiteSpace)) {
        // Leading whitespace: nothing is consumed here, take() skips
        // it, so content stays empty and value holds the whitespace.
        type = "whitespace";
      } else {
        // Keep calling the current state until one yields a truthy
        // result. A state that passes must change the state or advance
        // the stream, otherwise this loop does not terminate.
        while (!type) {
          type = this.state(source, setState);
        }
      }

      return this.take(type);
    }
  };

  return self;
}