blob: 572d7d64e581cbd552fa608a3dac2fa070a2aef3 [file] [log] [blame]
/* vim: set sw=4 ts=4 et tw=78: */
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is the Narcissus JavaScript engine.
*
* The Initial Developer of the Original Code is
* Brendan Eich <brendan@mozilla.org>.
* Portions created by the Initial Developer are Copyright (C) 2004
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Tom Austin <taustin@ucsc.edu>
* Brendan Eich <brendan@mozilla.org>
* Shu-Yu Guo <shu@rfrn.org>
* Stephan Herhut <stephan.a.herhut@intel.com>
* Dave Herman <dherman@mozilla.com>
* Dimitris Vardoulakis <dimvar@ccs.neu.edu>
* Patrick Walton <pcwalton@mozilla.com>
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/*
* Narcissus - JS implemented in JS.
*
* Lexical scanner.
*/
Narcissus.lexer = (function() {
var definitions = Narcissus.definitions;
// Set constants in the local scope.
eval(definitions.consts);
// Banned keywords by language version
const blackLists = { 160: {}, 185: {}, harmony: {} };
blackLists[160][LET] = true;
blackLists[160][MODULE] = true;
blackLists[160][YIELD] = true;
blackLists[185][MODULE] = true;
// Build up a trie of operator tokens.
var opTokens = {};
for (var op in definitions.opTypeNames) {
if (op === '\n' || op === '.')
continue;
var node = opTokens;
for (var i = 0; i < op.length; i++) {
var ch = op[i];
if (!(ch in node))
node[ch] = {};
node = node[ch];
node.op = op;
}
}
/*
* Since JavaScript provides no convenient way to determine if a
* character is in a particular Unicode category, we use
* metacircularity to accomplish this (oh yeaaaah!)
*/
function isValidIdentifierChar(ch, first) {
// check directly for ASCII
if (ch <= "\u007F") {
if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch === '$' || ch === '_' ||
(!first && (ch >= '0' && ch <= '9'))) {
return true;
}
return false;
}
// create an object to test this in
var x = {};
x["x"+ch] = true;
x[ch] = true;
// then use eval to determine if it's a valid character
var valid = false;
try {
valid = (Function("x", "return (x." + (first?"":"x") + ch + ");")(x) === true);
} catch (ex) {}
return valid;
}
function isIdentifier(str) {
if (typeof str !== "string")
return false;
if (str.length === 0)
return false;
if (!isValidIdentifierChar(str[0], true))
return false;
for (var i = 1; i < str.length; i++) {
if (!isValidIdentifierChar(str[i], false))
return false;
}
return true;
}
/*
* Tokenizer :: (source, filename, line number) -> Tokenizer
*/
function Tokenizer(s, f, l) {
this.cursor = 0;
this.source = String(s);
this.tokens = [];
this.tokenIndex = 0;
this.lookahead = 0;
this.scanNewlines = false;
this.unexpectedEOF = false;
this.filename = f || "";
this.lineno = l || 1;
this.blackList = blackLists[Narcissus.options.version];
this.blockComments = null;
}
Tokenizer.prototype = {
get done() {
// We need to set scanOperand to true here because the first thing
// might be a regexp.
return this.peek(true) === END;
},
get token() {
return this.tokens[this.tokenIndex];
},
match: function (tt, scanOperand, keywordIsName) {
return this.get(scanOperand, keywordIsName) === tt || this.unget();
},
mustMatch: function (tt, keywordIsName) {
if (!this.match(tt, false, keywordIsName)) {
throw this.newSyntaxError("Missing " +
definitions.tokens[tt].toLowerCase());
}
return this.token;
},
peek: function (scanOperand) {
var tt, next;
if (this.lookahead) {
next = this.tokens[(this.tokenIndex + this.lookahead) & 3];
tt = (this.scanNewlines && next.lineno !== this.lineno)
? NEWLINE
: next.type;
} else {
tt = this.get(scanOperand);
this.unget();
}
return tt;
},
peekOnSameLine: function (scanOperand) {
this.scanNewlines = true;
var tt = this.peek(scanOperand);
this.scanNewlines = false;
return tt;
},
lastBlockComment: function() {
var length = this.blockComments.length;
return length ? this.blockComments[length - 1] : null;
},
// Eat comments and whitespace.
skip: function () {
var input = this.source;
this.blockComments = [];
for (;;) {
var ch = input[this.cursor++];
var next = input[this.cursor];
// handle \r, \r\n and (always preferable) \n
if (ch === '\r') {
// if the next character is \n, we don't care about this at all
if (next === '\n') continue;
// otherwise, we want to consider this as a newline
ch = '\n';
}
if (ch === '\n' && !this.scanNewlines) {
this.lineno++;
} else if (ch === '/' && next === '*') {
var commentStart = ++this.cursor;
for (;;) {
ch = input[this.cursor++];
if (ch === undefined)
throw this.newSyntaxError("Unterminated comment");
if (ch === '*') {
next = input[this.cursor];
if (next === '/') {
var commentEnd = this.cursor - 1;
this.cursor++;
break;
}
} else if (ch === '\n') {
this.lineno++;
}
}
this.blockComments.push(input.substring(commentStart, commentEnd));
} else if ((ch === '/' && next === '/') ||
(Narcissus.options.allowHTMLComments && ch === '<' && next === '!' &&
input[this.cursor + 1] === '-' && input[this.cursor + 2] === '-' &&
(this.cursor += 2))) {
this.cursor++;
for (;;) {
ch = input[this.cursor++];
next = input[this.cursor];
if (ch === undefined)
return;
if (ch === '\r') {
// check for \r\n
if (next !== '\n') ch = '\n';
}
if (ch === '\n') {
if (this.scanNewlines) {
this.cursor--;
} else {
this.lineno++;
}
break;
}
}
} else if (!(ch in definitions.whitespace)) {
this.cursor--;
return;
}
}
},
// Lex the exponential part of a number, if present. Return true iff an
// exponential part was found.
lexExponent: function() {
var input = this.source;
var next = input[this.cursor];
if (next === 'e' || next === 'E') {
this.cursor++;
ch = input[this.cursor++];
if (ch === '+' || ch === '-')
ch = input[this.cursor++];
if (ch < '0' || ch > '9')
throw this.newSyntaxError("Missing exponent");
do {
ch = input[this.cursor++];
} while (ch >= '0' && ch <= '9');
this.cursor--;
return true;
}
return false;
},
lexZeroNumber: function (ch) {
var token = this.token, input = this.source;
token.type = NUMBER;
ch = input[this.cursor++];
if (ch === '.') {
do {
ch = input[this.cursor++];
} while (ch >= '0' && ch <= '9');
this.cursor--;
this.lexExponent();
token.value = parseFloat(
input.substring(token.start, this.cursor));
} else if (ch === 'x' || ch === 'X') {
do {
ch = input[this.cursor++];
} while ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') ||
(ch >= 'A' && ch <= 'F'));
this.cursor--;
token.value = parseInt(input.substring(token.start, this.cursor));
} else if (ch >= '0' && ch <= '7') {
do {
ch = input[this.cursor++];
} while (ch >= '0' && ch <= '7');
this.cursor--;
token.value = parseInt(input.substring(token.start, this.cursor));
} else {
this.cursor--;
this.lexExponent(); // 0E1, &c.
token.value = 0;
}
},
lexNumber: function (ch) {
var token = this.token, input = this.source;
token.type = NUMBER;
var floating = false;
do {
ch = input[this.cursor++];
if (ch === '.' && !floating) {
floating = true;
ch = input[this.cursor++];
}
} while (ch >= '0' && ch <= '9');
this.cursor--;
var exponent = this.lexExponent();
floating = floating || exponent;
var str = input.substring(token.start, this.cursor);
token.value = floating ? parseFloat(str) : parseInt(str);
},
lexDot: function (ch) {
var token = this.token, input = this.source;
var next = input[this.cursor];
if (next >= '0' && next <= '9') {
do {
ch = input[this.cursor++];
} while (ch >= '0' && ch <= '9');
this.cursor--;
this.lexExponent();
token.type = NUMBER;
token.value = parseFloat(
input.substring(token.start, this.cursor));
} else {
token.type = DOT;
token.assignOp = null;
token.value = '.';
}
},
lexString: function (ch) {
var token = this.token, input = this.source;
token.type = STRING;
var hasEscapes = false;
var delim = ch;
if (input.length <= this.cursor)
throw this.newSyntaxError("Unterminated string literal");
while ((ch = input[this.cursor++]) !== delim) {
if (this.cursor == input.length)
throw this.newSyntaxError("Unterminated string literal");
if (ch === '\\') {
hasEscapes = true;
if (++this.cursor == input.length)
throw this.newSyntaxError("Unterminated string literal");
}
}
token.value = hasEscapes
? eval(input.substring(token.start, this.cursor))
: input.substring(token.start + 1, this.cursor - 1);
},
lexRegExp: function (ch) {
var token = this.token, input = this.source;
token.type = REGEXP;
do {
ch = input[this.cursor++];
if (ch === '\\') {
this.cursor++;
} else if (ch === '[') {
do {
if (ch === undefined)
throw this.newSyntaxError("Unterminated character class");
if (ch === '\\')
this.cursor++;
ch = input[this.cursor++];
} while (ch !== ']');
} else if (ch === undefined) {
throw this.newSyntaxError("Unterminated regex");
}
} while (ch !== '/');
do {
ch = input[this.cursor++];
} while (ch >= 'a' && ch <= 'z');
this.cursor--;
token.value = eval(input.substring(token.start, this.cursor));
},
lexOp: function (ch) {
var token = this.token, input = this.source;
// A bit ugly, but it seems wasteful to write a trie lookup routine
// for only 3 characters...
var node = opTokens[ch];
var next = input[this.cursor];
if (next in node) {
node = node[next];
this.cursor++;
next = input[this.cursor];
if (next in node) {
node = node[next];
this.cursor++;
next = input[this.cursor];
}
}
var op = node.op;
if (definitions.assignOps[op] && input[this.cursor] === '=') {
this.cursor++;
token.type = ASSIGN;
token.assignOp = definitions.tokenIds[definitions.opTypeNames[op]];
op += '=';
} else {
token.type = definitions.tokenIds[definitions.opTypeNames[op]];
token.assignOp = null;
}
token.value = op;
},
// FIXME: Unicode escape sequences
lexIdent: function (ch, keywordIsName) {
var token = this.token;
var id = ch;
while ((ch = this.getValidIdentifierChar(false)) !== null) {
id += ch;
}
token.type = IDENTIFIER;
token.value = id;
if (keywordIsName)
return;
var kw = definitions.keywords[id];
if (kw && !(kw in this.blackList))
token.type = kw;
},
/*
* Tokenizer.get :: [boolean[, boolean]] -> token type
*
* Consume input *only* if there is no lookahead.
* Dispatch to the appropriate lexing function depending on the input.
*/
get: function (scanOperand, keywordIsName) {
var token;
while (this.lookahead) {
--this.lookahead;
this.tokenIndex = (this.tokenIndex + 1) & 3;
token = this.tokens[this.tokenIndex];
if (token.type !== NEWLINE || this.scanNewlines)
return token.type;
}
this.skip();
this.tokenIndex = (this.tokenIndex + 1) & 3;
token = this.tokens[this.tokenIndex];
if (!token)
this.tokens[this.tokenIndex] = token = {};
var input = this.source;
if (this.cursor >= input.length)
return token.type = END;
token.start = this.cursor;
token.lineno = this.lineno;
var ich = this.getValidIdentifierChar(true);
var ch = (ich === null) ? input[this.cursor++] : null;
if (ich !== null) {
this.lexIdent(ich, keywordIsName);
} else if (scanOperand && ch === '/') {
this.lexRegExp(ch);
} else if (ch in opTokens) {
this.lexOp(ch);
} else if (ch === '.') {
this.lexDot(ch);
} else if (ch >= '1' && ch <= '9') {
this.lexNumber(ch);
} else if (ch === '0') {
this.lexZeroNumber(ch);
} else if (ch === '"' || ch === "'") {
this.lexString(ch);
} else if (this.scanNewlines && (ch === '\n' || ch === '\r')) {
// if this was a \r, look for \r\n
if (ch === '\r' && input[this.cursor] === '\n') this.cursor++;
token.type = NEWLINE;
token.value = '\n';
this.lineno++;
} else {
throw this.newSyntaxError("Illegal token");
}
token.end = this.cursor;
return token.type;
},
/*
* Tokenizer.unget :: void -> undefined
*
* Match depends on unget returning undefined.
*/
unget: function () {
if (++this.lookahead === 4) throw "PANIC: too much lookahead!";
this.tokenIndex = (this.tokenIndex - 1) & 3;
},
newSyntaxError: function (m) {
m = (this.filename ? this.filename + ":" : "") + this.lineno + ": " + m;
var e = new SyntaxError(m, this.filename, this.lineno);
e.source = this.source;
e.cursor = this.lookahead
? this.tokens[(this.tokenIndex + this.lookahead) & 3].start
: this.cursor;
return e;
},
/* Gets a single valid identifier char from the input stream, or null
* if there is none.
*/
getValidIdentifierChar: function(first) {
var input = this.source;
if (this.cursor >= input.length) return null;
var ch = input[this.cursor];
// first check for \u escapes
if (ch === '\\' && input[this.cursor+1] === 'u') {
// get the character value
try {
ch = String.fromCharCode(parseInt(
input.substring(this.cursor + 2, this.cursor + 6),
16));
} catch (ex) {
return null;
}
this.cursor += 5;
}
var valid = isValidIdentifierChar(ch, first);
if (valid) this.cursor++;
return (valid ? ch : null);
},
};
return {
isIdentifier: isIdentifier,
Tokenizer: Tokenizer
};
}());