blob: 18b5e1e8bfbc65cb34ff1f5e9872805dff0fa873 [file] [log] [blame]
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jmarantz@google.com
//
// Implements a permissive javascript lexer. This is not designed to reject
// all illegal javascript programs, but it attempts to accept all legal ones.
// No attempt is made to decode unicode characters. Comments and whitepace
// are outputs of the tokenization process. The token-stream can be used
// to easily reconstruct a byte-identical version of the javascript file.
//
// There are likely a few inaccuracies -- e.g. incorrect tokenization of
// multi-character operators. The regex recognition is heuristic and might
// be incorrect in some cases.
//
// This has only been unit-tested, and has not yet been subjected to "street
// javascript" by scanning the web.
//
// This is based on third_party/libpagespeed/src/pagespeed/js/js_minify.cc by
// mdsteele@google.com
#include "net/instaweb/js/public/js_lexer.h"
#include <cstring> // for strchr
#include "base/logging.h"
#include "net/instaweb/js/public/js_keywords.h"
#include "net/instaweb/util/public/string_util.h"
namespace net_instaweb {
#define CALL_MEMBER_FN(object, ptrToMember) ((object)->*(ptrToMember))
JsLexer::JsLexer() {
// Initialize the keywords from HtmlName into a reverse table. This
// could have been generated by gperf, but it isn't. It's easy
// enough to build it given an iterator.
//
// TODO(jmarantz): make a static Init/Terminate routine to avoid doing this
// on every lexer instantiation.
keyword_vector_.resize(JsKeywords::num_keywords() + 1);
DCHECK_EQ(JsKeywords::num_keywords(), JsKeywords::kNotAKeyword);
for (JsKeywords::Iterator iter; !iter.AtEnd(); iter.Next()) {
keyword_vector_[iter.keyword()] = iter.name();
}
keyword_vector_[JsKeywords::kNotAKeyword] = NULL;
}
JsKeywords::Type JsLexer::IdentifierOrKeyword(const StringPiece& name) {
JsKeywords::Flag flag;
JsKeywords::Type type = JsKeywords::Lookup(name, &flag);
if (type == JsKeywords::kNotAKeyword) {
last_token_may_end_value_ = true;
return JsKeywords::kIdentifier;
}
last_token_may_end_value_ = (flag == JsKeywords::kIsValue);
return type;
}
JsKeywords::Type JsLexer::NumberOrDot(const StringPiece& number_or_dot) {
if (number_or_dot == ".") {
last_token_may_end_value_ = false;
return JsKeywords::kOperator;
}
#ifndef NDEBUG
int num_dots = 0;
for (int i = 0, n = number_or_dot.size(); i < n; ++i) {
if (number_or_dot[i] == '.') {
++num_dots;
}
}
DCHECK_GE(1, num_dots);
#endif
last_token_may_end_value_ = true;
return JsKeywords::kNumber;
}
void JsLexer::Consume(LexicalPredicate predicate,
bool include_last_char,
bool ok_to_terminate_with_eof,
StringPiece* token) {
DCHECK_LT(index_, static_cast<int>(input_.size()));
const char* start = input_.data() + index_;
const char* end = input_.data() + input_.size();
prev_char_ = *start;
const char* p = start + 1;
for (int i = 1; (p < end) && CALL_MEMBER_FN(this, predicate)(*p, i);
++p, ++i) {
prev_char_ = *p;
}
int size = p - start;
if (p == end) {
error_ |= !ok_to_terminate_with_eof;
} else if (include_last_char) {
++size;
}
index_ += size;
*token = StringPiece(start, size);
}
bool JsLexer::IsSpace(uint8 ch, int index) {
return (strchr(" \t\f", ch) != NULL);
}
bool JsLexer::IsLineSeparator(uint8 ch, int index) {
return (strchr("\n\r", ch) != NULL);
}
bool JsLexer::IsNumber(uint8 ch, int index) {
// TODO(jmarantz): deal with hex/octal?
//
// Note that '.' by itself is not a number but its own token. Thus
// the callback method called for numbers is NumberOrDot which figures
// out what to do given the context of the whole token.
if (ch == '.') {
if (seen_a_dot_) {
return false;
}
seen_a_dot_ = true;
}
return (((ch >= '0') && (ch <= '9')) || (ch == '.'));
}
bool JsLexer::InBlockComment(uint8 ch, int index) {
bool at_end = ((prev_char_ == '*') && (ch == '/'));
return !at_end;
}
bool JsLexer::InSingleLineComment(uint8 ch, int index) {
bool at_end = (ch == '\n') || (ch == '\r');
return !at_end;
}
bool JsLexer::ProcessBackslash(uint8 ch) {
if (backslash_mode_) {
backslash_mode_ = false;
return true;
}
if (ch == '\\') {
backslash_mode_ = true;
return true;
}
return false;
}
// See http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
// page 17.
//
// Note this this algorithm errs on the side of allowing invalid characters into
// an identifier.
bool JsLexer::IdentifierStart(uint8 ch) {
// Note that backslashes can appear in identifiers due to unicode escape
// sequences (e.g. \u03c0). We still terminate the identifier using
// the same rules and make no attempt to decode the escape sequence.
if (ProcessBackslash(ch)) {
return true;
}
return ((ch >= 'a' && ch <= 'z') ||
(ch >= 'A' && ch <= 'Z') ||
(ch == '_') || (ch == '$') || (ch >= 127));
}
bool JsLexer::InIdentifier(uint8 ch, int index) {
return (IdentifierStart(ch) || (ch >= '0' && ch <= '9'));
}
bool JsLexer::InOperator(uint8 ch, int index) {
// TODO(jmarantz): add missing token-types !=, ==, ===, and others
// listed in the ecmascript doc.
if ((((token_start_ == '+') || (token_start_ == '-')) && // ++ --
(ch == token_start_)) ||
((ch == '=') && // += -= *= /=
((token_start_ == '+') || (token_start_ == '-') ||
(token_start_ == '/') || (token_start_ == '*')))) {
// Treat -- and ++ as a single token.
token_start_ = '\0'; // don't make a triple-plus or triple-minus.
return true;
}
last_token_may_end_value_ = (strchr(")]}", ch) != NULL);
return false;
}
bool JsLexer::InString(uint8 ch, int index) {
if (ProcessBackslash(ch)) {
return true;
}
return (token_start_ != ch);
}
bool JsLexer::InRegex(uint8 ch, int index) {
if (ProcessBackslash(ch)) {
return true;
}
if (ch == '/') {
// Slashes within brackets are implicitly escaped.
if (!within_brackets_) {
return false;
}
} else if (ch == '[') {
// Regex brackets don't nest, so we don't need a stack -- just a bool.
within_brackets_ = true;
} else if (ch == ']') {
within_brackets_ = false;
} else if (ch == '\n') {
error_ = true;
return false;
}
return true;
}
void JsLexer::Lex(const StringPiece& input) {
input_ = input;
index_ = 0;
error_ = false;
last_token_may_end_value_ = false;
prev_char_ = '\0';
token_start_ = '\0';
backslash_mode_ = false;
within_brackets_ = false;
token_start_index_ = -1;
seen_a_dot_ = false;
}
JsKeywords::Type JsLexer::NextToken(StringPiece* token) {
bool end_of_input = (index_ >= static_cast<int>(input_.size()));
if (end_of_input || error_) {
return JsKeywords::kEndOfInput;
}
uint8 ch = input_[index_];
token_start_ = ch;
token_start_index_ = index_;
// TODO(jmarantz): we can make a jump-table to map characters to an
// enum to be used in a switch-statement. If we care about speed
// this would probably be the first thing to do.
if (IsSpace(ch, 0)) {
Consume(&JsLexer::IsSpace, false, true, token);
return JsKeywords::kWhitespace;
// last_token_may_end_value_ does not change.
} else if (IsLineSeparator(ch, 0)) {
Consume(&JsLexer::IsLineSeparator, false, true, token);
return JsKeywords::kLineSeparator;
} else if (IsNumber(ch, 0)) {
seen_a_dot_ = (ch == '.');
Consume(&JsLexer::IsNumber, false, true, token);
seen_a_dot_ = false;
return NumberOrDot(*token);
} else if (ch == '/') {
// Decide whether this is a comment or a regex.
return ConsumeSlash(token);
} else if ((ch == '"') || (ch == '\'')) {
token_start_ = ch;
Consume(&JsLexer::InString, true, false, token);
last_token_may_end_value_ = true;
return JsKeywords::kStringLiteral;
} else if (IdentifierStart(ch)) {
Consume(&JsLexer::InIdentifier, false, true, token);
return IdentifierOrKeyword(*token);
} else if (input_.substr(index_, 4) == "<!--") {
Consume(&JsLexer::InSingleLineComment, false, true, token);
return JsKeywords::kComment;
}
// all other punctuation is a token.
token_start_ = ch;
Consume(&JsLexer::InOperator, false, true, token);
return JsKeywords::kOperator;
}
JsKeywords::Type JsLexer::ConsumeSlash(StringPiece* token) {
// A slash could herald a line comment, a block comment, a regex literal,
// or a mere division operator; we need to figure out which it is.
// Differentiating between division and regexes is mostly impossible
// without parsing, so we do our best based on the previous token.
if (index_ < static_cast<int>(input_.size() - 1)) {
char next = input_[index_ + 1];
if (next == '/') {
Consume(&JsLexer::InSingleLineComment, false, true, token);
return JsKeywords::kComment;
} else if (next == '*') {
Consume(&JsLexer::InBlockComment, true, false, token);
return JsKeywords::kComment;
} else if (last_token_may_end_value_) {
last_token_may_end_value_ = false;
} else {
within_brackets_ = false;
Consume(&JsLexer::InRegex, true, false, token);
return JsKeywords::kRegex;
}
}
Consume(&JsLexer::InOperator, false, true, token);
last_token_may_end_value_ = false;
return JsKeywords::kOperator;
}
} // namespace net_instaweb