src/net/instaweb/js/js_lexer.cc - incubator-pagespeed-mod - Git at Google

 // Copyright 2011 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //
 // Author: jmarantz@google.com
 //
 // Implements a permissive javascript lexer.  This is not designed to reject
 // all illegal javascript programs, but it attempts to accept all legal ones.
 // No attempt is made to decode unicode characters.  Comments and whitepace
 // are outputs of the tokenization process.  The token-stream can be used
 // to easily reconstruct a byte-identical version of the javascript file.
 //
 // There are likely a few inaccuracies -- e.g. incorrect tokenization of
 // multi-character operators.  The regex recognition is heuristic and might
 // be incorrect in some cases.
 //
 // This has only been unit-tested, and has not yet been subjected to "street
 // javascript" by scanning the web.
 //
 // This is based on third_party/libpagespeed/src/pagespeed/js/js_minify.cc by
 // mdsteele@google.com

 #include "net/instaweb/js/public/js_lexer.h"

 #include <cstring>  // for strchr

 #include "base/logging.h"
 #include "net/instaweb/js/public/js_keywords.h"
 #include "net/instaweb/util/public/string_util.h"

 namespace net_instaweb {

 #define CALL_MEMBER_FN(object, ptrToMember) ((object)->*(ptrToMember))

 JsLexer::JsLexer() {
   // Initialize the keywords from HtmlName into a reverse table.  This
   // could have been generated by gperf, but it isn't.  It's easy
   // enough to build it given an iterator.
   //
   // TODO(jmarantz): make a static Init/Terminate routine to avoid doing this
   // on every lexer instantiation.
   keyword_vector_.resize(JsKeywords::num_keywords() + 1);
   DCHECK_EQ(JsKeywords::num_keywords(), JsKeywords::kNotAKeyword);
   for (JsKeywords::Iterator iter; !iter.AtEnd(); iter.Next()) {
     keyword_vector_[iter.keyword()] = iter.name();
   }
   keyword_vector_[JsKeywords::kNotAKeyword] = NULL;
 }

 JsKeywords::Type JsLexer::IdentifierOrKeyword(const StringPiece& name) {
   JsKeywords::Flag flag;
   JsKeywords::Type type = JsKeywords::Lookup(name, &flag);
   if (type == JsKeywords::kNotAKeyword) {
     last_token_may_end_value_ = true;
     return JsKeywords::kIdentifier;
   }
   last_token_may_end_value_ = (flag == JsKeywords::kIsValue);
   return type;
 }

 JsKeywords::Type JsLexer::NumberOrDot(const StringPiece& number_or_dot) {
   if (number_or_dot == ".") {
     last_token_may_end_value_ = false;
     return JsKeywords::kOperator;
   }
 #ifndef NDEBUG
   int num_dots = 0;
   for (int i = 0, n = number_or_dot.size(); i < n; ++i) {
     if (number_or_dot[i] == '.') {
       ++num_dots;
     }
   }
   DCHECK_GE(1, num_dots);
 #endif
   last_token_may_end_value_ = true;
   return JsKeywords::kNumber;
 }

 void JsLexer::Consume(LexicalPredicate predicate,
                       bool include_last_char,
                       bool ok_to_terminate_with_eof,
                       StringPiece* token) {
   DCHECK_LT(index_, static_cast<int>(input_.size()));
   const char* start = input_.data() + index_;
   const char* end = input_.data() + input_.size();
   prev_char_ = *start;
   const char* p = start + 1;
   for (int i = 1; (p < end) && CALL_MEMBER_FN(this, predicate)(*p, i);
        ++p, ++i) {
     prev_char_ = *p;
   }

   int size = p - start;
   if (p == end) {
     error_ |= !ok_to_terminate_with_eof;
   } else if (include_last_char) {
     ++size;
   }
   index_ += size;
   *token = StringPiece(start, size);
 }

 bool JsLexer::IsSpace(uint8 ch, int index) {
   return (strchr(" \t\f", ch) != NULL);
 }

 bool JsLexer::IsLineSeparator(uint8 ch, int index) {
   return (strchr("\n\r", ch) != NULL);
 }

 bool JsLexer::IsNumber(uint8 ch, int index) {
   // TODO(jmarantz): deal with hex/octal?
   //
   // Note that '.' by itself is not a number but its own token.  Thus
   // the callback method called for numbers is NumberOrDot which figures
   // out what to do given the context of the whole token.
   if (ch == '.') {
     if (seen_a_dot_) {
       return false;
     }
     seen_a_dot_ = true;
   }
   return (((ch >= '0') && (ch <= '9')) || (ch == '.'));
 }

 bool JsLexer::InBlockComment(uint8 ch, int index) {
   bool at_end = ((prev_char_ == '*') && (ch == '/'));
   return !at_end;
 }

 bool JsLexer::InSingleLineComment(uint8 ch, int index) {
   bool at_end = (ch == '\n') || (ch == '\r');
   return !at_end;
 }

 bool JsLexer::ProcessBackslash(uint8 ch) {
   if (backslash_mode_) {
     backslash_mode_ = false;
     return true;
   }
   if (ch == '\\') {
     backslash_mode_ = true;
     return true;
   }
   return false;
 }

 // See http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
 // page 17.
 //
 // Note this this algorithm errs on the side of allowing invalid characters into
 // an identifier.
 bool JsLexer::IdentifierStart(uint8 ch) {
   // Note that backslashes can appear in identifiers due to unicode escape
   // sequences (e.g. \u03c0).  We still terminate the identifier using
   // the same rules and make no attempt to decode the escape sequence.
   if (ProcessBackslash(ch)) {
     return true;
   }
   return ((ch >= 'a' && ch <= 'z') ||
           (ch >= 'A' && ch <= 'Z') ||
           (ch == '_') || (ch == '$') || (ch >= 127));
 }

 bool JsLexer::InIdentifier(uint8 ch, int index) {
   return (IdentifierStart(ch) || (ch >= '0' && ch <= '9'));
 }

 bool JsLexer::InOperator(uint8 ch, int index) {
   // TODO(jmarantz): add missing token-types !=, ==, ===, and others
   // listed in the ecmascript doc.
   if ((((token_start_ == '+') || (token_start_ == '-')) &&  // ++ --
        (ch == token_start_)) ||
       ((ch == '=') &&                                       // += -= *= /=
        ((token_start_ == '+') || (token_start_ == '-') ||
         (token_start_ == '/') || (token_start_ == '*')))) {
     // Treat -- and ++ as a single token.
     token_start_ = '\0';  // don't make a triple-plus or triple-minus.
     return true;
   }
   last_token_may_end_value_ = (strchr(")]}", ch) != NULL);
   return false;
 }

 bool JsLexer::InString(uint8 ch, int index) {
   if (ProcessBackslash(ch)) {
     return true;
   }
   return (token_start_ != ch);
 }

 bool JsLexer::InRegex(uint8 ch, int index) {
   if (ProcessBackslash(ch)) {
     return true;
   }
   if (ch == '/') {
     // Slashes within brackets are implicitly escaped.
     if (!within_brackets_) {
       return false;
     }
   } else if (ch == '[') {
     // Regex brackets don't nest, so we don't need a stack -- just a bool.
     within_brackets_ = true;
   } else if (ch == ']') {
     within_brackets_ = false;
   } else if (ch == '\n') {
     error_ = true;
     return false;
   }
   return true;
 }

 void JsLexer::Lex(const StringPiece& input) {
   input_ = input;
   index_ = 0;
   error_ = false;
   last_token_may_end_value_ = false;
   prev_char_ = '\0';
   token_start_ = '\0';
   backslash_mode_ = false;
   within_brackets_ = false;
   token_start_index_ = -1;
   seen_a_dot_ = false;
 }

 JsKeywords::Type JsLexer::NextToken(StringPiece* token) {
   bool end_of_input = (index_ >= static_cast<int>(input_.size()));
   if (end_of_input || error_) {
     return JsKeywords::kEndOfInput;
   }

   uint8 ch = input_[index_];
   token_start_ = ch;
   token_start_index_ = index_;

   // TODO(jmarantz): we can make a jump-table to map characters to an
   // enum to be used in a switch-statement.  If we care about speed
   // this would probably be the first thing to do.
   if (IsSpace(ch, 0)) {
     Consume(&JsLexer::IsSpace, false, true, token);
     return JsKeywords::kWhitespace;
     // last_token_may_end_value_ does not change.
   } else if (IsLineSeparator(ch, 0)) {
     Consume(&JsLexer::IsLineSeparator, false, true, token);
     return JsKeywords::kLineSeparator;
   } else if (IsNumber(ch, 0)) {
     seen_a_dot_ = (ch == '.');
     Consume(&JsLexer::IsNumber, false, true, token);
     seen_a_dot_ = false;
     return NumberOrDot(*token);
   } else if (ch == '/') {
     // Decide whether this is a comment or a regex.
     return ConsumeSlash(token);
   } else if ((ch == '"') || (ch == '\'')) {
     token_start_ = ch;
     Consume(&JsLexer::InString, true, false, token);
     last_token_may_end_value_ = true;
     return JsKeywords::kStringLiteral;
   } else if (IdentifierStart(ch)) {
     Consume(&JsLexer::InIdentifier, false, true, token);
     return IdentifierOrKeyword(*token);
   } else if (input_.substr(index_, 4) == "<!--") {
     Consume(&JsLexer::InSingleLineComment, false, true, token);
     return JsKeywords::kComment;
   }
   // all other punctuation is a token.
   token_start_ = ch;
   Consume(&JsLexer::InOperator, false, true, token);
   return JsKeywords::kOperator;
 }

 JsKeywords::Type JsLexer::ConsumeSlash(StringPiece* token) {
   // A slash could herald a line comment, a block comment, a regex literal,
   // or a mere division operator; we need to figure out which it is.
   // Differentiating between division and regexes is mostly impossible
   // without parsing, so we do our best based on the previous token.
   if (index_ < static_cast<int>(input_.size() - 1)) {
     char next = input_[index_ + 1];
     if (next == '/') {
       Consume(&JsLexer::InSingleLineComment, false, true, token);
       return JsKeywords::kComment;
     } else if (next == '*') {
       Consume(&JsLexer::InBlockComment, true, false, token);
       return JsKeywords::kComment;
     } else if (last_token_may_end_value_) {
       last_token_may_end_value_ = false;
     } else {
       within_brackets_ = false;
       Consume(&JsLexer::InRegex, true, false, token);
       return JsKeywords::kRegex;
     }
   }
   Consume(&JsLexer::InOperator, false, true, token);
   last_token_may_end_value_ = false;
   return JsKeywords::kOperator;
 }

 }  // namespace net_instaweb
	// Copyright 2011 Google Inc. All Rights Reserved.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	//
	// Author: jmarantz@google.com
	//
	// Implements a permissive javascript lexer. This is not designed to reject
	// all illegal javascript programs, but it attempts to accept all legal ones.
	// No attempt is made to decode unicode characters. Comments and whitepace
	// are outputs of the tokenization process. The token-stream can be used
	// to easily reconstruct a byte-identical version of the javascript file.
	//
	// There are likely a few inaccuracies -- e.g. incorrect tokenization of
	// multi-character operators. The regex recognition is heuristic and might
	// be incorrect in some cases.
	//
	// This has only been unit-tested, and has not yet been subjected to "street
	// javascript" by scanning the web.
	//
	// This is based on third_party/libpagespeed/src/pagespeed/js/js_minify.cc by
	// mdsteele@google.com

	#include "net/instaweb/js/public/js_lexer.h"

	#include <cstring> // for strchr

	#include "base/logging.h"
	#include "net/instaweb/js/public/js_keywords.h"
	#include "net/instaweb/util/public/string_util.h"

	namespace net_instaweb {

	#define CALL_MEMBER_FN(object, ptrToMember) ((object)->*(ptrToMember))

	JsLexer::JsLexer() {
	// Initialize the keywords from HtmlName into a reverse table. This
	// could have been generated by gperf, but it isn't. It's easy
	// enough to build it given an iterator.
	//
	// TODO(jmarantz): make a static Init/Terminate routine to avoid doing this
	// on every lexer instantiation.
	keyword_vector_.resize(JsKeywords::num_keywords() + 1);
	DCHECK_EQ(JsKeywords::num_keywords(), JsKeywords::kNotAKeyword);
	for (JsKeywords::Iterator iter; !iter.AtEnd(); iter.Next()) {
	keyword_vector_[iter.keyword()] = iter.name();
	}
	keyword_vector_[JsKeywords::kNotAKeyword] = NULL;
	}

	JsKeywords::Type JsLexer::IdentifierOrKeyword(const StringPiece& name) {
	JsKeywords::Flag flag;
	JsKeywords::Type type = JsKeywords::Lookup(name, &flag);
	if (type == JsKeywords::kNotAKeyword) {
	last_token_may_end_value_ = true;
	return JsKeywords::kIdentifier;
	}
	last_token_may_end_value_ = (flag == JsKeywords::kIsValue);
	return type;
	}

	JsKeywords::Type JsLexer::NumberOrDot(const StringPiece& number_or_dot) {
	if (number_or_dot == ".") {
	last_token_may_end_value_ = false;
	return JsKeywords::kOperator;
	}
	#ifndef NDEBUG
	int num_dots = 0;
	for (int i = 0, n = number_or_dot.size(); i < n; ++i) {
	if (number_or_dot[i] == '.') {
	++num_dots;
	}
	}
	DCHECK_GE(1, num_dots);
	#endif
	last_token_may_end_value_ = true;
	return JsKeywords::kNumber;
	}

	void JsLexer::Consume(LexicalPredicate predicate,
	bool include_last_char,
	bool ok_to_terminate_with_eof,
	StringPiece* token) {
	DCHECK_LT(index_, static_cast<int>(input_.size()));
	const char* start = input_.data() + index_;
	const char* end = input_.data() + input_.size();
	prev_char_ = *start;
	const char* p = start + 1;
	for (int i = 1; (p < end) && CALL_MEMBER_FN(this, predicate)(*p, i);
	++p, ++i) {
	prev_char_ = *p;
	}

	int size = p - start;
	if (p == end) {
	error_ \|= !ok_to_terminate_with_eof;
	} else if (include_last_char) {
	++size;
	}
	index_ += size;
	*token = StringPiece(start, size);
	}

	bool JsLexer::IsSpace(uint8 ch, int index) {
	return (strchr(" \t\f", ch) != NULL);
	}

	bool JsLexer::IsLineSeparator(uint8 ch, int index) {
	return (strchr("\n\r", ch) != NULL);
	}

	bool JsLexer::IsNumber(uint8 ch, int index) {
	// TODO(jmarantz): deal with hex/octal?
	//
	// Note that '.' by itself is not a number but its own token. Thus
	// the callback method called for numbers is NumberOrDot which figures
	// out what to do given the context of the whole token.
	if (ch == '.') {
	if (seen_a_dot_) {
	return false;
	}
	seen_a_dot_ = true;
	}
	return (((ch >= '0') && (ch <= '9')) \|\| (ch == '.'));
	}

	bool JsLexer::InBlockComment(uint8 ch, int index) {
	bool at_end = ((prev_char_ == '*') && (ch == '/'));
	return !at_end;
	}

	bool JsLexer::InSingleLineComment(uint8 ch, int index) {
	bool at_end = (ch == '\n') \|\| (ch == '\r');
	return !at_end;
	}

	bool JsLexer::ProcessBackslash(uint8 ch) {
	if (backslash_mode_) {
	backslash_mode_ = false;
	return true;
	}
	if (ch == '\\') {
	backslash_mode_ = true;
	return true;
	}
	return false;
	}

	// See http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
	// page 17.
	//
	// Note this this algorithm errs on the side of allowing invalid characters into
	// an identifier.
	bool JsLexer::IdentifierStart(uint8 ch) {
	// Note that backslashes can appear in identifiers due to unicode escape
	// sequences (e.g. \u03c0). We still terminate the identifier using
	// the same rules and make no attempt to decode the escape sequence.
	if (ProcessBackslash(ch)) {
	return true;
	}
	return ((ch >= 'a' && ch <= 'z') \|\|
	(ch >= 'A' && ch <= 'Z') \|\|
	(ch == '_') \|\| (ch == '$') \|\| (ch >= 127));
	}

	bool JsLexer::InIdentifier(uint8 ch, int index) {
	return (IdentifierStart(ch) \|\| (ch >= '0' && ch <= '9'));
	}

	bool JsLexer::InOperator(uint8 ch, int index) {
	// TODO(jmarantz): add missing token-types !=, ==, ===, and others
	// listed in the ecmascript doc.
	if ((((token_start_ == '+') \|\| (token_start_ == '-')) && // ++ --
	(ch == token_start_)) \|\|
	((ch == '=') && // += -= *= /=
	((token_start_ == '+') \|\| (token_start_ == '-') \|\|
	(token_start_ == '/') \|\| (token_start_ == '*')))) {
	// Treat -- and ++ as a single token.
	token_start_ = '\0'; // don't make a triple-plus or triple-minus.
	return true;
	}
	last_token_may_end_value_ = (strchr(")]}", ch) != NULL);
	return false;
	}

	bool JsLexer::InString(uint8 ch, int index) {
	if (ProcessBackslash(ch)) {
	return true;
	}
	return (token_start_ != ch);
	}

	bool JsLexer::InRegex(uint8 ch, int index) {
	if (ProcessBackslash(ch)) {
	return true;
	}
	if (ch == '/') {
	// Slashes within brackets are implicitly escaped.
	if (!within_brackets_) {
	return false;
	}
	} else if (ch == '[') {
	// Regex brackets don't nest, so we don't need a stack -- just a bool.
	within_brackets_ = true;
	} else if (ch == ']') {
	within_brackets_ = false;
	} else if (ch == '\n') {
	error_ = true;
	return false;
	}
	return true;
	}

	void JsLexer::Lex(const StringPiece& input) {
	input_ = input;
	index_ = 0;
	error_ = false;
	last_token_may_end_value_ = false;
	prev_char_ = '\0';
	token_start_ = '\0';
	backslash_mode_ = false;
	within_brackets_ = false;
	token_start_index_ = -1;
	seen_a_dot_ = false;
	}

	JsKeywords::Type JsLexer::NextToken(StringPiece* token) {
	bool end_of_input = (index_ >= static_cast<int>(input_.size()));
	if (end_of_input \|\| error_) {
	return JsKeywords::kEndOfInput;
	}

	uint8 ch = input_[index_];
	token_start_ = ch;
	token_start_index_ = index_;

	// TODO(jmarantz): we can make a jump-table to map characters to an
	// enum to be used in a switch-statement. If we care about speed
	// this would probably be the first thing to do.
	if (IsSpace(ch, 0)) {
	Consume(&JsLexer::IsSpace, false, true, token);
	return JsKeywords::kWhitespace;
	// last_token_may_end_value_ does not change.
	} else if (IsLineSeparator(ch, 0)) {
	Consume(&JsLexer::IsLineSeparator, false, true, token);
	return JsKeywords::kLineSeparator;
	} else if (IsNumber(ch, 0)) {
	seen_a_dot_ = (ch == '.');
	Consume(&JsLexer::IsNumber, false, true, token);
	seen_a_dot_ = false;
	return NumberOrDot(*token);
	} else if (ch == '/') {
	// Decide whether this is a comment or a regex.
	return ConsumeSlash(token);
	} else if ((ch == '"') \|\| (ch == '\'')) {
	token_start_ = ch;
	Consume(&JsLexer::InString, true, false, token);
	last_token_may_end_value_ = true;
	return JsKeywords::kStringLiteral;
	} else if (IdentifierStart(ch)) {
	Consume(&JsLexer::InIdentifier, false, true, token);
	return IdentifierOrKeyword(*token);
	} else if (input_.substr(index_, 4) == "<!--") {
	Consume(&JsLexer::InSingleLineComment, false, true, token);
	return JsKeywords::kComment;
	}
	// all other punctuation is a token.
	token_start_ = ch;
	Consume(&JsLexer::InOperator, false, true, token);
	return JsKeywords::kOperator;
	}

	JsKeywords::Type JsLexer::ConsumeSlash(StringPiece* token) {
	// A slash could herald a line comment, a block comment, a regex literal,
	// or a mere division operator; we need to figure out which it is.
	// Differentiating between division and regexes is mostly impossible
	// without parsing, so we do our best based on the previous token.
	if (index_ < static_cast<int>(input_.size() - 1)) {
	char next = input_[index_ + 1];
	if (next == '/') {
	Consume(&JsLexer::InSingleLineComment, false, true, token);
	return JsKeywords::kComment;
	} else if (next == '*') {
	Consume(&JsLexer::InBlockComment, true, false, token);
	return JsKeywords::kComment;
	} else if (last_token_may_end_value_) {
	last_token_may_end_value_ = false;
	} else {
	within_brackets_ = false;
	Consume(&JsLexer::InRegex, true, false, token);
	return JsKeywords::kRegex;
	}
	}
	Consume(&JsLexer::InOperator, false, true, token);
	last_token_may_end_value_ = false;
	return JsKeywords::kOperator;
	}

	} // namespace net_instaweb