// src/pagespeed/kernel/js/js_tokenizer.cc - incubator-pagespeed-debian - Git at Google

 // Copyright 2014 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Tokenizing JavaScript is tricky.  Most programming languages can be lexed
 // and parsed separately; for example, in Java, given the code fragment "(x +
 // y) / z", you can divide it up into tokens "(", "x", "+", and so on without
 // keeping track of previous tokens, whether the parens match up, etc., and
 // once tokenized you can parse based on that token stream without remembering
 // any of the whitespace or comments that appeared between the tokens.  In
 // JavaScript, neither of these things is true.  In the above Java example,
 // that slash is a division operator, but in JavaScript it *could* instead be
 // the start of a regex literal if the token before the "(" was e.g. "if";
 // therefore you have to keep track of the parse state.  Moreover, whitespace
 // can sometimes matter in JavaScript due to semicolon insertion, and
 // determining whether a given piece of whitespace matters or not requires not
 // only *previous* parse state, but also the ability to look *ahead* to the
 // next token (something that even other whitespace-significant languages, like
 // Python or Haskell, don't require).  The goal of this class is to correctly
 // tokenize JavaScript code with as little code as possible, by not being a
 // full parser but still keeping track of some minimal parse state.
 //
 // We keep a stack of ParseState values, and in general most tokens will push a
 // new state onto the stack, possibly after popping off other states.
 // Examining the stack helps us to disambiguate the meanings of certain
 // characters (like slashes).  So how many different ParseState values do we
 // need?  The big three questions we have to be able to answer are: (1) Is this
 // slash division or a regex?  (2) Are these braces a code block or an object
 // literal?  (This matters primarily because a slash after a code block is a
 // regex, and a slash after an object literal is division.)  (3) Does this
 // linebreak induce semicolon insertion or not?  The different ParseState
 // values we have exist to answer these questions.
 //
 // - kStartOfInput exists as a convenience.  It is only ever used at the bottom
 //   of the stack, and the bottom of the stack is always kStartOfInput.  It's
 //   just there so that we can always assume the stack is nonempty and thus we
 //   can always read its top value.
 //
 // - kExpression is for expressions.  A slash after this is division.  An open
 //   brace after this is an error.  A linebreak after this may or may not
 //   insert a semicolon, depending on the next token.
 //
 // - kOperator is for prefix and binary operators, including keywords like
 //   "in".  A slash after this is a regex, and braces after this are an object
 //   literal.  (Note that postfix operators don't need a parse state, because a
 //   postfix operator must follow an expression, and an expression followed by
 //   a postfix operator is still just an expression.)
 //
 // - kPeriod is for the "." operator (this parse state is *not* used for
 //   decimal points in numeric literals).  It is similar to other operators,
 //   but a reserved word just after a period is an identifier.  For example,
 //   even though "if" is normally a reserved word, "foo.if" is legal code, and
 //   is equivalent to "foo['if']".
 //
 // - kQuestionMark is for the "?" character.  It behaves just like other
 //   operators, but we must track it separately in order to determine whether a
 //   given ":" character is for a label or a ternary operator.  This matters
 //   because "foo:{}" is a label and code block, while "a?foo:{}" is a ternary
 //   operator and object literal.
 //
 // - kOpenBrace, kOpenBracket, and kOpenParen are for opening delimiters.  When
 //   we encounter a closing delimiter, we pop back to the matching open
 //   delimiter and then modify the stack from there depending on what was just
 //   created (e.g. an expression, or a block header, or something else).
 //
 // - kBlockKeyword is for keywords like "if" and "for" that are followed by
 //   parentheses.  We track these so we know whether a pair of parens forms an
 //   expression like "(a+b)" (after which a slash is division) or a block
 //   header like "if(a>b)" (after which a slash is a regex).
 //
 // - kBlockHeader is a completed block header, like "if(a>b)".  Certain other
 //   keywords like "do" and "else" are block headers on their own.
 //
 // - Lastly, we're left with eight keywords that don't fit into any of the
 //   above categories.  We group these into three parse states:
 //
 //     - kReturnThrow for "return" and "throw".  They're sort of like prefix
 //       operators in that a slash after these is a regex, but a linebreak
 //       after these *always* inserts a semicolon.
 //
 //     - kJumpKeyword for "break", "continue", and "debugger".  A slash after
 //       these is an error, and a linebreak after these *always* inserts a
 //       semicolon.
 //
 //     - kOtherKeyword for "const", "default", and "var".  A slash after these
 //       is an error too, but a linebreak after these *never* inserts a
 //       semicolon.
 //
 // To help make the above more concrete, suppose we're parsing the code:
 //
 //   if ([]) {
 //     foo: while(true) break;
 //   } else /x/.test('y');
 //
 // The progression of the parse stack would look like this:
 //
 //   if     -> BkKwd               "if" is a block keyword, so it needs (...).
 //   (      -> BkKwd (
 //   [      -> BkKwd ( [
 //   ]      -> BkKwd ( Expr        [] is an expression (array literal).
 //   )      -> BkHdr               Now "if (...)" is a complete block header.
 //   {      -> BkHdr {
 //   foo    -> BkHdr { Expr        An identifier is usually an expression...
 //   :      -> BkHdr {             ...nevermind, a label.  Roll back statement.
 //   while  -> BkHdr { BkKwd       "while" is a block keyword, just like "if".
 //   (true) -> BkHdr { BkHdr       Three more tokens gives us the block header.
 //   break  -> BkHdr { BkHdr Jump  "break" is special, slashes can't follow it.
 //   ;      -> BkHdr {             Semicolon, roll back to start-of-statement.
 //   }      ->                     Block finished.
 //   else   -> BkHdr               "else" is a block header by itself.
 //   /x/    -> BkHdr Expr          A slash after BkHdr is a regex.
 //   .      -> BkHdr Expr Oper     A period is essentially a binary operator.
 //   test   -> BkHdr Expr          "Expr Oper Expr" collapses to "Expr"
 //   (      -> BkHdr Expr (
 //   'y'    -> BkHdr Expr ( Expr
 //   )      -> BkHdr Expr          Method call collapses into a single Expr.
 //   ;      ->                     Semicolon, roll back to start-of-statement.
 //
 // In general, this class is focused on tokenizing, not actual parsing or
 // detecting syntax errors, so there are many kinds of syntax errors that we
 // don't detect and will simply ignore (such as "break 42;", which can be
 // reasonably split into tokens even if it doesn't actually parse).  But we
 // *must* abort whenever the parse state becomes too mangled for us to make
 // meaningful decisions about what slashes mean.  For example, in the code
 // "[a}/x/i", are those slashes a regex literal or division?  The question has
 // no answer.  They'd be division if the code were "[a]/x/i", and a regex if
 // the code were "{a}/x/i", but faced with "[a}", we have little choice but to
 // abort.
 //
 // More information about semicolon insertion can be found here:
 //   http://inimino.org/~inimino/blog/javascript_semicolons

 #include "pagespeed/kernel/js/js_tokenizer.h"

 #include <stddef.h>
 #include <vector>

 #include "base/logging.h"
 #include "pagespeed/kernel/base/string.h"
 #include "pagespeed/kernel/base/string_util.h"
 #include "pagespeed/kernel/js/js_keywords.h"
 #include "pagespeed/kernel/util/re2.h"

 namespace pagespeed {

 namespace js {

 namespace {

 // Regex to match JavaScript identifiers.  For details, see page 18 of
 // http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
 //
 // The pattern is "one identifier-start element" followed by "zero or more
 // identifier-part elements"; each element is either a single character from
 // the listed class or a \uXXXX escape.
 //
 // NOTE: the identifier-part character class below is repeated verbatim in the
 // flags portion of kRegexLiteralRegex, and in negated form in
 // kLineContinuationRegex.  Keep all three in sync when editing.
 const char* const kIdentifierRegex =
     // An identifier must begin with a $, _, unicode letter (more specifically,
     // a character in the Lu, Ll, Lt, Lm, Lo, or Nl category), or unicode
     // escape.
     "([$_\\p{Lu}\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}\\p{Nl}]|\\\\u[0-9A-Fa-f]{4})"
     // After that, an identifier may have zero or more characters that are one
     // of the above, a combining mark (Mn or Mc), a digit (Nd), a connector
     // punctuation (Pc) or one of the characters ZERO WIDTH NON-JOINER (U+200C)
     // or ZERO WIDTH JOINER (U+200D).  The two zero-width characters are spelled
     // out as raw UTF-8 byte sequences (E2 80 8C and E2 80 8D respectively).
     "([$_\\p{Lu}\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}\\p{Nl}\\p{Mn}\\p{Mc}\\p{Nd}"
     "\\p{Pc}\xE2\x80\x8C\xE2\x80\x8D]|\\\\u[0-9A-Fa-f]{4})*";

 // Regex to match JavaScript line comments.  This regex contains exactly one
 // capturing group, which will match the linebreak (or end-of-input) that
 // terminated the line comment.
 //
 // Three openers are accepted: "//", "<!--", and "-->".  (The pattern itself
 // does not care where "-->" appears; TryConsumeComment only treats "-->" as a
 // comment opener when it is at the start of a line.)  After the opener, the
 // non-greedy \C*? consumes arbitrary bytes up to the first linebreak
 // character (\r, \n, or anything in the Zl/Zp categories) or, failing that,
 // the end of the input (\z).
 const char* const kLineCommentRegex =
     "(?://|<!--|-->)\\C*?([\r\n\\p{Zl}\\p{Zp}]|\\z)";

 // Regex to match JavaScript numeric literals.  This must be compiled in POSIX
 // mode, so that the |'s are leftmost-longest rather than leftmost-first.
 //
 // Overall shape:  hex | octal | (decimal-forms)(exponent)?
 const char* const kNumericLiteralPosixRegex =
     // A number can be a hexadecimal literal, or...
     "0[xX][0-9a-fA-F]+|"
     // ...it can be an octal literal, or...
     "0[0-7]+|"
     // ...it can be a decimal literal.  To qualify as a decimal literal, it
     // must 1) start with a nonzero digit, or 2) start with zero but contain
     // a non-octal digit (8 or 9) in there somewhere, or 3) be a single zero
     // digit.  (The outermost group opened here is closed after the
     // starts-with-decimal-point alternative below.)
     "(([1-9][0-9]*|0([0-9]*[89][0-9]*)?)"
     // A decimal literal may optionally be followed by a decimal point and
     // fractional part:
     "(\\.[0-9]*)?"
     // Alternatively, instead of all that, a decimal literal may instead
     // start with a decimal point (instead of starting with a digit).  In that
     // form at least one fractional digit is required.
     "|\\.[0-9]+)"
     // Finally, any of the above kinds of decimal literal may optionally be
     // followed by an exponent.
     "([eE][+-]?[0-9]+)?";

 // Regex to match most JavaScript operators (some operators, such as comma,
 // period, question mark, and colon are special-cased elsewhere).
 //
 // Note that NextToken dispatches on the first input character before falling
 // back to ConsumeOperator: input starting with ':', ',', '.', '?', ';' or '/'
 // is routed to a dedicated Consume* method first.
 const char* const kOperatorRegex =
     // && || ++ -- ~
     "&&|\\|\\||\\+\\+|--|~|"
     // * *= / /= % %= ^ ^= & &= | |= + += - -=
     "[*/%^&|+-]=?|"
     // ! != !== = == ===
     "[!=]={0,2}|"
     // < <= << <<=
     "<{1,2}=?|"
     // > >= >> >>= >>> >>>=
     ">{1,3}=?";

 // Regex to match JavaScript regex literals.  For details, see page 25 of
 // http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
 //
 // Shape:  "/" (plain-char | character-class | escape)+ "/" flags*
 // The body uses + rather than *, so at least one element is required between
 // the slashes; the two-character sequence "//" therefore never matches here
 // as an empty regex literal.
 const char* const kRegexLiteralRegex =
     // Regex literals can contain characters that aren't slashes, backslashes,
     // open brackets, or linebreaks.
     "/([^/\\\\\\[\r\n\\p{Zl}\\p{Zp}]|"
     // They can also contain character classes, which are enclosed in square
     // brackets.  Within the brackets, close brackets and backslashes must be
     // escaped.  Linebreaks are *never* permitted -- not even if escaped.
     // (An unescaped slash is allowed inside the brackets, since only "]",
     // backslash, and linebreaks are excluded from the class body.)
     "\\[([^\\]\\\\\r\n\\p{Zl}\\p{Zp}]|"
     "\\\\[^\r\n\\p{Zl}\\p{Zp}])*\\]|"
     // Finally, they can contain escape sequences.  Again, linebreaks are
     // forbidden and cannot be escaped.
     "\\\\[^\r\n\\p{Zl}\\p{Zp}])+/"
     // Regex literals may optionally be followed by zero or more flags, which
     // can consist of any characters allowed within identifiers (even \uXXXX
     // escapes!); see kIdentifierRegex for details.  (Very few of these
     // characters are actually semantically valid regex flags, but they're all
     // lexically valid.)  This group is a copy of the identifier-part portion
     // of kIdentifierRegex and must be kept in sync with it.
     "([$_\\p{Lu}\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}\\p{Nl}\\p{Mn}\\p{Mc}\\p{Nd}"
     "\\p{Pc}\xE2\x80\x8C\xE2\x80\x8D]|\\\\u[0-9A-Fa-f]{4})*";

 // Regex to match JavaScript string literals.  For details, see page 22 of
 // http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
 // This regex will still match when given a string literal containing an
 // unescaped linebreak, but the match will terminate after the linebreak; the
 // caller must then check whether the start and end characters of the match are
 // the same (both single quote or both double quote), and reject it if not.
 const char* const kStringLiteralRegex =
     // Single-quoted string literals can contain any characters that aren't
     // single quotes, backslashes, or linebreaks.  They can also contain escape
     // sequences, which is a backslash followed either by a linebreak or by any
     // one character.  But note that the sequence \r\n counts as *one*
     // linebreak for this purpose, as does \n\r.  Finally, we use RE2's \C
     // escape for matching arbitrary bytes, along with very careful use of
     // greedy and non-greedy operators, to allow the string literal to contain
     // invalid UTF-8 characters, in case we're given e.g. Latin1-encoded input.
     // This is subtle and fragile, but fortunately we have unit tests that will
     // break if we ever get this wrong.
     //
     // This would be easier if there were a way to say "match an invalid UTF8
     // byte only", but apparently there is no way to do this in RE2.
     // See https://groups.google.com/forum/#!topic/re2-dev/26wVIHcowh4
     //
     // The final character class deliberately lists the linebreak characters
     // alongside the closing quote: that is what lets the match stop at an
     // unescaped linebreak, as described in the comment above this constant.
     "'(\\C*?(\\\\(\r\n|\n\r|\n|.))?)*?['\n\r\\p{Zl}\\p{Zp}]|"
     // A string literal can also be double-quoted instead, which is the same,
     // except that double quotes must be escaped instead of single quotes.
     "\"(\\C*?(\\\\(\r\n|\n\r|\n|.))?)*?[\"\n\r\\p{Zl}\\p{Zp}]";

 // Regex to match JavaScript whitespace.  For details, see page 15 of
 // http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf
 // This regex contains exactly one capturing group; iff it captures anything,
 // then the whitespace contains at least one linebreak.
 //
 // The outer group is non-capturing ("(?:"), which is what keeps the linebreak
 // group the only capturing group in the pattern.  The trailing + means a
 // single match covers a whole run of mixed horizontal and vertical
 // whitespace.
 const char* const kWhitespaceRegex =
     // Line separators include \n, \r, and characters in the "Line Separator"
     // (Zl) and "Paragraph Separator" (Zp) Unicode categories.
     "(?:([\n\r\\p{Zl}\\p{Zp}])|"
     // Horizontal whitespace includes space, \f, \t, \v, BYTE ORDER MARK
     // (U+FEFF), and characters in the "Space Separator" (Zs) Unicode category.
     // The byte order mark is written as its UTF-8 byte sequence (EF BB BF).
     "[ \f\t\v\xEF\xBB\xBF\\p{Zs}])+";

 // Regex to check if the next token in the remaining input could continue the
 // current statement, assuming the current statement currently ends with an
 // expression.  (Note that this regex will not necessarily capture the entire
 // next token; the only useful information to be had from it is whether it
 // matches at all or not).
 const char* const kLineContinuationRegex =
     // Any operator (even a multicharacter operator) starting with one of the
     // following characters can continue the current expression.
     // NOTE(review): '[' is not in this list, although in JavaScript an
     // expression followed by a linebreak and then "[" is also continued
     // (indexing).  Confirm whether that case is handled by the caller.
     "[=(*/%^&|<>?:,.]|"
     // A != can continue immediately after an expression, but not a !.
     "!=|"
     // A + or - can continue after an expression, but not a ++ or -- (because
     // JavaScript's grammar specifically forbids linebreaks between the two
     // tokens in "i++" or in "i--").
     "\\+($|[^+])|-($|[^-])|"
     // Finally, the in or instanceof operators can continue, though we have to
     // be sure we're not just looking at an identifier that starts with "in",
     // so make sure the "in" or "instanceof" is not followed by an identifier
     // character (see kIdentifierRegex for details).  The negated class below
     // mirrors the identifier-part class of kIdentifierRegex, plus a backslash
     // (presumably because a backslash could begin a \uXXXX escape that
     // continues the identifier).
     "(in|instanceof)($|[^$_\\p{Lu}\\p{Ll}\\p{Lt}\\p{Lm}\\p{Lo}\\p{Nl}\\p{Mn}"
     "\\p{Mc}\\p{Nd}\\p{Pc}\xE2\x80\x8C\xE2\x80\x8D\\\\])";

 }  // namespace

 // Creates a tokenizer that will scan |input| using the regex patterns
 // reachable through |patterns|.  The |patterns| pointer is stored as given.
 JsTokenizer::JsTokenizer(const JsTokenizerPatterns* patterns,
                          StringPiece input)
     : patterns_(patterns),
       input_(input),
       json_step_(kJsonStart),
       start_of_line_(true),
       error_(false) {
   // Seed the parse stack with its permanent bottom entry, so that the rest of
   // the tokenizer can always look at parse_stack_.back() without first
   // checking for emptiness.
   parse_stack_.push_back(kStartOfInput);
 }

 // The destructor body is empty: nothing is released explicitly here (in
 // particular, the patterns_ pointer is not deleted by this destructor).
 JsTokenizer::~JsTokenizer() {}

 // Produces the next token: its text is stored in |*token_out| and its type is
 // returned.  Once the input is exhausted this returns kEndOfInput with an
 // empty token; once an error has been recorded, every later call yields an
 // error token.
 JsKeywords::Type JsTokenizer::NextToken(StringPiece* token_out) {
   // Tokens that were already scanned as lookahead are handed out first, in
   // the order in which they were queued.
   if (!lookahead_queue_.empty()) {
     const JsKeywords::Type queued_type = lookahead_queue_.front().first;
     *token_out = lookahead_queue_.front().second;
     lookahead_queue_.pop_front();
     return queued_type;
   }
   // After an error has been seen, keep reporting an error token.
   if (error_) {
     return Error(token_out);
   }
   // Clean end of input: there is nothing left to scan.
   if (input_.empty()) {
     parse_stack_.clear();
     token_out->clear();
     return JsKeywords::kEndOfInput;
   }
   // Invariant: while input remains, the parse stack is nonempty and its
   // bottom entry is kStartOfInput, so callees may freely read the top entry.
   DCHECK(!parse_stack_.empty());
   DCHECK_EQ(kStartOfInput, parse_stack_[0]);
   // Dispatch on the first byte of the remaining input.
   const char first_char = input_[0];
   switch (first_char) {
     // ASCII whitespace is the common case and is handled directly here;
     // Unicode whitespace is picked up by the default branch further down.
     case ' ':
     case '\f':
     case '\n':
     case '\r':
     case '\t':
     case '\v': {
       JsKeywords::Type whitespace_type;
       if (TryConsumeWhitespace(true, &whitespace_type, token_out)) {
         return whitespace_type;
       }
       LOG(DFATAL) << "TryConsumeWhitespace failed on ASCII whitespace: "
                   << static_cast<int>(first_char);
       return Error(token_out);
     }
     // Delimiters.
     case '{': return ConsumeOpenBrace(token_out);
     case '}': return ConsumeCloseBrace(token_out);
     case '[': return ConsumeOpenBracket(token_out);
     case ']': return ConsumeCloseBracket(token_out);
     case '(': return ConsumeOpenParen(token_out);
     case ')': return ConsumeCloseParen(token_out);
     // Punctuation whose meaning depends on the parse state.
     case ':': return ConsumeColon(token_out);
     case ',': return ConsumeComma(token_out);
     case '.': return ConsumePeriod(token_out);
     case '?': return ConsumeQuestionMark(token_out);
     case ';': return ConsumeSemicolon(token_out);
     case '/': return ConsumeSlash(token_out);
     // String literals.
     case '\'':
     case '"':
       return ConsumeString(token_out);
     // Numeric literals (decimal, hex, or octal) begin either with a digit or
     // with a period.  The digit case lands here; the period case is checked
     // for by ConsumePeriod above.
     case '0': case '1': case '2': case '3': case '4':
     case '5': case '6': case '7': case '8': case '9':
       return ConsumeNumber(token_out);
     default: {
       // Try, in order: identifier/keyword, comment, (Unicode) whitespace.
       JsKeywords::Type type;
       if (TryConsumeIdentifierOrKeyword(&type, token_out)) {
         return type;
       }
       if (TryConsumeComment(&type, token_out)) {
         return type;
       }
       if (TryConsumeWhitespace(true, &type, token_out)) {
         return type;
       }
       // Last resort: treat it as an operator.  ConsumeOperator returns an
       // error token if it isn't one.
       return ConsumeOperator(token_out);
     }
   }
 }

 // Renders the current parse stack as a string of short state labels separated
 // by single spaces, listing the bottom of the stack first.
 GoogleString JsTokenizer::ParseStackForTest() const {
   GoogleString rendered;
   const size_t depth = parse_stack_.size();
   for (size_t i = 0; i < depth; ++i) {
     // Separate entries with a space (but emit no leading space).
     if (!rendered.empty()) {
       rendered.push_back(' ');
     }
     // Map the state to its label; anything unrecognized is reported and
     // rendered as "UNKNOWN".
     const char* label = "UNKNOWN";
     switch (parse_stack_[i]) {
       case kStartOfInput: label = "Start"; break;
       case kExpression:   label = "Expr";  break;
       case kOperator:     label = "Oper";  break;
       case kPeriod:       label = ".";     break;
       case kQuestionMark: label = "?";     break;
       case kOpenBrace:    label = "{";     break;
       case kOpenBracket:  label = "[";     break;
       case kOpenParen:    label = "(";     break;
       case kBlockKeyword: label = "BkKwd"; break;
       case kBlockHeader:  label = "BkHdr"; break;
       case kReturnThrow:  label = "RetTh"; break;
       case kJumpKeyword:  label = "Jump";  break;
       case kOtherKeyword: label = "Other"; break;
       default:
         LOG(DFATAL) << "Unknown parse state: " << parse_stack_[i];
         break;
     }
     rendered.append(label);
   }
   return rendered;
 }

 // Handles a '{' at the front of the input: rejects it if the current parse
 // state cannot be followed by an open brace, otherwise records kOpenBrace on
 // the parse stack and emits the one-character operator token.
 JsKeywords::Type JsTokenizer::ConsumeOpenBrace(StringPiece* token_out) {
   DCHECK(!input_.empty());
   DCHECK_EQ('{', input_[0]);
   switch (parse_stack_.back()) {
     // An open brace directly after any of these states is an error.
     case kExpression:
     case kPeriod:
     case kBlockKeyword:
     case kJumpKeyword:
     case kOtherKeyword:
       return Error(token_out);
     default:
       break;
   }
   parse_stack_.push_back(kOpenBrace);
   return Emit(JsKeywords::kOperator, 1, token_out);
 }

 // Handles a '}' at the front of the input: unwinds the parse stack to the
 // matching kOpenBrace, decides whether the braces formed a block or an object
 // literal, and emits the one-character operator token.  Returns an error
 // token if a mismatched delimiter (or similar) is found while unwinding.
 JsKeywords::Type JsTokenizer::ConsumeCloseBrace(StringPiece* token_out) {
   DCHECK(!input_.empty());
   DCHECK_EQ('}', input_[0]);
   // Unwind the stack up to and including the most recent kOpenBrace.
   for (;;) {
     DCHECK(!parse_stack_.empty());
     const ParseState top = parse_stack_.back();
     // Hitting one of these before finding the open brace means the
     // delimiters don't match up, so we cannot continue.
     if (top == kStartOfInput || top == kOpenBracket ||
         top == kOpenParen || top == kBlockKeyword) {
       return Error(token_out);
     }
     parse_stack_.pop_back();
     if (top == kOpenBrace) {
       break;
     }
   }
   // A kBlockHeader immediately before the open brace can be discarded now.
   // Its presence means these braces were a block rather than an object
   // literal, and in most cases removing it puts us back at
   // start-of-statement, so the check below correctly concludes that this was
   // a block.  The exception is an anonymous function literal: there the block
   // header is (necessarily) preceded by an operator, open paren, or something
   // else that indicates an expression (e.g. foo=function(){};), and so after
   // removing the kBlockHeader the check below correctly concludes that we
   // just completed an expression.
   //
   // (When a kBlockHeader is not followed by braces, as in "if (x) return;",
   // it is instead removed when we roll back to start-of-statement for some
   // other reason, such as encountering a semicolon.)
   if (parse_stack_.back() == kBlockHeader) {
     parse_stack_.pop_back();
   }
   // Whatever now sits on top of the stack is what came before the braces; it
   // tells us whether we closed an object literal (a kExpression) or a block
   // (not an expression).
   DCHECK(!parse_stack_.empty());
   const bool closed_object_literal =
       CanPreceedObjectLiteral(parse_stack_.back());
   if (closed_object_literal) {
     PushExpression();
   }
   return Emit(JsKeywords::kOperator, 1, token_out);
 }

 // Handles a '[' at the front of the input: rejects it if the current parse
 // state cannot be followed by an open bracket, otherwise records kOpenBracket
 // on the parse stack and emits the one-character operator token.
 JsKeywords::Type JsTokenizer::ConsumeOpenBracket(StringPiece* token_out) {
   DCHECK(!input_.empty());
   DCHECK_EQ('[', input_[0]);
   switch (parse_stack_.back()) {
     // An open bracket directly after any of these states is an error.
     case kPeriod:
     case kBlockKeyword:
     case kJumpKeyword:
     case kOtherKeyword:
       return Error(token_out);
     default:
       break;
   }
   parse_stack_.push_back(kOpenBracket);
   return Emit(JsKeywords::kOperator, 1, token_out);
 }

 // Handles a ']' at the front of the input: unwinds the parse stack to the
 // matching kOpenBracket, replaces the bracketed region with an expression,
 // and emits the one-character operator token.  Returns an error token if the
 // delimiters turn out not to match.
 JsKeywords::Type JsTokenizer::ConsumeCloseBracket(StringPiece* token_out) {
   DCHECK(!input_.empty());
   DCHECK_EQ(']', input_[0]);
   // Unwind the stack up to and including the most recent kOpenBracket.
   for (;;) {
     DCHECK(!parse_stack_.empty());
     const ParseState top = parse_stack_.back();
     // None of these may appear between a '[' and its ']'.
     if (top == kStartOfInput || top == kOpenBrace || top == kOpenParen ||
         top == kBlockKeyword || top == kBlockHeader) {
       return Error(token_out);
     }
     parse_stack_.pop_back();
     if (top == kOpenBracket) {
       break;
     }
   }
   // Whatever the brackets enclosed, the result is an expression.
   PushExpression();
   return Emit(JsKeywords::kOperator, 1, token_out);
 }

 // Handles a '(' at the front of the input: rejects it if the current parse
 // state cannot be followed by an open paren, otherwise records kOpenParen on
 // the parse stack and emits the one-character operator token.
 JsKeywords::Type JsTokenizer::ConsumeOpenParen(StringPiece* token_out) {
   DCHECK(!input_.empty());
   DCHECK_EQ('(', input_[0]);
   switch (parse_stack_.back()) {
     // An open paren directly after any of these states is an error.
     case kPeriod:
     case kJumpKeyword:
     case kOtherKeyword:
       return Error(token_out);
     default:
       break;
   }
   parse_stack_.push_back(kOpenParen);
   return Emit(JsKeywords::kOperator, 1, token_out);
 }

 // Handles a ')' at the front of the input: unwinds the parse stack to the
 // matching kOpenParen, then records either a completed block header (when the
 // parens followed a block keyword) or an expression, and emits the
 // one-character operator token.  Returns an error token if the delimiters
 // turn out not to match.
 JsKeywords::Type JsTokenizer::ConsumeCloseParen(StringPiece* token_out) {
   DCHECK(!input_.empty());
   DCHECK_EQ(')', input_[0]);
   // Unwind the stack up to and including the most recent kOpenParen.
   bool found_open_paren = false;
   while (!found_open_paren) {
     DCHECK(!parse_stack_.empty());
     switch (parse_stack_.back()) {
       // None of these may appear between a '(' and its ')'.
       case kStartOfInput:
       case kOpenBrace:
       case kOpenBracket:
       case kBlockKeyword:
       case kBlockHeader:
         return Error(token_out);
       case kOpenParen:
         found_open_paren = true;
         parse_stack_.pop_back();
         break;
       default:
         parse_stack_.pop_back();
         break;
     }
   }
   // Parens that directly follow a block keyword (e.g. "if (...)") complete a
   // kBlockHeader; any other pair of parens yields a kExpression.
   DCHECK(!parse_stack_.empty());
   const bool completes_block_header = (parse_stack_.back() == kBlockKeyword);
   if (completes_block_header) {
     parse_stack_.pop_back();
     PushBlockHeader();
   } else {
     PushExpression();
   }
   return Emit(JsKeywords::kOperator, 1, token_out);
 }

 // Consumes a block comment.  The input must begin with the two characters
 // "/*".  Emits a kComment token that runs through the closing "*/", or
 // returns an error token if the comment is never closed.
 JsKeywords::Type JsTokenizer::ConsumeBlockComment(StringPiece* token_out) {
   DCHECK_GE(input_.size(), 2u);
   DCHECK_EQ('/', input_[0]);
   DCHECK_EQ('*', input_[1]);
   // Begin the search after the opener, so that the '*' of "/*" cannot double
   // as the '*' of the terminator (i.e. "/*/" is not a complete comment).
   const stringpiece_ssize_type terminator_pos = input_.find("*/", 2);
   if (terminator_pos != StringPiece::npos) {
     // The token includes the two characters of the terminator itself.
     return Emit(JsKeywords::kComment, terminator_pos + 2, token_out);
   }
   return Error(token_out);
 }

 // Consumes a line comment from the front of the input and emits it as a
 // kComment token.  The linebreak that ends the comment (if there is one) is
 // left out of the token.
 JsKeywords::Type JsTokenizer::ConsumeLineComment(StringPiece* token_out) {
   Re2StringPiece remaining = StringPieceToRe2(input_);
   Re2StringPiece terminator;
   if (RE2::Consume(&remaining, patterns_->line_comment_pattern, &terminator)) {
     // The matched length is how much of the input was consumed; subtract the
     // captured terminator so that the linebreak is not part of the comment.
     return Emit(JsKeywords::kComment,
                 input_.size() - remaining.size() - terminator.size(),
                 token_out);
   }
   // This method is only called once we are sure that the input starts with a
   // line comment, so a failed match ought not to happen even for
   // pathological input.
   LOG(DFATAL) << "Failed to match line comment pattern: "
               << input_.substr(0, 50);
   return Error(token_out);
 }

 // If the input begins with a comment, consumes it, stores the resulting token
 // type in |*type_out| (which may be an error type, e.g. for an unterminated
 // block comment) and returns true.  Otherwise returns false without consuming
 // anything.
 bool JsTokenizer::TryConsumeComment(
     JsKeywords::Type* type_out, StringPiece* token_out) {
   DCHECK(!input_.empty());
   // Block comment: /* ... */
   if (input_.starts_with("/*")) {
     *type_out = ConsumeBlockComment(token_out);
     return true;
   }
   // Line comment: "//" or "<!--" anywhere, or "-->" at the start of a line.
   bool is_line_comment =
       input_.starts_with("//") || input_.starts_with("<!--");
   if (!is_line_comment && start_of_line_) {
     is_line_comment = input_.starts_with("-->");
   }
   if (!is_line_comment) {
     return false;
   }
   *type_out = ConsumeLineComment(token_out);
   return true;
 }

 // Handles a ':' at the front of the input.  The colon may be the second half
 // of a ternary operator, the end of a label (or of "default"), or the
 // separator after a property name in an object literal; the parse stack is
 // used to tell these apart.  Emits the one-character operator token, or an
 // error token if the colon makes no sense in the current state.
 JsKeywords::Type JsTokenizer::ConsumeColon(StringPiece* token_out) {
   DCHECK(!input_.empty());
   DCHECK_EQ(':', input_[0]);
   // First discard anything that may lie between the colon and the question
   // mark or start-of-statement that it belongs to.  kOtherKeyword is included
   // here for the sake of the "default" keyword.
   for (;;) {
     DCHECK(!parse_stack_.empty());
     const ParseState top = parse_stack_.back();
     if (top != kExpression && top != kOtherKeyword) {
       break;
     }
     parse_stack_.pop_back();
   }
   // Now decide what the colon means based on the state we stopped at.
   switch (parse_stack_.back()) {
     case kQuestionMark:
       // Ternary operator: the kQuestionMark is replaced by a kOperator.
       parse_stack_.pop_back();
       PushOperator();
       break;
     case kStartOfInput:
     case kBlockHeader:
       // We got back to the start of the statement without meeting a
       // kQuestionMark, so this was a label; no new parse state is needed.
       break;
     case kOpenBrace:
       // The top state is kOpenBrace and the bottom state is always
       // kStartOfInput, so the stack holds at least two entries.
       DCHECK_GE(parse_stack_.size(), 2u);
       // Inside an object literal the colon follows a property name, so push
       // a kOperator to note that an expression (rather than the next
       // property name) comes next.  Inside a block we are simply back at
       // start-of-statement (a label), and nothing needs to be pushed.
       if (CanPreceedObjectLiteral(parse_stack_[parse_stack_.size() - 2])) {
         PushOperator();
       }
       break;
     // A colon cannot follow any of these states.
     case kOperator:
     case kPeriod:
     case kOpenBracket:
     case kOpenParen:
     case kBlockKeyword:
     case kReturnThrow:
     case kJumpKeyword:
       return Error(token_out);
     default:
       LOG(DFATAL) << "Unknown parse state: " << parse_stack_.back();
       return Error(token_out);
   }
   return Emit(JsKeywords::kOperator, 1, token_out);
 }

 // Handles a ',' at the front of the input.  A comma is either a list
 // separator (array/object literal entries, or identifier lists after e.g.
 // "var"), a binary operator, or a placeholder for an omitted array element.
 // Emits the one-character operator token, or an error token if the comma
 // cannot appear in the current state.
 JsKeywords::Type JsTokenizer::ConsumeComma(StringPiece* token_out) {
   DCHECK(!input_.empty());
   DCHECK_EQ(',', input_[0]);
   const ParseState top = parse_stack_.back();
   // A comma directly after '[' is an omitted array entry, as in [,2,,3].  The
   // stack needs no adjustment in that case.
   if (top == kOpenBracket) {
     return Emit(JsKeywords::kOperator, 1, token_out);
   }
   // Apart from that, a comma is only acceptable right after an expression or
   // identifier.
   if (top != kExpression) {
     return Error(token_out);
   }
   // The top state is kExpression and the bottom state is always
   // kStartOfInput, so the stack holds at least two entries.
   DCHECK_GE(parse_stack_.size(), 2u);
   const size_t depth = parse_stack_.size();
   const ParseState below = parse_stack_[depth - 2];
   // Is this comma separating items of an array literal, an object literal,
   // or an identifier list such as the one that follows "var"?
   bool is_list_separator = (below == kOtherKeyword || below == kOpenBracket);
   if (!is_list_separator && below == kOpenBrace) {
     // The second-from-top state is kOpenBrace (i.e. not kStartOfInput), so
     // the stack holds at least three entries.  The braces are an object
     // literal only if the state before them could precede one.
     is_list_separator = CanPreceedObjectLiteral(parse_stack_[depth - 3]);
   }
   if (is_list_separator) {
     // Drop the item just completed, returning to the opening delimiter, so
     // that every item in the list sees the same parse stack.
     parse_stack_.pop_back();
   } else {
     // Otherwise the comma is a binary operator (evaluate the first operand,
     // yield the second, as in C).
     PushOperator();
   }
   return Emit(JsKeywords::kOperator, 1, token_out);
 }

 // Attempts to consume an identifier or keyword from the front of the input.
 // Returns false (consuming nothing) if the input does not start with one.
 // Otherwise returns true, stores the token in *token_out and its type in
 // *type_out (kError for reserved words that may not be used), and updates
 // the parse stack according to the kind of word that was seen.
 bool JsTokenizer::TryConsumeIdentifierOrKeyword(
     JsKeywords::Type* type_out, StringPiece* token_out) {
   DCHECK(!input_.empty());
   // This is a hot path, and RE2 matching is comparatively slow.  The regex is
   // only really needed for unicode identifiers, and most JS is plain ASCII,
   // so scan ASCII identifier characters by hand and only fall back to the
   // regex when a non-ASCII byte shows up.
   const unsigned char lead = input_[0];
   bool needs_regex = (lead >= 0x80);
   int length = 0;
   if (!needs_regex) {
     const bool starts_identifier =
         ('a' <= lead && lead <= 'z') || lead == '_' ||
         ('A' <= lead && lead <= 'Z') || lead == '$' || lead == '\\';
     if (!starts_identifier) {
       return false;
     }
     const int limit = input_.size();
     length = 1;
     while (length < limit) {
       const unsigned char next = input_[length];
       if (next >= 0x80) {
         needs_regex = true;
         break;
       }
       const bool continues_identifier =
           net_instaweb::IsAsciiAlphaNumeric(next) || next == '_' ||
           next == '$' || next == '\\';
       if (!continues_identifier) {
         break;
       }
       ++length;
     }
   }
   if (needs_regex) {
     // Re-match from the start of the input with the full identifier pattern.
     Re2StringPiece remaining = StringPieceToRe2(input_);
     if (!RE2::Consume(&remaining, patterns_->identifier_pattern)) {
       return false;
     }
     length = input_.size() - remaining.size();
   }
   DCHECK_GT(length, 0);
   // We matched a word; find out which keyword (if any) it spells.
   JsKeywords::Flag unused_flag;
   JsKeywords::Type type =
       JsKeywords::Lookup(input_.substr(0, length), &unused_flag);
   // Right after a period, or in property-name position of an object literal,
   // even a reserved word is just an identifier: "foo.if" is legal and means
   // the same as "foo['if']".
   const ParseState top = parse_stack_.back();
   const bool is_property_name =
       top == kPeriod ||
       (top == kOpenBrace &&
        CanPreceedObjectLiteral(parse_stack_[parse_stack_.size() - 2]));
   if (is_property_name) {
     PushExpression();
     *type_out = Emit(JsKeywords::kIdentifier, length, token_out);
     return true;
   }
   switch (type) {
     // Non-keywords are identifiers.  The other words listed here are only
     // reserved in strict mode; since strict mode errors are not detected
     // yet, they are always accepted as identifiers.
     case JsKeywords::kNotAKeyword:
     case JsKeywords::kImplements:
     case JsKeywords::kInterface:
     case JsKeywords::kLet:
     case JsKeywords::kPackage:
     case JsKeywords::kPrivate:
     case JsKeywords::kProtected:
     case JsKeywords::kPublic:
     case JsKeywords::kStatic:
     case JsKeywords::kYield:
       type = JsKeywords::kIdentifier;
       // Directly after a kBlockKeyword the identifier is the name of a
       // function declaration; it is ignored and the state stays
       // kBlockKeyword.  Every other identifier counts as an expression.
       if (top != kBlockKeyword) {
         PushExpression();
       }
       break;
     // Keywords that are themselves expressions; a following slash is
     // division, not a regex literal.
     case JsKeywords::kFalse:
     case JsKeywords::kNull:
     case JsKeywords::kThis:
     case JsKeywords::kTrue:
       PushExpression();
       break;
     // Keywords that must be followed by a parenthesized header.  A slash
     // directly after the keyword is invalid; a slash after the parentheses
     // starts a regex literal.
     case JsKeywords::kCatch:
     case JsKeywords::kFor:
     case JsKeywords::kFunction:
     case JsKeywords::kIf:
     case JsKeywords::kSwitch:
     case JsKeywords::kWhile:
     case JsKeywords::kWith:
       parse_stack_.push_back(kBlockKeyword);
       break;
     // Keywords that introduce a block.  A following slash starts a regex
     // literal, and a following open brace starts a block rather than an
     // object literal.
     case JsKeywords::kDo:
     case JsKeywords::kElse:
     case JsKeywords::kFinally:
     case JsKeywords::kTry:
       PushBlockHeader();
       break;
     // Keywords that behave (more or less) like operators.  A following slash
     // starts a regex literal, and a following open brace starts an object
     // literal rather than a block.
     case JsKeywords::kCase:
     case JsKeywords::kDelete:
     case JsKeywords::kIn:
     case JsKeywords::kInstanceof:
     case JsKeywords::kNew:
     case JsKeywords::kTypeof:
     case JsKeywords::kVoid:
       PushOperator();
       break;
     // Like prefix operators as far as slashes go, but a linebreak after
     // either of these always triggers semicolon insertion.
     case JsKeywords::kReturn:
     case JsKeywords::kThrow:
       parse_stack_.push_back(kReturnThrow);
       break;
     // Neither division nor a regex literal may follow these, so a slash
     // (other than a comment) is an error.  A linebreak after them always
     // triggers semicolon insertion.
     case JsKeywords::kBreak:
     case JsKeywords::kContinue:
     case JsKeywords::kDebugger:
       parse_stack_.push_back(kJumpKeyword);
       break;
     // Neither division nor a regex literal may follow these either, but a
     // linebreak after them never triggers semicolon insertion.
     case JsKeywords::kConst:
     case JsKeywords::kDefault:
     case JsKeywords::kVar:
       parse_stack_.push_back(kOtherKeyword);
       break;
     // Reserved words that may not be used at all.
     case JsKeywords::kClass:
     case JsKeywords::kEnum:
     case JsKeywords::kExport:
     case JsKeywords::kExtends:
     case JsKeywords::kImport:
     case JsKeywords::kSuper:
       *type_out = Error(token_out);
       return true;
     default:
       LOG(DFATAL) << "Unknown keyword type: " << type;
       *type_out = Error(token_out);
       return true;
   }
   *type_out = Emit(type, length, token_out);
   return true;
 }

 // Consumes a numeric literal from the front of the input, marks the parse
 // state as an expression, and emits a kNumber token.  Emits an error token
 // if the numeric pattern unexpectedly fails to match.
 JsKeywords::Type JsTokenizer::ConsumeNumber(StringPiece* token_out) {
   DCHECK(!input_.empty());
   Re2StringPiece rest = StringPieceToRe2(input_);
   if (RE2::Consume(&rest, patterns_->numeric_literal_pattern)) {
     PushExpression();
     const int length = input_.size() - rest.size();
     return Emit(JsKeywords::kNumber, length, token_out);
   }
   // Callers only get here when they are sure a numeric literal comes next,
   // so a failed match should not happen even for pathological input.
   LOG(DFATAL) << "Failed to match number pattern: " << input_.substr(0, 50);
   return Error(token_out);
 }

 // Consumes an operator token from the front of the input and emits it as
 // kOperator.  A "++" or "--" directly after an expression is a postfix
 // operator and leaves the parse state alone; every other operator pushes a
 // kOperator state.  Input that matches no operator yields an error token.
 JsKeywords::Type JsTokenizer::ConsumeOperator(StringPiece* token_out) {
   DCHECK(!input_.empty());
   Re2StringPiece rest = StringPieceToRe2(input_);
   if (!RE2::Consume(&rest, patterns_->operator_pattern)) {
     // Unrecognized character.
     return Error(token_out);
   }
   const int length = input_.size() - rest.size();
   const JsKeywords::Type result =
       Emit(JsKeywords::kOperator, length, token_out);
   DCHECK(!parse_stack_.empty());
   const bool is_step_operator = (*token_out == "++" || *token_out == "--");
   const bool is_postfix =
       is_step_operator && parse_stack_.back() == kExpression;
   if (!is_postfix) {
     // Prefix or binary operator: record it on the parse stack.  (A postfix
     // operator keeps the state as kExpression.)
     PushOperator();
   }
   return result;
 }

 // Handles a '.' at the front of the input.  If a decimal digit follows, the
 // period starts a numeric literal and is handed to ConsumeNumber(); otherwise
 // it is emitted as an operator and kPeriod is pushed onto the parse stack.
 JsKeywords::Type JsTokenizer::ConsumePeriod(StringPiece* token_out) {
   DCHECK(!input_.empty());
   DCHECK_EQ('.', input_[0]);
   const bool digit_follows =
       input_.size() >= 2 && input_[1] >= '0' && input_[1] <= '9';
   if (digit_follows) {
     return ConsumeNumber(token_out);
   }
   parse_stack_.push_back(kPeriod);
   return Emit(JsKeywords::kOperator, 1, token_out);
 }

 // Handles a '?' at the front of the input.  It is only accepted directly
 // after an expression, in which case kQuestionMark is pushed onto the parse
 // stack and the character is emitted as an operator; otherwise an error
 // token is produced.
 JsKeywords::Type JsTokenizer::ConsumeQuestionMark(StringPiece* token_out) {
   DCHECK(!input_.empty());
   DCHECK_EQ('?', input_[0]);
   DCHECK(!parse_stack_.empty());
   if (parse_stack_.back() == kExpression) {
     parse_stack_.push_back(kQuestionMark);
     return Emit(JsKeywords::kOperator, 1, token_out);
   }
   return Error(token_out);
 }

 // Consumes a regex literal starting at the '/' at the front of the input,
 // marks the parse state as an expression, and emits a kRegex token.  Produces
 // an error token if the regex literal pattern does not match.
 JsKeywords::Type JsTokenizer::ConsumeRegex(StringPiece* token_out) {
   DCHECK(!input_.empty());
   DCHECK_EQ('/', input_[0]);
   Re2StringPiece rest = StringPieceToRe2(input_);
   if (RE2::Consume(&rest, patterns_->regex_literal_pattern)) {
     PushExpression();
     const int length = input_.size() - rest.size();
     return Emit(JsKeywords::kRegex, length, token_out);
   }
   // Hitting EOF or a linebreak inside the regex is an error.
   return Error(token_out);
 }

 // Handles a ';' at the front of the input.  A semicolon either ends a
 // statement or separates the clauses of a for-loop header, so the parse stack
 // is unwound to the nearest open brace / start of input (end of statement) or
 // to the nearest open paren (which must then belong to a block keyword).  A
 // semicolon inside brackets, or inside parens that do not follow a block
 // keyword, is an error.
 JsKeywords::Type JsTokenizer::ConsumeSemicolon(StringPiece* token_out) {
   DCHECK(!input_.empty());
   DCHECK_EQ(';', input_[0]);
   bool unwound = false;
   while (!unwound) {
     DCHECK(!parse_stack_.empty());
     const ParseState top = parse_stack_.back();
     if (top == kOpenBracket) {
       return Error(token_out);
     }
     if (top == kOpenParen) {
       // Only a for-loop header may contain a semicolon within parens, so the
       // entry below the kOpenParen must be kBlockKeyword (for the "for"
       // keyword).  That entry exists: kOpenParen is on top and the bottom of
       // the stack is always kStartOfInput.
       DCHECK_GE(parse_stack_.size(), 2u);
       if (parse_stack_[parse_stack_.size() - 2] != kBlockKeyword) {
         return Error(token_out);
       }
       unwound = true;
     } else if (top == kStartOfInput || top == kOpenBrace) {
       unwound = true;
     } else {
       parse_stack_.pop_back();
     }
   }
   // The stack is in place; now emit the semicolon itself.
   return Emit(JsKeywords::kOperator, 1, token_out);
 }

 // Handles a '/' at the front of the input, which may start a comment, be a
 // division operator, or start a regex literal.  Comments are recognized by
 // the next character alone; the other two are told apart by the current
 // parse state.  States in which neither is allowed produce an error token.
 JsKeywords::Type JsTokenizer::ConsumeSlash(StringPiece* token_out) {
   DCHECK(!input_.empty());
   DCHECK_EQ('/', input_[0]);
   // "//" and "/*" are comments regardless of the parse state.
   if (input_.size() >= 2) {
     switch (input_[1]) {
       case '/':
         return ConsumeLineComment(token_out);
       case '*':
         return ConsumeBlockComment(token_out);
       default:
         break;
     }
   }
   // Not a comment, so the parse state decides between division and regex.
   DCHECK(!parse_stack_.empty());
   const ParseState top = parse_stack_.back();
   switch (top) {
     // After an expression, a slash divides.
     case kExpression:
       return ConsumeOperator(token_out);
     // Where an expression is expected to begin, a slash opens a regex.
     case kStartOfInput:
     case kOperator:
     case kQuestionMark:
     case kOpenBrace:
     case kOpenBracket:
     case kOpenParen:
     case kBlockHeader:
     case kReturnThrow:
       return ConsumeRegex(token_out);
     // Neither reading is allowed in these states.
     case kPeriod:
     case kBlockKeyword:
     case kJumpKeyword:
     case kOtherKeyword:
       return Error(token_out);
     default:
       LOG(DFATAL) << "Unknown parse state: " << top;
       return Error(token_out);
   }
 }

 // Consumes a string literal starting at the quote character at the front of
 // the input, marks the parse state as an expression, and emits a
 // kStringLiteral token.  Produces an error token if the string literal
 // pattern does not match, or if the matched text is not terminated by the
 // same quote character that opened it.
 JsKeywords::Type JsTokenizer::ConsumeString(StringPiece* token_out) {
   DCHECK(!input_.empty());
   DCHECK(input_[0] == '"' || input_[0] == '\'');
   Re2StringPiece unconsumed = StringPieceToRe2(input_);
   if (!RE2::Consume(&unconsumed, patterns_->string_literal_pattern)) {
     // EOF or an unescaped linebreak in the string will cause an error.
     return Error(token_out);
   }
   const size_t length = input_.size() - unconsumed.size();
   // A complete literal is at least two characters long (opening and closing
   // quote).  Checking the length first matters: for a one-character match the
   // "last character" is the opening quote itself, so comparing it against
   // input_[0] would always succeed, and for an empty match the index would
   // wrap around.  NOTE(review): whether the pattern can actually produce such
   // a short match is not visible from here; the guard is harmless if not.
   if (length < 2 || input_[length - 1] != input_[0]) {
     return Error(token_out);
   }
   PushExpression();
   return Emit(JsKeywords::kStringLiteral, static_cast<int>(length),
               token_out);
 }

 bool JsTokenizer::TryConsumeWhitespace(
     bool allow_semicolon_insertion,
     JsKeywords::Type* type_out, StringPiece* token_out) {
   DCHECK(!input_.empty());
   // This method gets very hot under load, and regex matching is slow.  We need
   // RE2 here mainly for the unicode support, but most JS files are plain
   // ASCII.  So first try to match against ASCII whitespace; only if we run
   // into a non-ASCII byte will we resort to RE2.
   bool has_linebreak = false;
   bool use_regex = false;
   int token_size = 0, size = input_.size();
   for (; token_size < size; ++token_size) {
     const unsigned char ch = input_[token_size];
     if (ch >= 0x80) {
       use_regex = true;
       break;
     } else if (ch == '\n' || ch == '\r') {
       has_linebreak = true;
     } else if (ch != ' ' && ch != '\t' && ch != '\f' && ch != '\v') {
       break;
     }
   }
   if (use_regex) {
     Re2StringPiece unconsumed = StringPieceToRe2(input_);
     Re2StringPiece linebreak;
     if (!RE2::Consume(&unconsumed, patterns_->whitespace_pattern, &linebreak)) {
       return false;
     }
     has_linebreak = !linebreak.empty();
     token_size = input_.size() - unconsumed.size();
     DCHECK_GT(token_size, 0);
   }
   if (token_size == 0) {
     return false;
   }
   // Yep, this was whitespace.  Emit a token now, since we may need to do some
   // lookahead in a moment.  We may change *type_out in a moment, but
   // kWhitespace is good enough to get Emit() to do the right thing for now.
   *type_out = Emit(JsKeywords::kWhitespace, token_size, token_out);
   // Now we have to decide what kind of whitespace this was.  If it contained
   // no linebreaks, it's just regular whitespace; otherwise, we have to decide
   // whether or not this linebreak will cause semicolon insertion, and set
   // *type_out accordingly.
   if (has_linebreak) {
     start_of_line_ = true;
     if (allow_semicolon_insertion && TryInsertLinebreakSemicolon()) {
       *type_out = JsKeywords::kSemiInsert;
     } else {
       *type_out = JsKeywords::kLineSeparator;
     }
   }
   return true;
 }

 // Puts the tokenizer into the error state: the whole remaining input is
 // handed back through *token_out, the input is then emptied, and kError is
 // returned.
 JsKeywords::Type JsTokenizer::Error(StringPiece* token_out) {
   // Capture the remaining input before discarding it.
   *token_out = input_;
   input_.clear();
   error_ = true;
   return JsKeywords::kError;
 }

 // Removes the first num_chars characters from the input, stores them in
 // *token_out, and returns type unchanged.  For tokens other than comments
 // and whitespace it also clears start_of_line_ and advances the small state
 // machine that detects whether the input is a JSON object rather than JS
 // code.
 JsKeywords::Type JsTokenizer::Emit(JsKeywords::Type type, int num_chars,
                                    StringPiece* token_out) {
   DCHECK_GT(num_chars, 0);
   DCHECK_LE(static_cast<size_t>(num_chars), input_.size());
   const StringPiece text = input_.substr(0, num_chars);
   const bool is_trivia =
       type == JsKeywords::kComment || type == JsKeywords::kWhitespace ||
       type == JsKeywords::kLineSeparator || type == JsKeywords::kSemiInsert;
   if (!is_trivia) {
     start_of_line_ = false;
     // JSON detection: if the first three real tokens are open brace, string
     // literal, colon, the input must be a JSON object (that sequence is not
     // legal at the start of JS code), and the outer braces have to be
     // treated as an object literal instead of a code block.  Any other
     // opening sequence means this is ordinary JS.
     const bool is_operator = (type == JsKeywords::kOperator);
     switch (json_step_) {
       case kJsonStart:
         json_step_ =
             (is_operator && text == "{") ? kJsonOpenBrace : kIsNotJsonObject;
         break;
       case kJsonOpenBrace:
         json_step_ = (type == JsKeywords::kStringLiteral)
                          ? kJsonOpenBraceStringLiteral
                          : kIsNotJsonObject;
         break;
       case kJsonOpenBraceStringLiteral:
         if (!(is_operator && text == ":")) {
           json_step_ = kIsNotJsonObject;
           break;
         }
         json_step_ = kIsJsonObject;
         // Having seen open brace, string literal, colon, the parse stack is
         // expected to be "Start {".  Rewrite it to "Start Oper { Oper": the
         // Oper before the brace makes it look like an object literal, and
         // the trailing Oper is what a colon inside an object literal
         // produces.  Tokenizing then simply continues from there.
         DCHECK_EQ(2u, parse_stack_.size());
         DCHECK_EQ(kStartOfInput, parse_stack_[0]);
         DCHECK_EQ(kOpenBrace, parse_stack_[1]);
         parse_stack_.pop_back();
         parse_stack_.push_back(kOperator);
         parse_stack_.push_back(kOpenBrace);
         parse_stack_.push_back(kOperator);
         break;
       default:
         // The decision has already been made; nothing left to track.
         break;
     }
   }
   *token_out = text;
   input_ = input_.substr(num_chars);
   return type;
 }

 // Puts the parse stack into the kBlockHeader state.  If kBlockHeader is
 // already on top (e.g. for "else if (...)"), the two are merged by leaving
 // the stack untouched.
 void JsTokenizer::PushBlockHeader() {
   DCHECK(!parse_stack_.empty());
   const bool already_in_header = (parse_stack_.back() == kBlockHeader);
   if (!already_in_header) {
     parse_stack_.push_back(kBlockHeader);
   }
 }

 void JsTokenizer::PushExpression() {
   // Push a kExpression state onto the stack, merging it with any kExpression or
   // kOperator states on top (e.g. so "a + b" -> "Expr Oper Expr" becomes "Expr"
   // and "foo(1)" -> "Expr ( Expr )" becomes "Expr Expr" becomes "Expr").
   DCHECK(!parse_stack_.empty());
   while (parse_stack_.back() == kExpression ||
          parse_stack_.back() == kOperator || parse_stack_.back() == kPeriod) {
     parse_stack_.pop_back();
     DCHECK(!parse_stack_.empty());
   }
   parse_stack_.push_back(kExpression);
 }

 // Puts the parse stack into the kOperator state.  If kOperator is already on
 // top (e.g. for "x && !y"), the two are merged by leaving the stack
 // untouched.
 void JsTokenizer::PushOperator() {
   DCHECK(!parse_stack_.empty());
   const bool already_operator = (parse_stack_.back() == kOperator);
   if (!already_operator) {
     parse_stack_.push_back(kOperator);
   }
 }

 bool JsTokenizer::TryInsertLinebreakSemicolon() {
   // Determining whether semicolon insertion happens requires checking the next
   // non-whitespace/comment token, so skip past any comments and whitespace and
   // store them in the lookahead queue.  Note that whether or not the linebreak
   // we're considering in this method inserts a semicolon, the subsequent
   // whitespace we're about to skip past certainly won't.
   DCHECK(lookahead_queue_.empty());
   {
     JsKeywords::Type type;
     StringPiece token;
     while (!input_.empty() &&
            (TryConsumeComment(&type, &token) ||
             TryConsumeWhitespace(false, &type, &token))) {
       lookahead_queue_.push_back(std::make_pair(type, token));
     }
   }
   // Even if semicolon insertion would technically happen for the linebreak
   // here, we will pretend that it won't if we're about to hit a real
   // semicolon, or if the semicolon would be inserted anyway without the
   // linebreak.
   if (input_.empty() || input_[0] == ';' || input_[0] == '}') {
     return false;
   }
   // Whether semicolon insertion can happen depends on the current parse state.
   DCHECK(!parse_stack_.empty());
   switch (parse_stack_.back()) {
     case kStartOfInput:
     case kOpenBrace:
     case kOpenBracket:
     case kOpenParen:
     case kBlockKeyword:
     case kBlockHeader:
       // Semicolon insertion never happens in places where it would create an
       // empty statement.
       return false;
     case kExpression:
       // A statement can't end with an unclosed paren or bracket; in
       // particular, semicolons for a for-loop header are never inserted.
       for (std::vector<ParseState>::const_reverse_iterator iter =
                parse_stack_.rbegin(), end = parse_stack_.rend();
            iter != end; ++iter) {
         const ParseState state = *iter;
         if (state == kOpenParen || state == kOpenBracket) {
           return false;
         }
         if (state == kOpenBrace || state == kBlockHeader) {
           break;
         }
       }
       // Semicolon insertion will not happen after an expression if the next
       // token could continue the statement.
       {
         Re2StringPiece unconsumed = StringPieceToRe2(input_);
         if (RE2::Consume(&unconsumed, patterns_->line_continuation_pattern)) {
           return false;
         }
       }
       break;
     // Binary and prefix operators should not have semicolon insertion happen
     // after them.
     case kOperator:
     case kPeriod:
     case kQuestionMark:
       return false;
     // Line continuations are never permitted after return, throw, break,
     // continue, or debugger keywords, so a semicolon is always inserted for
     // those.
     case kReturnThrow:
     case kJumpKeyword:
       break;
     // A statement cannot end after const, default, or var, so we never insert
     // a semicolon after those.
     case kOtherKeyword:
       return false;
     default:
       LOG(DFATAL) << "Unknown parse state: " << parse_stack_.back();
       break;
   }
   // We've decided at this point that semicolon insertion will happen, so
   // update the parse stack to end the current statement.
   while (true) {
     DCHECK(!parse_stack_.empty());
     const ParseState state = parse_stack_.back();
     if (state == kStartOfInput || state == kOpenBrace) {
       break;
     }
     parse_stack_.pop_back();
   }
   return true;
 }

 // Returns true for exactly those parse states that this tokenizer treats as
 // allowing an object literal to follow: kOperator, kQuestionMark,
 // kOpenBracket, kOpenParen, and kReturnThrow.
 bool JsTokenizer::CanPreceedObjectLiteral(ParseState state) {
   switch (state) {
     case kOperator:
     case kQuestionMark:
     case kOpenBracket:
     case kOpenParen:
     case kReturnThrow:
       return true;
     default:
       return false;
   }
 }

 // Compiles every regular expression used by the tokenizer.  The numeric
 // literal pattern is compiled with POSIX syntax; all others use the default
 // options.  In debug builds, each pattern is checked to have compiled
 // successfully.
 JsTokenizerPatterns::JsTokenizerPatterns()
     : identifier_pattern(kIdentifierRegex),
       line_comment_pattern(kLineCommentRegex),
       numeric_literal_pattern(kNumericLiteralPosixRegex, re2::posix_syntax),
       operator_pattern(kOperatorRegex),
       regex_literal_pattern(kRegexLiteralRegex),
       string_literal_pattern(kStringLiteralRegex),
       whitespace_pattern(kWhitespaceRegex),
       line_continuation_pattern(kLineContinuationRegex) {
   DCHECK(identifier_pattern.ok());
   // The line comment pattern was previously the only one left unchecked.
   DCHECK(line_comment_pattern.ok());
   DCHECK(numeric_literal_pattern.ok());
   DCHECK(operator_pattern.ok());
   DCHECK(regex_literal_pattern.ok());
   DCHECK(string_literal_pattern.ok());
   DCHECK(whitespace_pattern.ok());
   DCHECK(line_continuation_pattern.ok());
 }

 // Nothing to release explicitly; the body is intentionally empty.
 JsTokenizerPatterns::~JsTokenizerPatterns() {}

 }  // namespace js

 }  // namespace pagespeed