| // Copyright 2014 Google Inc. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #ifndef PAGESPEED_KERNEL_JS_JS_TOKENIZER_H_ |
| #define PAGESPEED_KERNEL_JS_JS_TOKENIZER_H_ |
| |
| #include <deque> |
| #include <utility> |
| #include <vector> |
| |
| #include "pagespeed/kernel/base/basictypes.h" |
| #include "pagespeed/kernel/base/string.h" |
| #include "pagespeed/kernel/base/string_util.h" |
| #include "pagespeed/kernel/js/js_keywords.h" |
| #include "pagespeed/kernel/util/re2.h" |
| |
| namespace pagespeed { |
| |
| namespace js { |
| |
| class JsTokenizerPatterns; |
| |
| // This class accurately breaks up JavaScript code into a sequence of tokens. |
| // This includes tokens for comments and whitespace; every byte of the input is |
| // represented in the token stream, so that concatenating the text of each |
| // token will perfectly recover the original input, even in error cases (since |
| // the final, error token will contain the entire rest of the input). Also, |
| // each whitespace token is classified by the tokenizer as 1) not containing |
| // linebreaks, 2) containing linebreaks but not inducing semicolon insertion, |
| // or 3) inducing semicolon insertion. |
| // |
| // To do all this, JsTokenizer keeps track of a minimal amount of parse state |
| // to allow it to accurately differentiate between division operators and regex |
| // literals, and to determine which linebreaks will result in semicolon |
| // insertion and which will not. If the given JavaScript code is syntactically |
| // incorrect such that this differentiation becomes impossible, this class will |
| // return an error, but will still tokenize as much as it can up to that point |
| // (note however that many other kinds of syntax errors will be ignored; being |
| // a complete parser or syntax checker is a non-goal of this class). |
| // |
| // This class can also be used to tokenize JSON. Note that a JSON object, such |
| // as {"foo":"bar"}, is NOT legal JavaScript code by itself (since, absent any |
| // context, the braces will be interpreted as a code block rather than as an |
| // object literal); however, JsTokenizer contains special logic to recognize |
| // this case and still tokenize it correctly. |
| // |
| // This separation of tokens and classification of whitespace means that this |
| // class can be used to create a robust JavaScript minifier (see js_minify.h). |
| // It could also perhaps be used as the basis of a more complete JavaScript |
| // parser. |
| class JsTokenizer { |
| public: |
| // Creates a tokenizer that will tokenize the given UTF8-encoded input string |
| // (which must outlive the JsTokenizer object). |
| JsTokenizer(const JsTokenizerPatterns* patterns, StringPiece input); |
| |
| ~JsTokenizer(); |
| |
| // Gets the next token type from the input, and stores the relevant substring |
| // of the original input in token_out (which must be non-NULL). If the end |
| // of input has been reached, returns kEndOfInput and sets token_out to the |
| // empty string. If an error is encountered, sets has_error() to true, |
| // returns kError, and sets token_out to the remainder of the input. |
| JsKeywords::Type NextToken(StringPiece* token_out); |
| |
| // True if an error has been encountered. All future calls to NextToken() |
| // will return JsKeywords::kError with an empty token string. |
| bool has_error() const { return error_; } |
| |
| // Return a string representing the current parse stack, for testing only. |
| GoogleString ParseStackForTest() const; |
| |
| private: |
| // An entry in the parse stack. This does not fully capture the grammar of |
| // JavaScript -- far from it -- rather, it is just barely nuanced enough to |
| // determine which linebreaks are important for semicolon insertion, and to |
| // tell whether or not a given slash begins a regex literal. If it turns out |
| // to insufficiently nuanced (i.e. we find new bugs), it can be refined by |
| // adding more parse states. |
| enum ParseState { |
| kStartOfInput, // For convenience, the bottom of the stack is always this. |
| kExpression, |
| kOperator, // A prefix or binary operator (including some keywords). |
| kPeriod, |
| kQuestionMark, |
| kOpenBrace, |
| kOpenBracket, |
| kOpenParen, |
| kBlockKeyword, // Keyword that precedes "(...)", e.g. "if" or "for". |
| kBlockHeader, // Start of block, e.g. "if (...)", "for (...)", or "else". |
| kReturnThrow, // A return or throw keyword. |
| kJumpKeyword, // A break, continue, or debugger keyword. |
| kOtherKeyword, // A const, default, or var keyword. |
| }; |
| |
| // Enum for tracking whether the first three tokens in the input are open |
| // brace, string literal, colon. If so, we're parsing a JSON object, |
| // otherwise we'll assume we're parsing legal JS code. |
| enum JsonStep { |
| kJsonStart, |
| kJsonOpenBrace, |
| kJsonOpenBraceStringLiteral, |
| kIsJsonObject, |
| kIsNotJsonObject, |
| }; |
| |
| // Consumes an appropriate amount of input and return an appropriate token. |
| JsKeywords::Type ConsumeOpenBrace(StringPiece* token_out); |
| JsKeywords::Type ConsumeCloseBrace(StringPiece* token_out); |
| JsKeywords::Type ConsumeOpenBracket(StringPiece* token_out); |
| JsKeywords::Type ConsumeCloseBracket(StringPiece* token_out); |
| JsKeywords::Type ConsumeOpenParen(StringPiece* token_out); |
| JsKeywords::Type ConsumeCloseParen(StringPiece* token_out); |
| JsKeywords::Type ConsumeBlockComment(StringPiece* token_out); |
| JsKeywords::Type ConsumeLineComment(StringPiece* token_out); |
| JsKeywords::Type ConsumeColon(StringPiece* token_out); |
| JsKeywords::Type ConsumeComma(StringPiece* token_out); |
| JsKeywords::Type ConsumeNumber(StringPiece* token_out); |
| JsKeywords::Type ConsumeOperator(StringPiece* token_out); |
| JsKeywords::Type ConsumePeriod(StringPiece* token_out); |
| JsKeywords::Type ConsumeQuestionMark(StringPiece* token_out); |
| JsKeywords::Type ConsumeRegex(StringPiece* token_out); |
| JsKeywords::Type ConsumeSemicolon(StringPiece* token_out); |
| JsKeywords::Type ConsumeSlash(StringPiece* token_out); |
| JsKeywords::Type ConsumeString(StringPiece* token_out); |
| |
| // For each of these methods, if the start of the input is that kind of |
| // token, consumes the token and returns true, otherwise returns false |
| // without making changes. |
| bool TryConsumeComment( |
| JsKeywords::Type* type_out, StringPiece* token_out); |
| bool TryConsumeIdentifierOrKeyword( |
| JsKeywords::Type* type_out, StringPiece* token_out); |
| bool TryConsumeWhitespace( |
| bool allow_semicolon_insertion, |
| JsKeywords::Type* type_out, StringPiece* token_out); |
| |
| // Sets error_ to true and returns an error token. |
| JsKeywords::Type Error(StringPiece* token_out); |
| |
| // Stores the next num_chars characters of the input into *token_out, and |
| // then increment the start of input_ by num_chars characters. If the token |
| // type is not comment or whitespace, sets start_of_line_ to false. Also |
| // updates json_step_ based on the token type. Returns the token type passed |
| // in, for convenience. |
| JsKeywords::Type Emit(JsKeywords::Type type, int num_chars, |
| StringPiece* token_out); |
| |
| // Pushes a new state onto the parse_stack_, merging states as needed. |
| void PushBlockHeader(); |
| void PushExpression(); |
| void PushOperator(); |
| |
| // If a semicolon will be inserted between the previous token and the next |
| // token (assuming there was a linebreak in between) that _wouldn't_ be |
| // inserted if the linebreak weren't there, update the parse stack to reflect |
| // the semicolon insertion and return true; otherwise do nothing and return |
| // false. |
| bool TryInsertLinebreakSemicolon(); |
| |
| // Returns true if an open brace at this parse state begins an object |
| // literal, or false if it begins a block. |
| static bool CanPreceedObjectLiteral(ParseState state); |
| |
| const JsTokenizerPatterns* patterns_; |
| std::vector<ParseState> parse_stack_; |
| std::deque<std::pair<JsKeywords::Type, StringPiece> > lookahead_queue_; |
| StringPiece input_; // The portion of input that has yet to be consumed. |
| JsonStep json_step_; |
| bool start_of_line_; // No non-whitespace/comment tokens on this line yet. |
| bool error_; |
| |
| DISALLOW_COPY_AND_ASSIGN(JsTokenizer); |
| }; |
| |
| // Structure to store RE2 patterns that can be shared by instances of |
| // JsTokenizer. These patterns are slightly expensive to compile, so we'd |
| // rather not create one for every JsTokenizer instance, but unfortunately C++ |
| // static initializers can run in non-deterministic order and cause other |
| // integration issues. Instead, you must create a JsTokenizerPatterns object |
| // yourself and pass it to the JsTokenizer constructor; ideally, you would just |
| // create one and share it for all JsTokenizer instances. |
| struct JsTokenizerPatterns { |
| public: |
| JsTokenizerPatterns(); |
| ~JsTokenizerPatterns(); |
| |
| const RE2 identifier_pattern; |
| const RE2 line_comment_pattern; |
| const RE2 numeric_literal_pattern; |
| const RE2 operator_pattern; |
| const RE2 regex_literal_pattern; |
| const RE2 string_literal_pattern; |
| const RE2 whitespace_pattern; |
| const RE2 line_continuation_pattern; |
| |
| private: |
| DISALLOW_COPY_AND_ASSIGN(JsTokenizerPatterns); |
| }; |
| |
| } // namespace js |
| |
| } // namespace pagespeed |
| |
| #endif // PAGESPEED_KERNEL_JS_JS_TOKENIZER_H_ |