| /* |
| * Copyright 2010 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // Author: jmarantz@google.com (Joshua Marantz) |
| |
| #ifndef PAGESPEED_KERNEL_HTML_HTML_LEXER_H_ |
| #define PAGESPEED_KERNEL_HTML_HTML_LEXER_H_ |
| |
| #include <vector> |
| |
| #include "pagespeed/kernel/base/basictypes.h" |
| #include "pagespeed/kernel/base/printf_format.h" |
| #include "pagespeed/kernel/base/string.h" |
| #include "pagespeed/kernel/base/string_util.h" |
| #include "pagespeed/kernel/html/doctype.h" |
| #include "pagespeed/kernel/html/html_element.h" |
| #include "pagespeed/kernel/html/html_name.h" |
| #include "pagespeed/kernel/http/content_type.h" |
| |
| namespace net_instaweb { |
| |
| class HtmlParse; |
| |
| // Constructs a re-entrant HTML lexer. This lexer minimally parses tags, |
| // attributes, and comments. It is intended to parse the Wild West of the |
| // Web. It's designed to be tolerant of syntactic transgressions, merely |
| // passing through unparseable chunks as Characters. |
| // |
| // TODO(jmarantz): refactor this with html_parse, so that this class owns |
| // the symbol table and the event queue, and no longer needs to mutually |
| // depend on HtmlParse. That will make it easier to unit-test. |
| class HtmlLexer { |
| public: |
| explicit HtmlLexer(HtmlParse* html_parse); |
| ~HtmlLexer(); |
| |
| // Initializes a new parse session; id is only used for error messages. |
| void StartParse(const StringPiece& id, const ContentType& content_type); |
| |
| // Parses a chunk of text, adding events to the parser by calling |
| // html_parse_->AddEvent(...). |
| void Parse(const char* text, int size); |
| |
| // Completes parse, reporting any leftover text as a final HtmlCharacterEvent. |
| void FinishParse(); |
| |
| // Determines whether a tag should be terminated in HTML. |
| bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const; |
| |
| // Determines whether a tag should be interpreted as a 'literal' |
| // tag. That is, a tag whose contents are not parsed until a |
| // corresponding matching end tag is encountered. |
| static bool IsLiteralTag(HtmlName::Keyword keyword); |
| |
| // Determines whether a tag is interpreted as a 'literal' tag in |
| // some user agents. Since some user agents will interpret the |
| // contents of these tags, our lexer never treats them as literal |
| // tags. |
| static bool IsSometimesLiteralTag(HtmlName::Keyword keyword); |
| |
| // Determines whether a tag can be terminated briefly (e.g. <tag/>). |
| bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const; |
| |
| // Determines whether it's OK to leave a tag unclosed. |
| bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const; |
| |
| // Prints the element stack to stdout (for debugging). |
| void DebugPrintStack(); |
| |
| // Returns the current lowest-level parent element in the element stack, or |
| // NULL if the stack is empty. |
| HtmlElement* Parent() const; |
| |
| // Returns the current assumed doctype of the document (based on the content |
| // type and any HTML directives encountered so far). |
| const DocType& doctype() const { return doctype_; } |
| |
| // Sets the limit on the maximum number of bytes that should be parsed. |
| void set_size_limit(int64 x) { size_limit_ = x; } |
| |
| // Indicates whether we have exceeded the limit on the maximum number of bytes |
| // that we should parse. |
| bool size_limit_exceeded() const { return size_limit_exceeded_; } |
| |
| private: |
| // Most of these routines expect c to be the last character of literal_. |
| inline void EvalStart(char c); |
| inline void EvalTag(char c); |
| inline void EvalTagOpen(char c); |
| inline void EvalTagCloseNoName(char c); |
| inline void EvalTagClose(char c); |
| inline void EvalTagBriefClose(char c); |
| inline void EvalCommentStart1(char c); |
| inline void EvalCommentStart2(char c); |
| inline void EvalCommentBody(char c); |
| inline void EvalCommentEnd1(char c); |
| inline void EvalCommentEnd2(char c); |
| inline void EvalCdataStart1(char c); |
| inline void EvalCdataStart2(char c); |
| inline void EvalCdataStart3(char c); |
| inline void EvalCdataStart4(char c); |
| inline void EvalCdataStart5(char c); |
| inline void EvalCdataStart6(char c); |
| inline void EvalCdataBody(char c); |
| inline void EvalCdataEnd1(char c); |
| inline void EvalCdataEnd2(char c); |
| inline void EvalAttribute(char c); |
| inline void EvalAttrName(char c); |
| inline void EvalAttrNameSpace(char c); |
| inline void EvalAttrEq(char c); |
| inline void EvalAttrVal(char c); |
| inline void EvalAttrValSq(char c); |
| inline void EvalAttrValDq(char c); |
| inline void EvalLiteralTag(char c); |
| inline void EvalScriptTag(char c); |
| inline void EvalDirective(char c); |
| inline void EvalBogusComment(char c); |
| |
| // Makes an element based on token_, which will be parsed as the tag |
| // name. |
| void MakeElement(); |
| |
| void MakeAttribute(bool has_value); |
| void FinishAttribute(char c, bool has_value, bool brief_close); |
| |
| void EmitCdata(); |
| void EmitComment(); |
| void EmitLiteral(); |
| void EmitTagOpen(bool allow_implicit_close); // expects element_ != NULL. |
| void EmitTagClose(HtmlElement::Style style); |
| void EmitTagBriefClose(); |
| void EmitDirective(); |
| void Restart(char c); |
| |
| // Emits a syntax error message built from a printf-style format string. |
| void SyntaxError(const char* format, ...) INSTAWEB_PRINTF_FORMAT(2, 3); |
| |
| // Tries to find an HTML element on the stack matching a tag. If it |
| // finds it, it pops all the intervening elements off the stack, |
| // issuing warnings for each discarded tag; the matching element is |
| // also popped off the stack and returned. |
| // |
| // If the tag is not matched, then no mutations are done to the stack, |
| // and NULL is returned. |
| // |
| // The tag name should be interned. |
| // TODO(jmarantz): use type system |
| HtmlElement* PopElementMatchingTag(const StringPiece& tag); |
| |
| HtmlElement* PopElement(); |
| void CloseElement(HtmlElement* element, HtmlElement::Style style); |
| |
| // Minimal i18n analysis: true iff the high bit (0x80) of c is set. With utf-8 |
| // and gb2312 we can do this context-free, and thus the method can be static. |
| // If we add more encodings we may need to turn this into a non-static method. |
| static inline bool IsI18nChar(char c) {return (((c) & 0x80) != 0); } |
| |
| // Determines whether a character can be used in a tag name as first char ... |
| static inline bool IsLegalTagFirstChar(char c); |
| // ... or subsequent char. |
| static inline bool IsLegalTagChar(char c); |
| |
| // Determines whether a character can be used in an attribute name. |
| static inline bool IsLegalAttrNameChar(char c); |
| |
| // The lexer is implemented as a pure state machine. There is |
| // no lookahead. The state is understood primarily in this |
| // enum, although there are a few state flavors that are managed |
| // by the other member variables, notably: has_attr_value_ and |
| // attr_name_.empty(). Those could be eliminated by adding |
| // a few more explicit states. |
| enum State { |
| START, |
| TAG, // "<" |
| TAG_CLOSE_NO_NAME, // "</" |
| TAG_CLOSE, // "</x" |
| TAG_CLOSE_TERMINATE, // "</x " |
| TAG_OPEN, // "<x" |
| TAG_BRIEF_CLOSE, // "<x/" or "<x /" or "<x y/" etc |
| COMMENT_START1, // "<!" |
| COMMENT_START2, // "<!-" |
| COMMENT_BODY, // "<!--" |
| COMMENT_END1, // "-" |
| COMMENT_END2, // "--" |
| CDATA_START1, // "<![" |
| CDATA_START2, // "<![C" |
| CDATA_START3, // "<![CD" |
| CDATA_START4, // "<![CDA" |
| CDATA_START5, // "<![CDAT" |
| CDATA_START6, // "<![CDATA" |
| CDATA_BODY, // "<![CDATA[" |
| CDATA_END1, // "]" |
| CDATA_END2, // "]]" |
| TAG_ATTRIBUTE, // "<x " |
| TAG_ATTR_NAME, // "<x y" |
| TAG_ATTR_NAME_SPACE, // "<x y " |
| TAG_ATTR_EQ, // "<x y=" |
| TAG_ATTR_VAL, // "<x y=x" value terminated by whitespace or > |
| TAG_ATTR_VALDQ, // '<x y="' value terminated by double-quote |
| TAG_ATTR_VALSQ, // "<x y='" value terminated by single-quote |
| LITERAL_TAG, // "<style " or "<iframe ", etc. |
| SCRIPT_TAG, // "<script " |
| DIRECTIVE, // "<!x" |
| BOGUS_COMMENT, // "<?foo>" or "</?foo>" |
| }; |
| |
| HtmlParse* html_parse_; |
| State state_; |
| GoogleString token_; // accumulates tag names and comments |
| GoogleString literal_; // accumulates raw text to pass through |
| GoogleString attr_name_; // accumulates attribute name |
| GoogleString attr_value_; // accumulates attribute value |
| HtmlElement::QuoteStyle attr_quote_; // quote style used to delimit the attribute value |
| bool has_attr_value_; // distinguishes <a n=> from <a n> |
| HtmlElement* element_; // current element; used to collect attributes |
| int line_; |
| int tag_start_line_; // line at which we last transitioned to TAG state |
| GoogleString id_; |
| GoogleString literal_close_; // specific tag to close, e.g. </script> |
| bool script_html_comment_; // inside <script> <!-- |
| bool script_html_comment_script_; // inside <script> <!-- <script> |
| // In some cases we have to drop what looks like attributes on a closing |
| // tag as part of error recovery. |
| bool discard_until_start_state_for_error_recovery_; |
| |
| ContentType content_type_; |
| DocType doctype_; |
| |
| std::vector<HtmlElement*> element_stack_; |
| |
| // Indicates that we have exceeded the enforced size limit on the maximum |
| // number of bytes of input HTML that we can parse. |
| bool size_limit_exceeded_; |
| // Whether we should skip parsing of all subsequent bytes. Presumably set |
| // once HtmlParse has started or ended an HtmlElement -- TODO confirm. |
| bool skip_parsing_; |
| int64 num_bytes_parsed_; |
| int64 size_limit_; |
| |
| DISALLOW_COPY_AND_ASSIGN(HtmlLexer); |
| }; |
| |
| } // namespace net_instaweb |
| |
| #endif // PAGESPEED_KERNEL_HTML_HTML_LEXER_H_ |