/*
 * Copyright 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
// Author: (Joshua Marantz)
#include <vector>
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/printf_format.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/html/doctype.h"
#include "pagespeed/kernel/html/html_element.h"
#include "pagespeed/kernel/html/html_name.h"
#include "pagespeed/kernel/http/content_type.h"
namespace net_instaweb {
// Forward declaration: this header only refers to HtmlParse through pointers,
// so the full class definition is not needed here.
class HtmlParse;
// Constructs a re-entrant HTML lexer. This lexer minimally parses tags,
// attributes, and comments. It is intended to parse the Wild West of the
// Web. It's designed to be tolerant of syntactic transgressions, merely
// passing through unparseable chunks as Characters.
// TODO(jmarantz): refactor this with html_parse, so that this class owns
// the symbol table and the event queue, and no longer needs to mutually
// depend on HtmlParse. That will make it easier to unit-test.
class HtmlLexer {
explicit HtmlLexer(HtmlParse* html_parse);
// Initialize a new parse session, id is only used for error messages.
void StartParse(const StringPiece& id, const ContentType& content_type);
// Parse a chunk of text, adding events to the parser by calling
// html_parse_->AddEvent(...).
void Parse(const char* text, int size);
// Completes parse, reporting any leftover text as a final HtmlCharacterEvent.
void FinishParse();
// Determines whether a tag should be terminated in HTML.
bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const;
// Determines whether a tag should be interpreted as a 'literal'
// tag. That is, a tag whose contents are not parsed until a
// corresponding matching end tag is encountered.
static bool IsLiteralTag(HtmlName::Keyword keyword);
// Determines whether a tag is interpreted as a 'literal' tag in
// some user agents. Since some user agents will interpret the
// contents of these tags, our lexer never treats them as literal
// tags.
static bool IsSometimesLiteralTag(HtmlName::Keyword keyword);
// Determines whether a tag can be terminated briefly (e.g. <tag/>)
bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const;
// Determines whether it's OK to leave a tag unclosed.
bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const;
// Print element stack to stdout (for debugging).
void DebugPrintStack();
// Returns the current lowest-level parent element in the element stack, or
// NULL if the stack is empty.
HtmlElement* Parent() const;
// Return the current assumed doctype of the document (based on the content
// type and any HTML directives encountered so far).
const DocType& doctype() const { return doctype_; }
// Sets the limit on the maximum number of bytes that should be parsed.
void set_size_limit(int64 x) { size_limit_ = x; }
// Indicates whether we have exceeded the limit on the maximum number of bytes
// that we should parse.
bool size_limit_exceeded() const { return size_limit_exceeded_; }
// Most of these routines expect c to be the last character of literal_
inline void EvalStart(char c);
inline void EvalTag(char c);
inline void EvalTagOpen(char c);
inline void EvalTagCloseNoName(char c);
inline void EvalTagClose(char c);
inline void EvalTagBriefClose(char c);
inline void EvalCommentStart1(char c);
inline void EvalCommentStart2(char c);
inline void EvalCommentBody(char c);
inline void EvalCommentEnd1(char c);
inline void EvalCommentEnd2(char c);
inline void EvalCdataStart1(char c);
inline void EvalCdataStart2(char c);
inline void EvalCdataStart3(char c);
inline void EvalCdataStart4(char c);
inline void EvalCdataStart5(char c);
inline void EvalCdataStart6(char c);
inline void EvalCdataBody(char c);
inline void EvalCdataEnd1(char c);
inline void EvalCdataEnd2(char c);
inline void EvalAttribute(char c);
inline void EvalAttrName(char c);
inline void EvalAttrNameSpace(char c);
inline void EvalAttrEq(char c);
inline void EvalAttrVal(char c);
inline void EvalAttrValSq(char c);
inline void EvalAttrValDq(char c);
inline void EvalLiteralTag(char c);
inline void EvalScriptTag(char c);
inline void EvalDirective(char c);
inline void EvalBogusComment(char c);
// Makes an element based on token_, which will be parsed as the tag
// name.
void MakeElement();
void MakeAttribute(bool has_value);
void FinishAttribute(char c, bool has_value, bool brief_close);
void EmitCdata();
void EmitComment();
void EmitLiteral();
void EmitTagOpen(bool allow_implicit_close); // expects element_ != NULL.
void EmitTagClose(HtmlElement::Style style);
void EmitTagBriefClose();
void EmitDirective();
void Restart(char c);
// Emits a syntax error message.
void SyntaxError(const char* format, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
// Tries to find a HTML element on the stack matching a tag. If it
// finds it, it pops all the intervening elements off the stack,
// issuing warnings for each discarded tag, the matching element is
// also popped off the stack, and returned.
// If the tag is not matched, then no mutations are done to the stack,
// and NULL is returned.
// The tag name should be interned.
// TODO(jmarantz): use type system
HtmlElement* PopElementMatchingTag(const StringPiece& tag);
HtmlElement* PopElement();
void CloseElement(HtmlElement* element, HtmlElement::Style style);
// Minimal i18n analysis. With utf-8 and gb2312 we can do this
// context-free, and thus the method can be static. If we add
// more encodings we may need to turn this into a non-static method.
static inline bool IsI18nChar(char c) {return (((c) & 0x80) != 0); }
// Determines whether a character can be used in a tag name as first char ...
static inline bool IsLegalTagFirstChar(char c);
// ... or subsequent char.
static inline bool IsLegalTagChar(char c);
// Determines whether a character can be used in an attribute name.
static inline bool IsLegalAttrNameChar(char c);
// The lexer is implemented as a pure state machine. There is
// no lookahead. The state is understood primarily in this
// enum, although there are a few state flavors that are managed
// by the other member variables, notably: has_attr_value_ and
// attr_name_.empty(). Those could be eliminated by adding
// a few more explicit states.
enum State {
TAG, // "<"
TAG_CLOSE, // "</x"
TAG_OPEN, // "<x"
TAG_BRIEF_CLOSE, // "<x/" or "<x /" or "<x y/" etc
COMMENT_START2, // "<!-"
COMMENT_BODY, // "<!--"
COMMENT_END1, // "-"
COMMENT_END2, // "--"
CDATA_START1, // "<!["
CDATA_START2, // "<![C"
CDATA_START3, // "<![CD"
CDATA_END1, // "]"
CDATA_END2, // "]]"
TAG_ATTR_NAME, // "<x y"
TAG_ATTR_EQ, // "<x y="
TAG_ATTR_VAL, // "<x y=x" value terminated by whitespace or >
TAG_ATTR_VALDQ, // '<x y="' value terminated by double-quote
TAG_ATTR_VALSQ, // "<x y='" value terminated by single-quote
LITERAL_TAG, // "<style " or "<iframe ", etc.
SCRIPT_TAG, // "<script "
DIRECTIVE, // "<!x"
BOGUS_COMMENT, // "<?foo>" or "</?foo>"
HtmlParse* html_parse_;
State state_;
GoogleString token_; // accumulates tag names and comments
GoogleString literal_; // accumulates raw text to pass through
GoogleString attr_name_; // accumulates attribute name
GoogleString attr_value_; // accumulates attribute value
HtmlElement::QuoteStyle attr_quote_; // quote used to delimit attribute
bool has_attr_value_; // distinguishes <a n=> from <a n>
HtmlElement* element_; // current element; used to collect attributes
int line_;
int tag_start_line_; // line at which we last transitioned to TAG state
GoogleString id_;
GoogleString literal_close_; // specific tag go close, e.g </script>
bool script_html_comment_; // inside <script> <!--
bool script_html_comment_script_; // inside <script> <!-- <script>
// in some cases we have to drop what looks like attributes on a closing
// tag as part of error recovery.
bool discard_until_start_state_for_error_recovery_;
ContentType content_type_;
DocType doctype_;
std::vector<HtmlElement*> element_stack_;
// Indicates that we have exceeded the enforced size limit on the maximum
// number of input HTML that we can parse.
bool size_limit_exceeded_;
// Whether we should skip parsing of all subsequent bytes. HtmlParse calls
// this once it has started or ended an HtmlElement.
bool skip_parsing_;
int64 num_bytes_parsed_;
int64 size_limit_;
} // namespace net_instaweb