src/pagespeed/kernel/html/html_lexer.cc - incubator-pagespeed-debian - Git at Google

 /*
  * Copyright 2010 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Author: jmarantz@google.com (Joshua Marantz)

 #include "pagespeed/kernel/html/html_lexer.h"

 #include <algorithm>
 #include <cctype>
 #include <cstdarg>
 #include <cstddef>  // for size_t
 #include <cstdio>

 #include "base/logging.h"
 #include "pagespeed/kernel/base/message_handler.h"
 #include "pagespeed/kernel/base/string_util.h"
 #include "pagespeed/kernel/html/html_element.h"
 #include "pagespeed/kernel/html/html_event.h"
 #include "pagespeed/kernel/html/html_keywords.h"
 #include "pagespeed/kernel/html/html_name.h"
 #include "pagespeed/kernel/html/html_node.h"
 #include "pagespeed/kernel/html/html_parse.h"

 namespace net_instaweb {

 namespace {

 // TODO(jmarantz): consider making these sorted-lists be an enum field
 // in the table in html_name.gperf.  I'm not sure if that would make things
 // noticeably faster or not.

 // These tags can be specified in documents without a brief "/>",
 // or an explicit </tag>, according to the Chrome Developer Tools console.  See:
 //
 // http://www.whatwg.org/specs/web-apps/current-work/multipage/
 // syntax.html#void-elements
 //
 // Must stay in strictly ascending keyword order: debug builds DCHECK this in
 // the HtmlLexer constructor, and IsInSet() relies on sorted input for its
 // binary search.
 const HtmlName::Keyword kImplicitlyClosedHtmlTags[] = {
   HtmlName::kXml,
   HtmlName::kArea,
   HtmlName::kBase,
   HtmlName::kBr,
   HtmlName::kCol,
   HtmlName::kEmbed,
   HtmlName::kHr,
   HtmlName::kImg,
   HtmlName::kInput,
   HtmlName::kKeygen,
   HtmlName::kLink,
   HtmlName::kMeta,
   HtmlName::kParam,
   HtmlName::kSource,
   HtmlName::kTrack,
   HtmlName::kWbr,
 };

 // These tags cannot be closed using the brief syntax; they must
 // be closed by using an explicit </TAG>.
 //
 // Must stay in strictly ascending keyword order: debug builds DCHECK this in
 // the HtmlLexer constructor, and IsInSet() relies on sorted input for its
 // binary search.
 const HtmlName::Keyword kNonBriefTerminatedTags[] = {
   HtmlName::kA,
   HtmlName::kDiv,
   HtmlName::kHeader,  // TODO(jmaessen): All div-like tags?
   HtmlName::kIframe,
   HtmlName::kNav,
   HtmlName::kScript,
   HtmlName::kSpan,
   HtmlName::kStyle,
   HtmlName::kTextarea,
   HtmlName::kXmp,
 };

 // These tags cause the text inside them to be retained literally and not
 // interpreted. See
 // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#parsing-html-fragments
 // for more information.
 //
 // Note that we do not include noscript, noembed, or noframes tags
 // here. For noembed and noframes, HTML5 compatible user agents will
 // not parse their contents, but older user agents that don't support
 // embed/frames tags will still parse their contents. noscript content
 // is parsed conditionally depending on whether the client has
 // scripting enabled. Thus we need to parse the content within these
 // tags as HTML, since some user agents will parse their contents as
 // HTML. These tags are included in kSometimesLiteralTags below.
 //
 // In addition, we do not include the 'plaintext' tag in kLiteralTags,
 // since it works slightly differently from the other literal
 // tags. plaintext indicates that *all* text that follows, up to end
 // of document, should be interpreted as plain text. There is no
 // closing plaintext tag. Thus, if we want to support plaintext, we
 // need to handle it differently from the kLiteralTags.
 //
 // Must stay in strictly ascending keyword order: debug builds DCHECK this in
 // the HtmlLexer constructor, and IsInSet() relies on sorted input for its
 // binary search.
 const HtmlName::Keyword kLiteralTags[] = {
   HtmlName::kIframe,
   HtmlName::kScript,
   HtmlName::kStyle,
   HtmlName::kTextarea,
   HtmlName::kTitle,
   HtmlName::kXmp,
 };

 // These tags cause the text inside them to be retained literally and
 // not interpreted in some user agents. Since some user agents will
 // interpret the contents of these tags, our lexer never treats them
 // as literal tags. However, a filter that wants to insert new tags
 // that should be processed by all user agents should not insert those
 // elements into one of these tags.
 //
 // Must stay in strictly ascending keyword order: debug builds DCHECK this in
 // the HtmlLexer constructor, and IsInSet() relies on sorted input for its
 // binary search.
 const HtmlName::Keyword kSometimesLiteralTags[] = {
   HtmlName::kNoembed,
   HtmlName::kNoframes,
   HtmlName::kNoscript,
 };

 // We start our stack-iterations from 1, because we put a NULL into
 // position 0 to reduce special-cases.  (StartParse pushes that NULL
 // sentinel onto element_stack_ right after clearing it.)
 const int kStartStack = 1;

 #ifndef NDEBUG
 // Debug-only sanity check: DCHECKs that each of the 'num' entries in
 // 'keywords' is strictly greater than its predecessor, i.e. that the table
 // is sorted ascending with no duplicates.
 void CheckKeywordSetOrdering(const HtmlName::Keyword* keywords, int num) {
   int i = 1;
   while (i < num) {
     DCHECK_GT(keywords[i], keywords[i - 1]);
     ++i;
   }
 }

 // Runs the ordering check over a static array, supplying its element count.
 #define CHECK_KEYWORD_SET_ORDERING(keywords) \
     CheckKeywordSetOrdering(keywords, arraysize(keywords))
 #endif

 // Returns true iff 'keyword' occurs among the first 'num' entries of
 // 'keywords'.  The lookup is a binary search, so the entries must already
 // be sorted in ascending order.
 bool IsInSet(const HtmlName::Keyword* keywords, int num,
              HtmlName::Keyword keyword) {
   return std::binary_search(keywords, keywords + num, keyword);
 }

 // Looks 'keyword' up in a static array, supplying the array's element count.
 #define IS_IN_SET(keywords, keyword) \
     IsInSet(keywords, arraysize(keywords), keyword)

 }  // namespace

 // TODO(jmarantz): support multi-byte encodings
 // TODO(jmarantz): emit close-tags immediately for selected html tags,
 //   rather than waiting for the next explicit close-tag to force a rebalance.
 //   See http://www.whatwg.org/specs/web-apps/current-work/multipage/
 //   syntax.html#optional-tags

 // Constructs a lexer that creates nodes and events through 'html_parse'.
 // Scalar state is put into the same configuration that StartParse()
 // establishes.  size_limit_ starts at -1; Parse() only enforces limits that
 // are greater than zero.
 //
 // In debug builds the constructor also verifies that the static keyword
 // tables above are sorted, since lookups into them use binary search.
 HtmlLexer::HtmlLexer(HtmlParse* html_parse)
     : html_parse_(html_parse),
       state_(START),
       attr_quote_(HtmlElement::NO_QUOTE),
       has_attr_value_(false),
       element_(NULL),
       line_(1),
       tag_start_line_(-1),
       script_html_comment_(false),
       script_html_comment_script_(false),
       discard_until_start_state_for_error_recovery_(false),
       size_limit_exceeded_(false),
       skip_parsing_(false),
       size_limit_(-1) {
   // Parse() accumulates into num_bytes_parsed_ with "+=", so give it a
   // defined starting value even if StartParse() has not been called yet.
   // This is assigned here rather than in the initializer list so that it
   // does not depend on the member declaration order in the header.
   num_bytes_parsed_ = 0;
 #ifndef NDEBUG
   CHECK_KEYWORD_SET_ORDERING(kImplicitlyClosedHtmlTags);
   CHECK_KEYWORD_SET_ORDERING(kNonBriefTerminatedTags);
   CHECK_KEYWORD_SET_ORDERING(kLiteralTags);
   CHECK_KEYWORD_SET_ORDERING(kSometimesLiteralTags);
 #endif
 }

 // The destructor body is intentionally empty; nothing is released here
 // explicitly.
 HtmlLexer::~HtmlLexer() {}

 // Handles a character while in (or falling back to) the START state.  A '<'
 // flushes the pending text as a characters event and begins a new tag; any
 // other character simply leaves the lexer in START.
 void HtmlLexer::EvalStart(char c) {
   if (c != '<') {
     state_ = START;
     return;
   }

   // literal_ is expected to already end with this '<'.  Drop it so it is
   // not emitted with the preceding text, then re-add it as the first byte
   // of the new tag's raw text.
   literal_.resize(literal_.size() - 1);
   EmitLiteral();
   literal_ += c;

   state_ = TAG;
   discard_until_start_state_for_error_recovery_ = false;
   tag_start_line_ = line_;
 }

 // Browsers only allow letters for first char in tag name --- see
 // HTML5 "Tag open state"
 // TODO(morlovich): Use an ASCII method rather than isalpha
 //
 // Returns true iff 'c' may begin a tag name.
 bool HtmlLexer::IsLegalTagFirstChar(char c) {
   // isalpha() is only defined for values representable as unsigned char (or
   // EOF); passing a negative plain char is undefined behavior, so convert
   // before calling it.
   // The "!= 0" is required by MSVC 10.0: warning C4800 :-(
   return (isalpha(static_cast<unsigned char>(c)) != 0);
 }

 // ... and letters, digits, unicode and some symbols for subsequent chars.
 // Based on a test of Firefox and Chrome.
 //
 // TODO(jmarantz): revisit these predicates based on
 // http://www.w3.org/TR/REC-xml/#NT-NameChar .  This
 // XML spec may or may not inform of us of what we need to do
 // to parse all HTML on the web.
 // TODO(morlovich): It's completely bogus for HTML.
 //
 // Returns true iff 'c' may appear in a tag name after the first character.
 bool HtmlLexer::IsLegalTagChar(char c) {
   // isalnum() is only defined for values representable as unsigned char (or
   // EOF).  Convert explicitly so this stays well-defined for negative plain
   // chars regardless of which values IsI18nChar() accepts.
   return (IsI18nChar(c) ||
           (isalnum(static_cast<unsigned char>(c)) || (c == '<') ||
            (c == '-') || (c == '#') || (c == '_') || (c == ':')));
 }

 // TODO(morlovich): This is even more bogus, since it's true for
 // anything that's not =, >, / or whitespace.
 //
 // Returns true iff 'c' is accepted as an attribute-name character: either
 // IsI18nChar() accepts it, or it is not one of '=', '>', '/' and is not
 // HTML whitespace.
 bool HtmlLexer::IsLegalAttrNameChar(char c) {
   if (IsI18nChar(c)) {
     return true;
   }
   switch (c) {
     case '=':
     case '>':
     case '/':
       return false;
     default:
       return !IsHtmlSpace(c);
   }
 }

 // Dispatches on the character that follows a freshly seen "<".
 // HTML5 spec state name: Tag open state
 //
 //   "</"          -> wait for a close-tag name
 //   "<" + letter  -> begin accumulating an open-tag name in token_
 //   "<!"          -> comment, CDATA or directive
 //   "<?"          -> bogus comment
 //   anything else -> syntax error; the bytes pass through as raw text
 void HtmlLexer::EvalTag(char c) {
   if (c == '/') {
     state_ = TAG_CLOSE_NO_NAME;
     return;
   }
   if (IsLegalTagFirstChar(c)) {   // "<x"
     state_ = TAG_OPEN;
     discard_until_start_state_for_error_recovery_ = false;
     token_ += c;
     return;
   }
   if (c == '!') {
     state_ = COMMENT_START1;
     return;
   }
   if (c == '?') {
     state_ = BOGUS_COMMENT;
     return;
   }

   // Illegal tag syntax; just pass it through as raw characters.
   SyntaxError("Invalid tag syntax: unexpected sequence `<%c'", c);
   EvalStart(c);
 }

 // Handles each character after "<x" while the tag name is being read.
 // Legal tag characters extend token_; '>' completes an attribute-less open
 // tag; '/' and whitespace hand off to the brief-close and attribute states.
 // Anything else is reported as a syntax error and the partial name dropped.
 void HtmlLexer::EvalTagOpen(char c) {
   if (IsLegalTagChar(c)) {
     token_ += c;
     return;
   }
   if (c == '>') {
     MakeElement();
     EmitTagOpen(true);
     return;
   }
   if (c == '/') {
     state_ = TAG_BRIEF_CLOSE;
     return;
   }
   if (IsHtmlSpace(c)) {
     state_ = TAG_ATTRIBUTE;
     return;
   }

   // Some other punctuation.  Not sure what to do.  Let's run this
   // on the web and see what breaks & decide what to do.  E.g. "<x&"
   SyntaxError("Invalid character `%c` while parsing tag `%s'",
               c, token_.c_str());
   token_.clear();
   state_ = START;
 }

 // Handles the character following a "/" seen inside a tag, e.g. after
 // "<x/", "<x /", "<x foo/" or "<x foo /".  This state is only reached
 // before an attribute's '=', never while an attribute value is being read
 // (hence the DCHECK on has_attr_value_).
 // HTML5 spec state name:  Self-closing start tag state.
 void HtmlLexer::EvalTagBriefClose(char c) {
   DCHECK(!has_attr_value_);

   if (c != '>') {
     // The slash did not end the tag.  Finish any attribute name that was
     // pending, then re-evaluate this character as the start of a new
     // attribute.
     if (!attr_name_.empty()) {
       MakeAttribute(has_attr_value_);
     }
     state_ = TAG_ATTRIBUTE;
     EvalAttribute(c);
     return;
   }

   // "/>": FinishAttribute is robust with attr_name_ being empty, which
   // happens if we just have <foo/>; we might need to actually create the
   // element itself, though.
   if (!discard_until_start_state_for_error_recovery_) {
     MakeElement();
   }
   FinishAttribute(c, has_attr_value_, true /* self-closing*/);
 }

 // Handles the first character after "</".
 // HTML5 spec state name: End tag open state
 void HtmlLexer::EvalTagCloseNoName(char c) {
   if (IsLegalTagChar(c)) {
     // Start of the close-tag's name.
     token_ += c;
     state_ = TAG_CLOSE;
     return;
   }
   if (c == '>') {
     SyntaxError("Invalid tag syntax: </>");
     token_.clear();
     EvalStart(c);
     return;
   }
   // Anything else after </ is handled as bogus comment.
   state_ = BOGUS_COMMENT;
 }

 // Handles characters after "</a".  Also used once whitespace has followed
 // the name ("</a "), in which case state_ is TAG_CLOSE_TERMINATE and no
 // further name characters are accepted, so that "</a b>" is reported as an
 // error.
 void HtmlLexer::EvalTagClose(char c) {
   if ((state_ != TAG_CLOSE_TERMINATE) && IsLegalTagChar(c)) {  // "</x"
     token_ += c;
     return;
   }
   if (IsHtmlSpace(c)) {
     // Whitespace before any name character (e.g. "</ a>") is ignored while
     // we wait for the tag-name to begin.  Whitespace after a name ("</a ")
     // means only more whitespace or a '>' may follow.
     if (!token_.empty()) {
       state_ = TAG_CLOSE_TERMINATE;
     }
     return;
   }
   if (c == '>') {
     EmitTagClose(HtmlElement::EXPLICIT_CLOSE);
     return;
   }

   SyntaxError("Invalid tag syntax: expected `>' after `</%s' got `%c'",
               token_.c_str(), c);
   token_.clear();
   EvalStart(c);
 }

 // Handles characters of a directive ("<!x...", where x is not the start of
 // a comment or CDATA section).  Everything up to the closing '>' is
 // accumulated in token_, and the '>' emits the directive.
 void HtmlLexer::EvalDirective(char c) {
   if (c != '>') {
     token_ += c;
     return;
   }
   EmitDirective();
 }

 // HTML5 treats things like <?foo> and </?foo> as a special kind of messed-up
 // comment terminated by '>'.  We do likewise, but pass the bytes along: once
 // the '>' arrives, the accumulated raw text is emitted as a literal.
 // HTML5 state name: Bogus comment state
 void HtmlLexer::EvalBogusComment(char c) {
   if (c != '>') {
     return;  // Keep consuming until the terminator shows up.
   }
   EmitLiteral();
   state_ = START;
 }

 // After a partial match of a multi-character lexical sequence, a mismatched
 // character needs to be temporarily removed from the retained literal_
 // before that literal is emitted.  It is then re-inserted so that EvalStart
 // can attempt to re-evaluate this character as potentially starting a new
 // lexical token.
 //
 // Precondition (CHECKed): literal_ is non-empty and its last byte is 'c'.
 void HtmlLexer::Restart(char c) {
   CHECK_LE(1U, literal_.size());
   CHECK_EQ(c, literal_[literal_.size() - 1]);
   // Emit everything before 'c' as raw characters...
   literal_.resize(literal_.size() - 1);
   EmitLiteral();
   // ...then put 'c' back and let the START-state logic look at it afresh.
   literal_ += c;
   EvalStart(c);
 }

 // Dispatches on the character following "<!":
 //   '-'  -> possibly the start of "<!--"
 //   '['  -> possibly the start of "<![CDATA["
 //   a legal tag character other than '<' -> a directive such as
 //           "<!DOCTYPE ... >"
 //   anything else -> syntax error; restart lexing at this character
 void HtmlLexer::EvalCommentStart1(char c) {
   switch (c) {
     case '-':
       state_ = COMMENT_START2;
       return;
     case '[':
       state_ = CDATA_START1;
       return;
     default:
       break;
   }

   if (IsLegalTagChar(c) && (c != '<')) {  // "<!DOCTYPE ... >"
     state_ = DIRECTIVE;
     EvalDirective(c);
     return;
   }

   SyntaxError("Invalid comment syntax");
   Restart(c);
 }

 // Handles the character following "<!-".  A second '-' opens the comment
 // body; anything else is a syntax error and lexing restarts at 'c'.
 void HtmlLexer::EvalCommentStart2(char c) {
   if (c != '-') {
     SyntaxError("Invalid comment syntax");
     Restart(c);
     return;
   }
   state_ = COMMENT_BODY;
 }

 // Handles characters inside a comment, i.e. after "<!--".  Text is
 // accumulated in token_ until a '-' is seen, which might begin the "-->"
 // terminator.  The later states return here if it turns out not to.
 void HtmlLexer::EvalCommentBody(char c) {
   if (c != '-') {
     token_ += c;
     return;
   }
   state_ = COMMENT_END1;
 }

 // Handles the character after a single '-' inside a comment.  A second '-'
 // advances toward the terminator; otherwise the dash was ordinary comment
 // text and we resume the comment body.
 void HtmlLexer::EvalCommentEnd1(char c) {
   if (c == '-') {
     state_ = COMMENT_END2;
     return;
   }

   // False alarm: the '-' we held back belongs to the comment, as does 'c'.
   token_ += '-';
   token_ += c;
   state_ = COMMENT_BODY;
 }

 // Handles the character after "--" inside a comment.
 void HtmlLexer::EvalCommentEnd2(char c) {
   switch (c) {
     case '>':
       // "-->" terminates the comment.
       EmitComment();
       state_ = START;
       break;
     case '-':
       // There could be an arbitrarily long stream of dashes before
       // we see the >.  Record this one as comment text and keep looking.
       token_ += "-";
       break;
     default:
       // False alarm: both held-back dashes were just part of the comment,
       // and so is 'c'.
       token_ += "--";
       token_ += c;
       state_ = COMMENT_BODY;
       break;
   }
 }

 // Handles the character following "<![".  Only 'C' continues toward
 // "<![CDATA["; anything else is a syntax error and lexing restarts at 'c'.
 void HtmlLexer::EvalCdataStart1(char c) {
   // TODO(mdsteele): What about IE downlevel-revealed conditional comments?
   //   Those look like e.g. <![if foo]> and <![endif]>.  This will treat those
   //   as syntax errors and emit them verbatim (which is usually harmless), but
   //   ideally we'd identify them as HtmlIEDirectiveEvents.
   //   See http://msdn.microsoft.com/en-us/library/ms537512(VS.85).aspx
   if (c != 'C') {
     SyntaxError("Invalid CDATA syntax");
     Restart(c);
     return;
   }
   state_ = CDATA_START2;
 }

 // Handles the character following "<![C".  'D' continues the CDATA opener;
 // anything else is a syntax error and lexing restarts at 'c'.
 void HtmlLexer::EvalCdataStart2(char c) {
   if (c != 'D') {
     SyntaxError("Invalid CDATA syntax");
     Restart(c);
     return;
   }
   state_ = CDATA_START3;
 }

 // Handles the character following "<![CD".  'A' continues the CDATA opener;
 // anything else is a syntax error and lexing restarts at 'c'.
 void HtmlLexer::EvalCdataStart3(char c) {
   if (c != 'A') {
     SyntaxError("Invalid CDATA syntax");
     Restart(c);
     return;
   }
   state_ = CDATA_START4;
 }

 // Handles the character following "<![CDA".  'T' continues the CDATA
 // opener; anything else is a syntax error and lexing restarts at 'c'.
 void HtmlLexer::EvalCdataStart4(char c) {
   if (c != 'T') {
     SyntaxError("Invalid CDATA syntax");
     Restart(c);
     return;
   }
   state_ = CDATA_START5;
 }

 // Handles the character following "<![CDAT".  'A' continues the CDATA
 // opener; anything else is a syntax error and lexing restarts at 'c'.
 void HtmlLexer::EvalCdataStart5(char c) {
   if (c != 'A') {
     SyntaxError("Invalid CDATA syntax");
     Restart(c);
     return;
   }
   state_ = CDATA_START6;
 }

 // Handles the character following "<![CDATA".  '[' completes the opener and
 // enters the CDATA body; anything else is a syntax error and lexing
 // restarts at 'c'.
 void HtmlLexer::EvalCdataStart6(char c) {
   if (c != '[') {
     SyntaxError("Invalid CDATA syntax");
     Restart(c);
     return;
   }
   state_ = CDATA_BODY;
 }

 // Handles characters inside a CDATA section, i.e. after "<![CDATA[".  Text
 // is accumulated in token_ until a ']' is seen, which might begin the "]]>"
 // terminator.  The later states return here if it turns out not to.
 void HtmlLexer::EvalCdataBody(char c) {
   if (c != ']') {
     token_ += c;
     return;
   }
   state_ = CDATA_END1;
 }

 // Handles the character after a single ']' inside a CDATA section.  A
 // second ']' advances toward the terminator; otherwise the bracket was
 // ordinary CDATA text and we resume the body.
 void HtmlLexer::EvalCdataEnd1(char c) {
   if (c == ']') {
     state_ = CDATA_END2;
     return;
   }

   // False alarm: the ']' we held back belongs to the cdata, as does 'c'.
   token_ += ']';
   token_ += c;
   state_ = CDATA_BODY;
 }

 // Handles the character after "]]" inside a CDATA section.  '>' completes
 // the "]]>" terminator; any other character means both brackets were just
 // part of the cdata.
 void HtmlLexer::EvalCdataEnd2(char c) {
   if (c == '>') {
     EmitCdata();
     state_ = START;
     return;
   }

   // False alarm: restore the two held-back brackets along with 'c'.
   token_ += "]]";
   token_ += c;
   state_ = CDATA_BODY;
 }

 // Handles characters while inside a literal tag (style, iframe, ...).
 // Nothing in the body is interpreted; we only watch for the matching close
 // tag stored in literal_close_ (e.g. "</style>"), compared
 // case-insensitively against the tail of literal_.
 void HtmlLexer::EvalLiteralTag(char c) {
   // Look explicitly for </style, etc.> in the literal buffer.
   // TODO(jmarantz): check for whitespace in unexpected places.
   if (c != '>') {
     return;  // Only a '>' can complete the close tag.
   }

   // expecting "</x>" for tag x.
   html_parse_->message_handler()->Check(
       literal_close_.size() > 3, "literal_close_.size() <= 3");  // NOLINT

   // Offset in literal_ at which the close tag would have to begin.
   int literal_minus_close_size = literal_.size() - literal_close_.size();
   if (literal_minus_close_size < 0) {
     return;  // Not enough text buffered to contain the close tag.
   }
   if (!StringCaseEqual(literal_.c_str() + literal_minus_close_size,
                        literal_close_)) {
     return;  // Some other '>' inside the literal text.
   }

   // The literal actually starts after the "<style>", and we will
   // also let it finish before the close tag, so chop that off.
   literal_.resize(literal_minus_close_size);
   EmitLiteral();

   // Transform "</style>" into "style" to form close tag.
   token_.clear();
   token_.append(literal_close_.c_str() + 2, literal_close_.size() - 3);
   EmitTagClose(HtmlElement::EXPLICIT_CLOSE);
 }

 // Returns true if 'c', appearing right after "</script", should get us out
 // of either script parsing or one escaping level: that is, if it is tab,
 // CR, LF, form feed, space, '/' or '>'.
 static bool CanEndTag(char c) {
   switch (c) {
     case '\t':
     case '\r':
     case '\n':
     case '\f':
     case ' ':
     case '/':
     case '>':
       return true;
     default:
       return false;
   }
 }

 // Per-character handler for the SCRIPT_TAG state (text inside <script>).
 // The checks below inspect the tail of literal_ with 'c' as its last byte,
 // so 'c' is presumably appended to literal_ by the caller before this runs
 // -- TODO confirm against Parse().
 //
 // Two flags track the HTML5 script escaping levels:
 //   script_html_comment_        - set once "<!--" has been seen.
 //   script_html_comment_script_ - set when, inside such a comment, a
 //                                 "<script" followed by a tag-ending
 //                                 character has been seen.
 void HtmlLexer::EvalScriptTag(char c) {
   // We generally just buffer stuff into literal_ until we see </script ,
   // but there is a special case we need to worry about unlike for other
   // literal tags: a </script> wouldn't close us if we're both inside
   // what looks like an HTML comment and saw a <script opening before.
   // See http://wiki.whatwg.org/wiki/CDATA_Escapes and
   // http://lists.w3.org/Archives/Public/public-html/2009Aug/0452.html
   // for a bit of backstory.
   if (c == '-') {
     if (StringPiece(literal_).ends_with("<!--")) {
       script_html_comment_ = true;
     }
   }

   if (CanEndTag(c) && !literal_.empty()) {
     // Everything buffered so far except the trailing 'c'.
     StringPiece prev_fragment(literal_);
     prev_fragment.remove_suffix(1);
     if (StringCaseEndsWith(prev_fragment, "</script")) {
       if (script_html_comment_script_) {
         // Just close one escaping level, not <script>
         script_html_comment_script_ = false;
       } else {
         // Script actually closed, emit it.
         script_html_comment_ = false;
         script_html_comment_script_ = false;

         // Drop the '</script' + c from literal, and also save the form
         // of the '</script' for the close tag.  (The substr extracts the
         // six characters of "script" as written in the input, preserving
         // their case.)
         token_ = literal_.substr(
             literal_.size() - STATIC_STRLEN("</script") + 1,
             STATIC_STRLEN("script"));
         literal_.resize(literal_.size() - STATIC_STRLEN("</script") - 1);
         EmitLiteral();
         EmitTagClose(HtmlElement::EXPLICIT_CLOSE);

         // Now depending on the 'c' we may need to do some further
         // parsing to recover from errors.
         if (c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == ' ') {
           // Weirdly, we're supposed to parse attributes here (on a closing
           // tag!) and just throw them away.
           discard_until_start_state_for_error_recovery_ = true;
           state_ = TAG_ATTRIBUTE;
         } else if (c == '/') {
           discard_until_start_state_for_error_recovery_ = true;
           state_ = TAG_BRIEF_CLOSE;
         }
         // For c == '>' the state set by EmitTagClose (START) is kept.
       }
     } else if (script_html_comment_ &&
                StringCaseEndsWith(prev_fragment, "<script")) {
       // Inside a comment, what looks like a 'terminated' <script>
       // gets us into another level of escaping.
       script_html_comment_script_ = true;
     } else if (c == '>' && StringPiece(literal_).ends_with("-->")) {
       // --> exits both levels of escaping.
       script_html_comment_ = false;
       script_html_comment_script_ = false;
     }
   }
 }

 // Emits the raw, uninterpreted text buffered in literal_ (if any) as a
 // characters event attached to the current parent, clears the buffer, and
 // returns the lexer to the START state.
 void HtmlLexer::EmitLiteral() {
   if (literal_.empty()) {
     state_ = START;
     return;
   }

   html_parse_->AddEvent(
       new HtmlCharactersEvent(
           html_parse_->NewCharactersNode(Parent(), literal_),
           tag_start_line_));
   literal_.clear();
   state_ = START;
 }

 // Emits the comment text accumulated in token_, either as an IE directive
 // event (when it looks like an IE conditional comment) or as an ordinary
 // comment event.  Discards the raw bytes in literal_, clears token_, and
 // returns the lexer to the START state.
 void HtmlLexer::EmitComment() {
   literal_.clear();

   // The precise syntax of IE conditional comments (for example, exactly where
   // is whitespace tolerated?) doesn't seem to be specified anywhere, but my
   // brief experiments suggest that this heuristic is okay.  (mdsteele)
   // See http://en.wikipedia.org/wiki/Conditional_comment
   const bool looks_like_ie_directive =
       (token_.find("[if") != GoogleString::npos) ||
       (token_.find("[endif]") != GoogleString::npos);

   if (looks_like_ie_directive) {
     html_parse_->AddEvent(new HtmlIEDirectiveEvent(
         html_parse_->NewIEDirectiveNode(Parent(), token_), tag_start_line_));
   } else {
     html_parse_->AddEvent(new HtmlCommentEvent(
         html_parse_->NewCommentNode(Parent(), token_), tag_start_line_));
   }

   token_.clear();
   state_ = START;
 }

 // Emits the CDATA contents accumulated in token_ as a cdata event attached
 // to the current parent, stamped with the line on which the construct
 // began.  The raw bytes in literal_ are discarded, token_ is cleared, and
 // the lexer returns to the START state.
 void HtmlLexer::EmitCdata() {
   literal_.clear();
   html_parse_->AddEvent(new HtmlCdataEvent(
       html_parse_->NewCdataNode(Parent(), token_), tag_start_line_));
   token_.clear();
   state_ = START;
 }

 // Emits the open-tag event for element_ and pushes it on the element stack.
 //
 // If allow_implicit_close is true, and the element type is one which
 // does not require an explicit termination in HTML, then we will
 // automatically emit a matching 'element close' event.
 //
 // On return element_ is NULL and state_ is START, SCRIPT_TAG or
 // LITERAL_TAG (START again if the element was implicitly closed).
 void HtmlLexer::EmitTagOpen(bool allow_implicit_close) {
   // In error-recovery mode (e.g. attributes following "</script ") there is
   // no element to emit: drop the raw text and resume normal lexing.
   if (discard_until_start_state_for_error_recovery_) {
     state_ = START;
     literal_.clear();
     return;
   }

   DCHECK(element_ != NULL);
   DCHECK(token_.empty());
   HtmlName next_tag = element_->name();

   // Look for elements that are implicitly closed by an open for this type.
   HtmlName::Keyword next_keyword = next_tag.keyword();

   // Continue popping off auto-close elements as needed to handle cases like
   // IClosedByOpenTr in html_parse_test.cc: "<tr><i>a<tr>b".  First the <i>
   // needs to be auto-closed, then the <tr>.
   for (HtmlElement* open_element = Parent(); open_element != NULL; ) {
     // TODO(jmarantz): this is a hack -- we should make a more elegant
     // structure of open/new tag combinations that we should auto-close.
     HtmlName::Keyword open_keyword = open_element->keyword();
     if (HtmlKeywords::IsAutoClose(open_keyword, next_keyword)) {
       element_stack_.pop_back();
       CloseElement(open_element, HtmlElement::AUTO_CLOSE);

       // Having automatically closed the element that was open on the stack,
       // we must recompute the open element from whatever is now on top of
       // the stack.  We must also correct the current element's parent to
       // maintain DOM consistency with the event stream.
       DCHECK_EQ(element_->parent(), open_element);
       open_element = Parent();
       element_->set_parent(open_element);
     } else {
       break;
     }
   }

   // The raw bytes of the tag are no longer needed once the element is
   // emitted structurally.
   literal_.clear();
   html_parse_->AddElement(element_, tag_start_line_);
   // Once the size limit has been exceeded, the element just added is the
   // last one emitted before skip_parsing_ is turned on.
   if (size_limit_exceeded_) {
     skip_parsing_ = true;
   }
   element_stack_.push_back(element_);
   if (IsLiteralTag(element_->keyword())) {
     // Literal tags switch the lexer into a mode where the body is buffered
     // uninterpreted until the matching close tag; <script> has its own
     // state because of its comment-escaping rules.
     state_ =
         (element_->keyword() == HtmlName::kScript) ? SCRIPT_TAG : LITERAL_TAG;
     script_html_comment_ = false;
     script_html_comment_script_ = false;
     literal_close_ = StrCat("</", element_->name_str(), ">");
   } else {
     state_ = START;
   }

   if (allow_implicit_close && IsImplicitlyClosedTag(element_->keyword())) {
     // EmitTagClose looks the element up by the name in token_.
     element_->name_str().CopyToString(&token_);
     EmitTagClose(HtmlElement::IMPLICIT_CLOSE);
   }

   element_ = NULL;
 }

 void HtmlLexer::EmitTagBriefClose() {
   if (!discard_until_start_state_for_error_recovery_)  {
     HtmlElement* element = PopElement();
     CloseElement(element, HtmlElement::BRIEF_CLOSE);
   }
   state_ = START;
 }

 // Returns the element on top of the open-element stack, or NULL when the
 // stack is empty.  The returned entry may itself be NULL, since StartParse
 // seeds the stack with a NULL sentinel.
 HtmlElement* HtmlLexer::Parent() const {
   return element_stack_.empty() ? static_cast<HtmlElement*>(NULL)
                                 : element_stack_.back();
 }

 // Creates element_ from the tag name accumulated in token_, unless an
 // element is already under construction.  The new element is parented to
 // the current top of the element stack and records the line on which the
 // tag began; token_ is cleared afterwards.  Must not be called while
 // discarding for error recovery.
 void HtmlLexer::MakeElement() {
   DCHECK(!discard_until_start_state_for_error_recovery_);
   if (element_ != NULL) {
     return;  // Already created for this tag.
   }

   if (token_.empty()) {
     SyntaxError("Making element with empty tag name");
   }
   element_ = html_parse_->NewElement(Parent(), token_);
   element_->set_begin_line_number(tag_start_line_);
   token_.clear();
 }

 // Resets the lexer for a new document.  'id' is copied into id_ and
 // 'content_type' into content_type_.  All per-document state is returned to
 // its initial value, and the element stack is reseeded with its NULL
 // sentinel at index 0.
 void HtmlLexer::StartParse(const StringPiece& id,
                            const ContentType& content_type) {
   // Document identity.
   id.CopyToString(&id_);
   content_type_ = content_type;

   // Position and size accounting.
   line_ = 1;
   tag_start_line_ = -1;
   num_bytes_parsed_ = 0;
   size_limit_exceeded_ = false;
   skip_parsing_ = false;

   // State-machine flags.
   state_ = START;
   has_attr_value_ = false;
   attr_quote_ = HtmlElement::NO_QUOTE;
   script_html_comment_ = false;
   script_html_comment_script_ = false;
   discard_until_start_state_for_error_recovery_ = false;

   // Accumulation buffers.
   token_.clear();
   attr_name_.clear();
   attr_value_.clear();
   literal_.clear();

   // Open-element bookkeeping.
   element_ = NULL;
   element_stack_.clear();
   element_stack_.push_back(static_cast<HtmlElement*>(0));
 }

 // Finishes lexing at end of input.  Any partially accumulated token,
 // attribute name or attribute value is reported as a syntax error and
 // dropped.  Pending raw text is emitted, and every element still open on
 // the stack is closed, leaving only the NULL sentinel at index 0.
 void HtmlLexer::FinishParse() {
   if (!token_.empty()) {
     SyntaxError("End-of-file in mid-token: %s", token_.c_str());
     token_.clear();
   }
   if (!attr_name_.empty()) {
     SyntaxError("End-of-file in mid-attribute-name: %s", attr_name_.c_str());
     attr_name_.clear();
   }
   if (!attr_value_.empty()) {
     SyntaxError("End-of-file in mid-attribute-value: %s", attr_value_.c_str());
     attr_value_.clear();
   }

   if (!literal_.empty()) {
     EmitLiteral();
   }

   // Any unclosed tags?  These should be noted.
   html_parse_->message_handler()->Check(!element_stack_.empty(),
                                         "element_stack_.empty()");
   html_parse_->message_handler()->Check(element_stack_[0] == NULL,
                                         "element_stack_[0] != NULL");

   // Close open elements innermost-first.  Each EmitTagClose call is expected
   // to remove the element at the back of the stack (it looks the element up
   // by the name placed in token_), so back() yields a new element on every
   // iteration -- presumably via PopElementMatchingTag; confirm there.
   for (int i = element_stack_.size() - 1; i > 0; --i) {
     HtmlElement* element = element_stack_.back();
     element->name_str().CopyToString(&token_);
     // When parsing was cut short (skip_parsing_), the close is recorded as
     // EXPLICIT_CLOSE; otherwise the element is marked UNCLOSED.
     HtmlElement::Style style = skip_parsing_ ?
         HtmlElement::EXPLICIT_CLOSE : HtmlElement::UNCLOSED;
     EmitTagClose(style);
     // Only report tags whose close tag is not optional.
     if (!HtmlKeywords::IsOptionallyClosedTag(element->keyword())) {
       html_parse_->Info(id_.c_str(), element->begin_line_number(),
                         "End-of-file with open tag: %s",
                         CEscape(element->name_str()).c_str());
     }
   }
   DCHECK_EQ(1U, element_stack_.size());
   DCHECK_EQ(static_cast<HtmlElement*>(0), element_stack_[0]);
   element_ = NULL;
 }

 // Completes the attribute whose name is in attr_name_ (and whose value, if
 // has_value is true, is in attr_value_) and adds it to element_.  In
 // error-recovery discard mode the attribute is parsed but not attached.
 // Afterwards the attribute buffers and quote style are reset and the lexer
 // is placed in TAG_ATTRIBUTE, ready for the next attribute.
 //
 // has_value must agree with has_attr_value_; a value-less attribute is
 // added with a NULL value, which keeps <a n> distinct from <a n="">.
 void HtmlLexer::MakeAttribute(bool has_value) {
   if (!discard_until_start_state_for_error_recovery_) {
     html_parse_->message_handler()->Check(element_ != NULL, "element_ == NULL");
   }
   HtmlName name = html_parse_->MakeName(attr_name_);
   attr_name_.clear();
   const char* value = NULL;
   html_parse_->message_handler()->Check(has_value == has_attr_value_,
                                         "has_value != has_attr_value_");
   if (has_value) {
     // 'value' points into attr_value_, so attr_value_ must not be cleared
     // until after AddEscapedAttribute below.
     value = attr_value_.c_str();
     has_attr_value_ = false;
   } else {
     html_parse_->message_handler()->Check(attr_value_.empty(),
                                           "!attr_value_.empty()");
   }

   if (!discard_until_start_state_for_error_recovery_) {
     element_->AddEscapedAttribute(name, value, attr_quote_);
   }
   attr_value_.clear();
   attr_quote_ = HtmlElement::NO_QUOTE;
   state_ = TAG_ATTRIBUTE;
 }

 // Handles a character while positioned between attributes inside a tag.
 // Ensures the element exists (unless discarding for error recovery), resets
 // the attribute buffers, then dispatches on 'c'.  Whitespace is skipped.
 // HTML5 spec state name: before attribute name state
 void HtmlLexer::EvalAttribute(char c) {
   if (!discard_until_start_state_for_error_recovery_) {
     MakeElement();
   }
   attr_name_.clear();
   attr_value_.clear();

   if (c == '>') {
     EmitTagOpen(true);
     return;
   }
   if (c == '/') {
     state_ = TAG_BRIEF_CLOSE;
     return;
   }
   if (IsLegalAttrNameChar(c)) {
     attr_name_ += c;
     state_ = TAG_ATTR_NAME;
     return;
   }
   if (IsHtmlSpace(c)) {
     return;  // Stay put and wait for something more interesting.
   }

   SyntaxError("Unexpected char `%c' in attribute list", c);
   // Per HTML5, we still switch to the attribute name state here,
   // even for weird things like ", =, etc.
   attr_name_ += c;
   state_ = TAG_ATTR_NAME;
 }

 // "<x y".
 // Handles characters while an attribute name is being accumulated.
 // HTML5 spec state name: Attribute name
 void HtmlLexer::EvalAttrName(char c) {
   if (c == '=') {
     // A value is coming.
     state_ = TAG_ATTR_EQ;
     has_attr_value_ = true;
     return;
   }
   if (IsHtmlSpace(c)) {
     state_ = TAG_ATTR_NAME_SPACE;
     return;
   }
   if (c == '>') {
     // Value-less attribute ends the tag.
     MakeAttribute(false);
     EmitTagOpen(true);
     return;
   }
   if (c == '/') {
     state_ = TAG_BRIEF_CLOSE;
     return;
   }

   // This includes both legal characters, and anything else, even stuff
   // like <, etc.
   attr_name_ += c;
 }

 // "<x y ".
 // Handles characters after an attribute name that was followed by
 // whitespace; we do not yet know whether an '=' is coming.
 // HTML5 spec state name: After attribute name
 void HtmlLexer::EvalAttrNameSpace(char c) {
   if (c == '=') {
     state_ = TAG_ATTR_EQ;
     has_attr_value_ = true;
     return;
   }
   if (IsHtmlSpace(c)) {
     state_ = TAG_ATTR_NAME_SPACE;
     return;
   }
   if (c == '>') {
     MakeAttribute(false);
     EmitTagOpen(true);
     return;
   }
   if (c == '/') {
     state_ = TAG_BRIEF_CLOSE;
     return;
   }

   // "<x y z".  Now that we see the 'z', we need
   // to finish 'y' as an attribute, then queue up
   // 'z' (c) as the start of a new attribute.
   MakeAttribute(false);
   state_ = TAG_ATTR_NAME;
   attr_name_ += c;
 }

 // Completes the attribute in progress when 'c' (whitespace or '>')
 // terminates it.  On '>' the open tag is emitted as well, and when
 // brief_close is true the element is immediately closed in the "<foo/>"
 // style.  has_value says whether the attribute had an "=value" part.
 void HtmlLexer::FinishAttribute(char c, bool has_value, bool brief_close) {
   if (IsHtmlSpace(c)) {
     MakeAttribute(has_value);
     state_ = TAG_ATTRIBUTE;
     return;
   }

   if (c != '>') {
     // We are only supposed to be involved on space and >
     LOG(DFATAL) << "FinishAttribute called with a weird c:" << c;
     return;
   }

   // End of tag.  attr_name_ may be empty here (e.g. plain "<foo/>"), in
   // which case there is no attribute to add.
   if (!attr_name_.empty()) {
     MakeAttribute(has_value);
   }
   // A brief close supplies its own close event, so implicit closing is
   // only allowed when this is not one.
   EmitTagOpen(!brief_close);
   if (brief_close) {
     EmitTagBriefClose();
   }

   has_attr_value_ = false;
 }

 // Handles the character following an attribute's '=': selects the quoting
 // style of the value, or ends the attribute if the tag closes right away.
 // HTML5 state name: before attribute value
 void HtmlLexer::EvalAttrEq(char c) {
   switch (c) {
     case '"':
       attr_quote_ = HtmlElement::DOUBLE_QUOTE;
       state_ = TAG_ATTR_VALDQ;
       return;
     case '\'':
       attr_quote_ = HtmlElement::SINGLE_QUOTE;
       state_ = TAG_ATTR_VALSQ;
       return;
     default:
       break;
   }

   if (IsHtmlSpace(c)) {
     return;  // ignore -- spaces are allowed between "=" and the value
   }
   if (c == '>') {
     FinishAttribute(c, true, false);
     return;
   }

   // First character of an unquoted value.
   state_ = TAG_ATTR_VAL;
   attr_quote_ = HtmlElement::NO_QUOTE;
   EvalAttrVal(c);
 }

 // Handles characters of an unquoted attribute value.  Whitespace or '>'
 // terminates the value; everything else is appended to attr_value_.
 // HTML5 state name: Attribute value (unquoted) state
 void HtmlLexer::EvalAttrVal(char c) {
   const bool ends_value = IsHtmlSpace(c) || (c == '>');
   if (!ends_value) {
     attr_value_ += c;
     return;
   }
   FinishAttribute(c, true, false);
 }

 // HTML5 state name: Attribute value (double-quoted) state
 // Accumulates characters of a double-quoted attribute value; the
 // closing '"' completes the attribute.
 void HtmlLexer::EvalAttrValDq(char c) {
   if (c != '"') {
     attr_value_ += c;
     return;
   }
   MakeAttribute(true);
 }

 // HTML5 state name: Attribute value (single-quoted) state
 // Accumulates characters of a single-quoted attribute value; the
 // closing '\'' completes the attribute.
 void HtmlLexer::EvalAttrValSq(char c) {
   if (c != '\'') {
     attr_value_ += c;
     return;
   }
   MakeAttribute(true);
 }

 // Processes a fully-lexed close tag whose name is held in token_.
 // If a matching open element is found on the stack it is closed with
 // the given style; otherwise the raw text is passed through as a
 // literal.  In both cases the lexer returns to the START state.
 void HtmlLexer::EmitTagClose(HtmlElement::Style style) {
   HtmlElement* matched = PopElementMatchingTag(token_);
   if (matched == NULL) {
     SyntaxError("Unexpected close-tag `%s', no tags are open",
                 token_.c_str());

     // There is no open element for this close-tag.  One way this
     // happens is that an HTML structure constraint already forced the
     // element closed (as an AUTO_CLOSE, which is structural in the API
     // but invisible in the output), and only now do we reach the
     // literal close.  To stay byte-accurate to the input we must not
     // swallow it, so the "</tag>" text is emitted as a Characters
     // literal.
     EmitLiteral();
   } else {
     DCHECK(StringCaseEqual(token_, matched->name_str()));
     matched->set_end_line_number(line_);
     CloseElement(matched, style);
   }

   // Reset the per-token buffers and resume normal scanning.
   token_.clear();
   literal_.clear();
   state_ = START;
 }

 void HtmlLexer::EmitDirective() {
   literal_.clear();
   html_parse_->AddEvent(new HtmlDirectiveEvent(
       html_parse_->NewDirectiveNode(Parent(), token_), line_));
   // Update the doctype; note that if this is not a doctype directive, Parse()
   // will return false and not alter doctype_.
   doctype_.Parse(token_, content_type_);
   token_.clear();
   state_ = START;
 }

 void HtmlLexer::Parse(const char* text, int size) {
   num_bytes_parsed_ += size;
   if (size_limit_ > 0 && num_bytes_parsed_ > size_limit_) {
     size_limit_exceeded_ = true;
   }
   // TODO(nikhilmadan): Protect against an unbounded sequence of bytes within an
   // element, probably by just aborting the parse completely.

   for (int i = 0; i < size; ++i) {
     if (skip_parsing_) {
       // Return without doing anything if skip_parsing_ is true.
       return;
     }
     char c = text[i];
     if (c == '\n') {
       ++line_;
     }

     // By default we keep track of every byte as it comes in.
     // If we can't accurately parse it, we transmit it as
     // raw characters to be re-serialized without interpretation,
     // and good luck to the browser.  When we do successfully
     // parse something, we remove it from the literal.
     literal_ += c;

     switch (state_) {
       case START:                 EvalStart(c);               break;
       case TAG:                   EvalTag(c);                 break;
       case TAG_OPEN:              EvalTagOpen(c);             break;
       case TAG_CLOSE_NO_NAME:     EvalTagCloseNoName(c);      break;
       case TAG_CLOSE:             EvalTagClose(c);            break;
       case TAG_CLOSE_TERMINATE:   EvalTagClose(c);            break;
       case TAG_BRIEF_CLOSE:       EvalTagBriefClose(c);       break;
       case COMMENT_START1:        EvalCommentStart1(c);       break;
       case COMMENT_START2:        EvalCommentStart2(c);       break;
       case COMMENT_BODY:          EvalCommentBody(c);         break;
       case COMMENT_END1:          EvalCommentEnd1(c);         break;
       case COMMENT_END2:          EvalCommentEnd2(c);         break;
       case CDATA_START1:          EvalCdataStart1(c);         break;
       case CDATA_START2:          EvalCdataStart2(c);         break;
       case CDATA_START3:          EvalCdataStart3(c);         break;
       case CDATA_START4:          EvalCdataStart4(c);         break;
       case CDATA_START5:          EvalCdataStart5(c);         break;
       case CDATA_START6:          EvalCdataStart6(c);         break;
       case CDATA_BODY:            EvalCdataBody(c);           break;
       case CDATA_END1:            EvalCdataEnd1(c);           break;
       case CDATA_END2:            EvalCdataEnd2(c);           break;
       case TAG_ATTRIBUTE:         EvalAttribute(c);           break;
       case TAG_ATTR_NAME:         EvalAttrName(c);            break;
       case TAG_ATTR_NAME_SPACE:   EvalAttrNameSpace(c);       break;
       case TAG_ATTR_EQ:           EvalAttrEq(c);              break;
       case TAG_ATTR_VAL:          EvalAttrVal(c);             break;
       case TAG_ATTR_VALDQ:        EvalAttrValDq(c);           break;
       case TAG_ATTR_VALSQ:        EvalAttrValSq(c);           break;
       case LITERAL_TAG:           EvalLiteralTag(c);          break;
       case SCRIPT_TAG:            EvalScriptTag(c);           break;
       case DIRECTIVE:             EvalDirective(c);           break;
       case BOGUS_COMMENT:         EvalBogusComment(c);        break;
     }
   }
 }

 // The HTML-input sloppiness in these three methods is applied
 // independently of whether we think the document is XHTML, whether that
 // belief comes from the doctype or from the mime-type.  The internet is
 // full of lies.  See Issue 252:
 //   http://code.google.com/p/modpagespeed/issues/detail?id=252

 // Returns true if keyword is listed in kImplicitlyClosedHtmlTags, the
 // set of tags that may appear without a brief "/>" or an explicit
 // close tag.
 bool HtmlLexer::IsImplicitlyClosedTag(HtmlName::Keyword keyword) const {
   const bool implicitly_closed =
       IS_IN_SET(kImplicitlyClosedHtmlTags, keyword);
   return implicitly_closed;
 }

 // Returns true if keyword is a member of kLiteralTags.
 bool HtmlLexer::IsLiteralTag(HtmlName::Keyword keyword) {
   const bool is_literal = IS_IN_SET(kLiteralTags, keyword);
   return is_literal;
 }

 // Returns true if keyword is a member of kSometimesLiteralTags.
 bool HtmlLexer::IsSometimesLiteralTag(HtmlName::Keyword keyword) {
   const bool is_sometimes_literal =
       IS_IN_SET(kSometimesLiteralTags, keyword);
   return is_sometimes_literal;
 }

 // Returns true if keyword is neither listed in kNonBriefTerminatedTags
 // nor an implicitly closed tag.
 bool HtmlLexer::TagAllowsBriefTermination(HtmlName::Keyword keyword) const {
   if (IS_IN_SET(kNonBriefTerminatedTags, keyword)) {
     return false;
   }
   return !IsImplicitlyClosedTag(keyword);
 }

 // Forwards to HtmlKeywords::IsOptionallyClosedTag for the given keyword.
 bool HtmlLexer::IsOptionallyClosedTag(HtmlName::Keyword keyword) const {
   const bool optionally_closed =
       HtmlKeywords::IsOptionallyClosedTag(keyword);
   return optionally_closed;
 }

 // Debugging aid: writes the ToString() form of each element on the
 // element stack, starting at index kStartStack, to stdout (one per
 // line) and then flushes stdout.
 void HtmlLexer::DebugPrintStack() {
   const size_t depth = element_stack_.size();
   size_t index = kStartStack;
   while (index < depth) {
     puts(element_stack_[index]->ToString().c_str());
     ++index;
   }
   fflush(stdout);
 }

 // Removes and returns the last entry of the element stack, or returns
 // NULL (leaving the stack untouched) if the stack is empty.
 HtmlElement* HtmlLexer::PopElement() {
   if (element_stack_.empty()) {
     return NULL;
   }
   HtmlElement* top = element_stack_.back();
   element_stack_.pop_back();
   return top;
 }

 // Asks html_parse_ to close the element with the given style at the
 // current line.  Once the size limit has been exceeded, this also
 // turns on skip_parsing_.
 void HtmlLexer::CloseElement(HtmlElement* element,
                              HtmlElement::Style style) {
   html_parse_->CloseElement(element, style, line_);
   if (!size_limit_exceeded_) {
     return;
   }
   skip_parsing_ = true;
 }

 // Finds the innermost open element whose name matches tag
 // (case-insensitively), closes every element stacked above it as
 // UNCLOSED, removes the match from the stack and returns it.  The
 // returned element itself is not closed here; that is left to the
 // caller.  Returns NULL, leaving the stack untouched, if no match is
 // found or if the search hits an element that contains this tag's
 // keyword per HtmlKeywords::IsContained.
 HtmlElement* HtmlLexer::PopElementMatchingTag(const StringPiece& tag) {
   const HtmlName::Keyword keyword = HtmlName::Lookup(tag);
   const int stack_size = static_cast<int>(element_stack_.size());
   int close_index = stack_size;  // stack_size means "no match found".

   // Walk the stack from the top (most recently opened) downwards.
   for (int i = stack_size - 1; i >= kStartStack; --i) {
     HtmlElement* candidate = element_stack_[i];

     // Tag matching is done with a case-insensitive string comparison
     // even though a keywords enum exists; note that the symbol table is
     // case sensitive.
     if (StringCaseEqual(candidate->name_str(), tag)) {
       close_index = i;
       break;
     }

     // Give up on reaching an 'owner' of this element.  Consider
     // <tr><table></tr></table>: on the </tr> we look for a <tr> to
     // close, but must stop at the IsContained match (tr,table).  At
     // this point there is no matching open-tag for the </tr> inside the
     // <table>.  See HtmlAnnotationTest.StrayCloseTrInTable in
     // html_parse_test.cc.
     if (HtmlKeywords::IsContained(keyword, candidate->keyword())) {
       return NULL;
     }
   }

   if (close_index == stack_size) {
     return NULL;
   }

   HtmlElement* match = element_stack_[close_index];

   // Close everything above the match, topmost first, so that stack
   // discipline is maintained.
   for (int j = stack_size - 1; j > close_index; --j) {
     HtmlElement* skipped = element_stack_[j];
     // In fact, should we actually perform this optimization ourselves
     // in a filter to omit closing tags that can be inferred?
     if (!HtmlKeywords::IsOptionallyClosedTag(skipped->keyword())) {
       html_parse_->Info(id_.c_str(), skipped->begin_line_number(),
                         "Unclosed element `%s'",
                         CEscape(skipped->name_str()).c_str());
     }
     // The skipped element has to leave the stack before it is closed;
     // otherwise the parent redundancy check in HtmlParse::AddEvent
     // will fail.
     element_stack_.resize(j);
     CloseElement(skipped, HtmlElement::UNCLOSED);
   }

   // Finally drop the match itself from the stack.
   element_stack_.resize(close_index);
   return match;
 }

 // Reports a printf-style formatted message through html_parse_->InfoV,
 // tagged with id_ and the current line number.
 void HtmlLexer::SyntaxError(const char* msg, ...) {
   va_list varargs;
   va_start(varargs, msg);
   const char* source_id = id_.c_str();
   html_parse_->InfoV(source_id, line_, msg, varargs);
   va_end(varargs);
 }

 }  // namespace net_instaweb