// src/pagespeed/kernel/html/html_lexer.h - incubator-pagespeed-debian - Git at Google

 /*
  * Copyright 2010 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Author: jmarantz@google.com (Joshua Marantz)

 #ifndef PAGESPEED_KERNEL_HTML_HTML_LEXER_H_
 #define PAGESPEED_KERNEL_HTML_HTML_LEXER_H_

 #include <vector>

 #include "pagespeed/kernel/base/basictypes.h"
 #include "pagespeed/kernel/base/printf_format.h"
 #include "pagespeed/kernel/base/string.h"
 #include "pagespeed/kernel/base/string_util.h"
 #include "pagespeed/kernel/html/doctype.h"
 #include "pagespeed/kernel/html/html_element.h"
 #include "pagespeed/kernel/html/html_name.h"
 #include "pagespeed/kernel/http/content_type.h"

 namespace net_instaweb {

 class HtmlParse;

 // Constructs a re-entrant HTML lexer.  This lexer minimally parses tags,
 // attributes, and comments.  It is intended to parse the Wild West of the
 // Web.  It's designed to be tolerant of syntactic transgressions, merely
 // passing through unparseable chunks as Characters.
 //
 // Going by the method comments below, a parse session is driven as
 // StartParse(), then Parse() once per chunk of input, then FinishParse().
 //
 // TODO(jmarantz): refactor this with html_parse, so that this class owns
 // the symbol table and the event queue, and no longer needs to mutually
 // depend on HtmlParse.  That will make it easier to unit-test.
 class HtmlLexer {
  public:
   // html_parse is the parser that receives the events produced by Parse().
   // NOTE(review): ownership of html_parse is not visible in this header;
   // presumably the lexer only borrows the pointer -- confirm.
   explicit HtmlLexer(HtmlParse* html_parse);
   ~HtmlLexer();

   // Initialize a new parse session, id is only used for error messages.
   void StartParse(const StringPiece& id, const ContentType& content_type);

   // Parse a chunk of text, adding events to the parser by calling
   // html_parse_->AddEvent(...).  size is presumably the number of bytes in
   // text (no NUL terminator implied) -- confirm against the implementation.
   void Parse(const char* text, int size);

   // Completes parse, reporting any leftover text as a final HtmlCharacterEvent.
   void FinishParse();

   // Determines whether a tag should be terminated in HTML.
   bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const;

   // Determines whether a tag should be interpreted as a 'literal'
   // tag. That is, a tag whose contents are not parsed until a
   // corresponding matching end tag is encountered.
   static bool IsLiteralTag(HtmlName::Keyword keyword);

   // Determines whether a tag is interpreted as a 'literal' tag in
   // some user agents. Since some user agents will interpret the
   // contents of these tags, our lexer never treats them as literal
   // tags.
   static bool IsSometimesLiteralTag(HtmlName::Keyword keyword);

   // Determines whether a tag can be terminated briefly (e.g. <tag/>)
   bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const;

   // Determines whether it's OK to leave a tag unclosed.
   bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const;

   // Print element stack to stdout (for debugging).
   void DebugPrintStack();

   // Returns the current lowest-level parent element in the element stack, or
   // NULL if the stack is empty.
   HtmlElement* Parent() const;

   // Return the current assumed doctype of the document (based on the content
   // type and any HTML directives encountered so far).
   const DocType& doctype() const { return doctype_; }

   // Sets the limit on the maximum number of bytes that should be parsed.
   // NOTE(review): how a zero or negative limit is interpreted is not visible
   // in this header -- check the implementation before relying on it.
   void set_size_limit(int64 x) { size_limit_ = x; }

   // Indicates whether we have exceeded the limit on the maximum number of bytes
   // that we should parse.
   bool size_limit_exceeded() const { return size_limit_exceeded_; }

  private:
   // Per-state character handlers; their names mirror the State enumerators
   // declared further down.  They are declared inline here but their bodies
   // are not part of this header.
   // NOTE(review): State has a TAG_CLOSE_TERMINATE value, yet no matching
   // EvalTagCloseTerminate is declared -- check how that state is dispatched.
   //
   // Most of these routines expect c to be the last character of literal_
   inline void EvalStart(char c);
   inline void EvalTag(char c);
   inline void EvalTagOpen(char c);
   inline void EvalTagCloseNoName(char c);
   inline void EvalTagClose(char c);
   inline void EvalTagBriefClose(char c);
   inline void EvalCommentStart1(char c);
   inline void EvalCommentStart2(char c);
   inline void EvalCommentBody(char c);
   inline void EvalCommentEnd1(char c);
   inline void EvalCommentEnd2(char c);
   inline void EvalCdataStart1(char c);
   inline void EvalCdataStart2(char c);
   inline void EvalCdataStart3(char c);
   inline void EvalCdataStart4(char c);
   inline void EvalCdataStart5(char c);
   inline void EvalCdataStart6(char c);
   inline void EvalCdataBody(char c);
   inline void EvalCdataEnd1(char c);
   inline void EvalCdataEnd2(char c);
   inline void EvalAttribute(char c);
   inline void EvalAttrName(char c);
   inline void EvalAttrNameSpace(char c);
   inline void EvalAttrEq(char c);
   inline void EvalAttrVal(char c);
   inline void EvalAttrValSq(char c);
   inline void EvalAttrValDq(char c);
   inline void EvalLiteralTag(char c);
   inline void EvalScriptTag(char c);
   inline void EvalDirective(char c);
   inline void EvalBogusComment(char c);

   // Makes an element based on token_, which will be parsed as the tag
   // name.
   void MakeElement();

   // NOTE(review): presumably MakeAttribute() turns attr_name_/attr_value_
   // into an attribute on element_, with has_value distinguishing <a n=> from
   // <a n> (compare has_attr_value_), and FinishAttribute() additionally
   // reacts to the terminating character c -- confirm in the implementation.
   void MakeAttribute(bool has_value);
   void FinishAttribute(char c, bool has_value, bool brief_close);

   // NOTE(review): the Emit* helpers presumably hand the accumulated
   // token_/literal_/element_ state to html_parse_ as events; the exact
   // contract of each, and of Restart(), is not visible here -- confirm.
   void EmitCdata();
   void EmitComment();
   void EmitLiteral();
   void EmitTagOpen(bool allow_implicit_close);  // expects element_ != NULL.
   void EmitTagClose(HtmlElement::Style style);
   void EmitTagBriefClose();
   void EmitDirective();
   void Restart(char c);

   // Emits a syntax error message.  format is a printf-style format string;
   // the INSTAWEB_PRINTF_FORMAT annotation marks argument 2 as the format and
   // argument 3 as the first variadic argument (argument 1 being 'this').
   void SyntaxError(const char* format, ...) INSTAWEB_PRINTF_FORMAT(2, 3);

   // Tries to find a HTML element on the stack matching a tag.  If it
   // finds it, it pops all the intervening elements off the stack,
   // issuing warnings for each discarded tag, the matching element is
   // also popped off the stack, and returned.
   //
   // If the tag is not matched, then no mutations are done to the stack,
   // and NULL is returned.
   //
   // The tag name should be interned.
   // TODO(jmarantz): use type system
   HtmlElement* PopElementMatchingTag(const StringPiece& tag);

   // NOTE(review): presumably PopElement() removes and returns the top of
   // element_stack_, and CloseElement() records the closing style on the
   // element -- neither contract is documented here; confirm.
   HtmlElement* PopElement();
   void CloseElement(HtmlElement* element, HtmlElement::Style style);

   // Minimal i18n analysis.  With utf-8 and gb2312 we can do this
   // context-free, and thus the method can be static.  If we add
   // more encodings we may need to turn this into a non-static method.
   //
   // Returns true when the high bit (0x80) of c is set.
   static inline bool IsI18nChar(char c) {return (((c) & 0x80) != 0); }

   // Determines whether a character can be used in a tag name as first char ...
   static inline bool IsLegalTagFirstChar(char c);
   // ... or subsequent char.
   static inline bool IsLegalTagChar(char c);

   // Determines whether a character can be used in an attribute name.
   static inline bool IsLegalAttrNameChar(char c);

   // The lexer is implemented as a pure state machine.  There is
   // no lookahead.  The state is understood primarily in this
   // enum, although there are a few state flavors that are managed
   // by the other member variables, notably: has_attr_value_ and
   // attr_name_.empty().  Those could be eliminated by adding
   // a few more explicit states.
   //
   // The trailing comment on each enumerator shows the input consumed so far
   // when the lexer is in that state.
   enum State {
     START,
     TAG,                   // "<"
     TAG_CLOSE_NO_NAME,     // "</"
     TAG_CLOSE,             // "</x"
     TAG_CLOSE_TERMINATE,   // "</x "
     TAG_OPEN,              // "<x"
     TAG_BRIEF_CLOSE,       // "<x/" or "<x /" or "<x y/" etc
     COMMENT_START1,        // "<!"
     COMMENT_START2,        // "<!-"
     COMMENT_BODY,          // "<!--"
     COMMENT_END1,          // "-"
     COMMENT_END2,          // "--"
     CDATA_START1,          // "<!["
     CDATA_START2,          // "<![C"
     CDATA_START3,          // "<![CD"
     CDATA_START4,          // "<![CDA"
     CDATA_START5,          // "<![CDAT"
     CDATA_START6,          // "<![CDATA"
     CDATA_BODY,            // "<![CDATA["
     CDATA_END1,            // "]"
     CDATA_END2,            // "]]"
     TAG_ATTRIBUTE,         // "<x "
     TAG_ATTR_NAME,         // "<x y"
     TAG_ATTR_NAME_SPACE,   // "<x y "
     TAG_ATTR_EQ,           // "<x y="
     TAG_ATTR_VAL,          // "<x y=x" value terminated by whitespace or >
     TAG_ATTR_VALDQ,        // '<x y="' value terminated by double-quote
     TAG_ATTR_VALSQ,        // "<x y='" value terminated by single-quote
     LITERAL_TAG,           // "<style " or "<iframe ", etc.
     SCRIPT_TAG,            // "<script "
     DIRECTIVE,             // "<!x"
     BOGUS_COMMENT,         // "<?foo>" or "</?foo>"
   };

   HtmlParse* html_parse_;    // parser given to the constructor; receives events
   State state_;              // current state of the state machine
   GoogleString token_;       // accumulates tag names and comments
   GoogleString literal_;     // accumulates raw text to pass through
   GoogleString attr_name_;   // accumulates attribute name
   GoogleString attr_value_;  // accumulates attribute value
   HtmlElement::QuoteStyle attr_quote_;  // quote used to delimit attribute
   bool has_attr_value_;     // distinguishes <a n=> from <a n>
   HtmlElement* element_;    // current element; used to collect attributes
   int line_;                // presumably the current input line -- confirm
   int tag_start_line_;      // line at which we last transitioned to TAG state
   GoogleString id_;         // presumably the id given to StartParse() -- confirm
   GoogleString literal_close_;  // specific tag to close, e.g. </script>
   bool script_html_comment_;   // inside <script> <!--
   bool script_html_comment_script_;  // inside <script> <!-- <script>
   // in some cases we have to drop what looks like attributes on a closing
   // tag as part of error recovery.
   bool discard_until_start_state_for_error_recovery_;

   ContentType content_type_;
   DocType doctype_;          // returned by doctype()

   // Stack of elements referred to by Parent(), PopElement() and
   // PopElementMatchingTag().
   std::vector<HtmlElement*> element_stack_;

   // Indicates that we have exceeded the enforced size limit on the maximum
   // number of input HTML bytes that we can parse.
   bool size_limit_exceeded_;
   // Whether we should skip parsing of all subsequent bytes. HtmlParse calls
   // this once it has started or ended an HtmlElement.
   // NOTE(review): "calls this" reads oddly for a data member; the wording
   // may be left over from an earlier accessor -- confirm intent.
   bool skip_parsing_;
   int64 num_bytes_parsed_;  // presumably bytes consumed so far -- confirm
   int64 size_limit_;        // limit installed via set_size_limit()

   DISALLOW_COPY_AND_ASSIGN(HtmlLexer);
 };

 }  // namespace net_instaweb

 #endif  // PAGESPEED_KERNEL_HTML_HTML_LEXER_H_
	/*
	* Copyright 2010 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Author: jmarantz@google.com (Joshua Marantz)

	#ifndef PAGESPEED_KERNEL_HTML_HTML_LEXER_H_
	#define PAGESPEED_KERNEL_HTML_HTML_LEXER_H_

	#include <vector>

	#include "pagespeed/kernel/base/basictypes.h"
	#include "pagespeed/kernel/base/printf_format.h"
	#include "pagespeed/kernel/base/string.h"
	#include "pagespeed/kernel/base/string_util.h"
	#include "pagespeed/kernel/html/doctype.h"
	#include "pagespeed/kernel/html/html_element.h"
	#include "pagespeed/kernel/html/html_name.h"
	#include "pagespeed/kernel/http/content_type.h"

	namespace net_instaweb {

	class HtmlParse;

	// Constructs a re-entrant HTML lexer. This lexer minimally parses tags,
	// attributes, and comments. It is intended to parse the Wild West of the
	// Web. It's designed to be tolerant of syntactic transgressions, merely
	// passing through unparseable chunks as Characters.
	//
	// Going by the method comments below, a parse session is driven as
	// StartParse(), then Parse() once per chunk of input, then FinishParse().
	//
	// NOTE(review): this is a second, whitespace-flattened copy of the
	// HtmlLexer declaration that already appears earlier in this file. It sits
	// under the same PAGESPEED_KERNEL_HTML_HTML_LEXER_H_ include guard, so the
	// preprocessor skips it once the first copy has been seen.
	//
	// TODO(jmarantz): refactor this with html_parse, so that this class owns
	// the symbol table and the event queue, and no longer needs to mutually
	// depend on HtmlParse. That will make it easier to unit-test.
	class HtmlLexer {
	public:
	// html_parse is the parser that receives the events produced by Parse().
	// NOTE(review): ownership of html_parse is not visible in this header;
	// presumably the lexer only borrows the pointer -- confirm.
	explicit HtmlLexer(HtmlParse* html_parse);
	~HtmlLexer();

	// Initialize a new parse session, id is only used for error messages.
	void StartParse(const StringPiece& id, const ContentType& content_type);

	// Parse a chunk of text, adding events to the parser by calling
	// html_parse_->AddEvent(...). size is presumably the number of bytes in
	// text (no NUL terminator implied) -- confirm against the implementation.
	void Parse(const char* text, int size);

	// Completes parse, reporting any leftover text as a final HtmlCharacterEvent.
	void FinishParse();

	// Determines whether a tag should be terminated in HTML.
	bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const;

	// Determines whether a tag should be interpreted as a 'literal'
	// tag. That is, a tag whose contents are not parsed until a
	// corresponding matching end tag is encountered.
	static bool IsLiteralTag(HtmlName::Keyword keyword);

	// Determines whether a tag is interpreted as a 'literal' tag in
	// some user agents. Since some user agents will interpret the
	// contents of these tags, our lexer never treats them as literal
	// tags.
	static bool IsSometimesLiteralTag(HtmlName::Keyword keyword);

	// Determines whether a tag can be terminated briefly (e.g. <tag/>)
	bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const;

	// Determines whether it's OK to leave a tag unclosed.
	bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const;

	// Print element stack to stdout (for debugging).
	void DebugPrintStack();

	// Returns the current lowest-level parent element in the element stack, or
	// NULL if the stack is empty.
	HtmlElement* Parent() const;

	// Return the current assumed doctype of the document (based on the content
	// type and any HTML directives encountered so far).
	const DocType& doctype() const { return doctype_; }

	// Sets the limit on the maximum number of bytes that should be parsed.
	// NOTE(review): how a zero or negative limit is interpreted is not visible
	// in this header -- check the implementation before relying on it.
	void set_size_limit(int64 x) { size_limit_ = x; }

	// Indicates whether we have exceeded the limit on the maximum number of bytes
	// that we should parse.
	bool size_limit_exceeded() const { return size_limit_exceeded_; }

	private:
	// Per-state character handlers; their names mirror the State enumerators
	// declared further down. They are declared inline here but their bodies
	// are not part of this header.
	// NOTE(review): State has a TAG_CLOSE_TERMINATE value, yet no matching
	// EvalTagCloseTerminate is declared -- check how that state is dispatched.
	//
	// Most of these routines expect c to be the last character of literal_
	inline void EvalStart(char c);
	inline void EvalTag(char c);
	inline void EvalTagOpen(char c);
	inline void EvalTagCloseNoName(char c);
	inline void EvalTagClose(char c);
	inline void EvalTagBriefClose(char c);
	inline void EvalCommentStart1(char c);
	inline void EvalCommentStart2(char c);
	inline void EvalCommentBody(char c);
	inline void EvalCommentEnd1(char c);
	inline void EvalCommentEnd2(char c);
	inline void EvalCdataStart1(char c);
	inline void EvalCdataStart2(char c);
	inline void EvalCdataStart3(char c);
	inline void EvalCdataStart4(char c);
	inline void EvalCdataStart5(char c);
	inline void EvalCdataStart6(char c);
	inline void EvalCdataBody(char c);
	inline void EvalCdataEnd1(char c);
	inline void EvalCdataEnd2(char c);
	inline void EvalAttribute(char c);
	inline void EvalAttrName(char c);
	inline void EvalAttrNameSpace(char c);
	inline void EvalAttrEq(char c);
	inline void EvalAttrVal(char c);
	inline void EvalAttrValSq(char c);
	inline void EvalAttrValDq(char c);
	inline void EvalLiteralTag(char c);
	inline void EvalScriptTag(char c);
	inline void EvalDirective(char c);
	inline void EvalBogusComment(char c);

	// Makes an element based on token_, which will be parsed as the tag
	// name.
	void MakeElement();

	// NOTE(review): presumably MakeAttribute() turns attr_name_/attr_value_
	// into an attribute on element_, with has_value distinguishing <a n=> from
	// <a n> (compare has_attr_value_), and FinishAttribute() additionally
	// reacts to the terminating character c -- confirm in the implementation.
	void MakeAttribute(bool has_value);
	void FinishAttribute(char c, bool has_value, bool brief_close);

	// NOTE(review): the Emit* helpers presumably hand the accumulated
	// token_/literal_/element_ state to html_parse_ as events; the exact
	// contract of each, and of Restart(), is not visible here -- confirm.
	void EmitCdata();
	void EmitComment();
	void EmitLiteral();
	void EmitTagOpen(bool allow_implicit_close); // expects element_ != NULL.
	void EmitTagClose(HtmlElement::Style style);
	void EmitTagBriefClose();
	void EmitDirective();
	void Restart(char c);

	// Emits a syntax error message. format is a printf-style format string;
	// the INSTAWEB_PRINTF_FORMAT annotation marks argument 2 as the format and
	// argument 3 as the first variadic argument (argument 1 being 'this').
	void SyntaxError(const char* format, ...) INSTAWEB_PRINTF_FORMAT(2, 3);

	// Tries to find a HTML element on the stack matching a tag. If it
	// finds it, it pops all the intervening elements off the stack,
	// issuing warnings for each discarded tag, the matching element is
	// also popped off the stack, and returned.
	//
	// If the tag is not matched, then no mutations are done to the stack,
	// and NULL is returned.
	//
	// The tag name should be interned.
	// TODO(jmarantz): use type system
	HtmlElement* PopElementMatchingTag(const StringPiece& tag);

	// NOTE(review): presumably PopElement() removes and returns the top of
	// element_stack_, and CloseElement() records the closing style on the
	// element -- neither contract is documented here; confirm.
	HtmlElement* PopElement();
	void CloseElement(HtmlElement* element, HtmlElement::Style style);

	// Minimal i18n analysis. With utf-8 and gb2312 we can do this
	// context-free, and thus the method can be static. If we add
	// more encodings we may need to turn this into a non-static method.
	//
	// Returns true when the high bit (0x80) of c is set.
	static inline bool IsI18nChar(char c) {return (((c) & 0x80) != 0); }

	// Determines whether a character can be used in a tag name as first char ...
	static inline bool IsLegalTagFirstChar(char c);
	// ... or subsequent char.
	static inline bool IsLegalTagChar(char c);

	// Determines whether a character can be used in an attribute name.
	static inline bool IsLegalAttrNameChar(char c);

	// The lexer is implemented as a pure state machine. There is
	// no lookahead. The state is understood primarily in this
	// enum, although there are a few state flavors that are managed
	// by the other member variables, notably: has_attr_value_ and
	// attr_name_.empty(). Those could be eliminated by adding
	// a few more explicit states.
	//
	// The trailing comment on each enumerator shows the input consumed so far
	// when the lexer is in that state.
	enum State {
	START,
	TAG, // "<"
	TAG_CLOSE_NO_NAME, // "</"
	TAG_CLOSE, // "</x"
	TAG_CLOSE_TERMINATE, // "</x "
	TAG_OPEN, // "<x"
	TAG_BRIEF_CLOSE, // "<x/" or "<x /" or "<x y/" etc
	COMMENT_START1, // "<!"
	COMMENT_START2, // "<!-"
	COMMENT_BODY, // "<!--"
	COMMENT_END1, // "-"
	COMMENT_END2, // "--"
	CDATA_START1, // "<!["
	CDATA_START2, // "<![C"
	CDATA_START3, // "<![CD"
	CDATA_START4, // "<![CDA"
	CDATA_START5, // "<![CDAT"
	CDATA_START6, // "<![CDATA"
	CDATA_BODY, // "<![CDATA["
	CDATA_END1, // "]"
	CDATA_END2, // "]]"
	TAG_ATTRIBUTE, // "<x "
	TAG_ATTR_NAME, // "<x y"
	TAG_ATTR_NAME_SPACE, // "<x y "
	TAG_ATTR_EQ, // "<x y="
	TAG_ATTR_VAL, // "<x y=x" value terminated by whitespace or >
	TAG_ATTR_VALDQ, // '<x y="' value terminated by double-quote
	TAG_ATTR_VALSQ, // "<x y='" value terminated by single-quote
	LITERAL_TAG, // "<style " or "<iframe ", etc.
	SCRIPT_TAG, // "<script "
	DIRECTIVE, // "<!x"
	BOGUS_COMMENT, // "<?foo>" or "</?foo>"
	};

	HtmlParse* html_parse_; // parser given to the constructor; receives events
	State state_; // current state of the state machine
	GoogleString token_; // accumulates tag names and comments
	GoogleString literal_; // accumulates raw text to pass through
	GoogleString attr_name_; // accumulates attribute name
	GoogleString attr_value_; // accumulates attribute value
	HtmlElement::QuoteStyle attr_quote_; // quote used to delimit attribute
	bool has_attr_value_; // distinguishes <a n=> from <a n>
	HtmlElement* element_; // current element; used to collect attributes
	int line_; // presumably the current input line -- confirm
	int tag_start_line_; // line at which we last transitioned to TAG state
	GoogleString id_; // presumably the id given to StartParse() -- confirm
	GoogleString literal_close_; // specific tag to close, e.g. </script>
	bool script_html_comment_; // inside <script> <!--
	bool script_html_comment_script_; // inside <script> <!-- <script>
	// in some cases we have to drop what looks like attributes on a closing
	// tag as part of error recovery.
	bool discard_until_start_state_for_error_recovery_;

	ContentType content_type_;
	DocType doctype_; // returned by doctype()

	// Stack of elements referred to by Parent(), PopElement() and
	// PopElementMatchingTag().
	std::vector<HtmlElement*> element_stack_;

	// Indicates that we have exceeded the enforced size limit on the maximum
	// number of input HTML bytes that we can parse.
	bool size_limit_exceeded_;
	// Whether we should skip parsing of all subsequent bytes. HtmlParse calls
	// this once it has started or ended an HtmlElement.
	// NOTE(review): "calls this" reads oddly for a data member; the wording
	// may be left over from an earlier accessor -- confirm intent.
	bool skip_parsing_;
	int64 num_bytes_parsed_; // presumably bytes consumed so far -- confirm
	int64 size_limit_; // limit installed via set_size_limit()

	DISALLOW_COPY_AND_ASSIGN(HtmlLexer);
	};

	} // namespace net_instaweb

	#endif // PAGESPEED_KERNEL_HTML_HTML_LEXER_H_