// src/net/instaweb/htmlparse/html_lexer.h - incubator-pagespeed-mod - Git at Google

 /**
  * Copyright 2010 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Author: jmarantz@google.com (Joshua Marantz)

 #ifndef NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_
 #define NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_

 #include <stdarg.h>
 #include <set>
 #include <vector>
 #include "base/basictypes.h"
 #include "net/instaweb/htmlparse/public/html_element.h"
 #include "net/instaweb/util/public/printf_format.h"
 #include <string>

 namespace net_instaweb {

 // Constructs a re-entrant HTML lexer.  This lexer minimally parses tags,
 // attributes, and comments.  It is intended to parse the Wild West of the
 // Web.  It's designed to be tolerant of syntactic transgressions, merely
 // passing through unparseable chunks as Characters.
 //
 // TODO(jmarantz): refactor this with html_parse, so that this class owns
 // the symbol table and the event queue, and no longer needs to mutually
 // depend on HtmlParse.  That will make it easier to unit-test.
 class HtmlLexer {
  public:
   // Creates a lexer bound to html_parse, the parser that receives the events
   // produced by Parse().
   // NOTE(review): html_parse is presumably borrowed rather than owned --
   // confirm against the constructor/destructor definitions.
   explicit HtmlLexer(HtmlParse* html_parse);
   ~HtmlLexer();

   // Initializes a new parse session.  id is only used for error messages.
   void StartParse(const StringPiece& id);

   // Parses a chunk of text, adding events to the parser by calling
   // html_parse_->AddEvent(...).
   // NOTE(review): size is presumably the number of chars readable at text --
   // confirm against the definition.
   void Parse(const char* text, int size);

   // Completes parse, reporting any leftover text as a final
   // HtmlCharacterEvent.
   void FinishParse();

   // Determines whether a tag should be terminated in HTML.
   // NOTE(review): presumably a membership test on implicitly_closed_ --
   // confirm against the definition.
   bool IsImplicitlyClosedTag(Atom tag) const;

   // Determines whether a tag can be terminated briefly (e.g. <tag/>).
   // NOTE(review): presumably driven by non_brief_terminated_tags_ -- confirm
   // against the definition.
   bool TagAllowsBriefTermination(Atom tag) const;

   // Prints the element stack to stdout (for debugging).
   void DebugPrintStack();

   // Returns the current lowest-level parent element in the element stack.
   HtmlElement* Parent() const;

  private:
   // Per-state character handlers.  Their names parallel the values of the
   // State enum below; presumably each one consumes a single input character
   // while the lexer is in the corresponding state (confirm in the .cc file).
   // Note that no handler is declared here for TAG_ATTR_NAME_SPACE.
   inline void EvalStart(char c);
   inline void EvalTag(char c);
   inline void EvalTagOpen(char c);
   inline void EvalTagClose(char c);
   inline void EvalTagCloseTerminate(char c);
   inline void EvalTagBriefClose(char c);
   inline void EvalTagBriefCloseAttr(char c);
   inline void EvalCommentStart1(char c);
   inline void EvalCommentStart2(char c);
   inline void EvalCommentBody(char c);
   inline void EvalCommentEnd1(char c);
   inline void EvalCommentEnd2(char c);
   inline void EvalCdataStart1(char c);
   inline void EvalCdataStart2(char c);
   inline void EvalCdataStart3(char c);
   inline void EvalCdataStart4(char c);
   inline void EvalCdataStart5(char c);
   inline void EvalCdataStart6(char c);
   inline void EvalCdataBody(char c);
   inline void EvalCdataEnd1(char c);
   inline void EvalCdataEnd2(char c);
   inline void EvalAttribute(char c);
   inline void EvalAttrName(char c);
   inline void EvalAttrEq(char c);
   inline void EvalAttrVal(char c);
   inline void EvalAttrValSq(char c);
   inline void EvalAttrValDq(char c);
   inline void EvalLiteralTag(char c);
   inline void EvalDirective(char c);

   // Helpers for building the element and attribute currently being lexed
   // (see element_, attr_name_ and attr_value_ below).
   // NOTE(review): has_value presumably carries the same meaning as
   // has_attr_value_ (<a n=> versus <a n>) -- confirm in the .cc file.
   void MakeElement();
   void MakeAttribute(bool has_value);
   void FinishAttribute(char c, bool has_value, bool brief_close);

   // Emit helpers, one per kind of lexed construct.
   // NOTE(review): presumably each turns the accumulated text into an event
   // handed to html_parse_ -- confirm in the .cc file.
   void EmitCdata();
   void EmitComment();
   void EmitLiteral();
   void EmitTagOpen(bool allow_implicit_close);
   void EmitTagClose(HtmlElement::CloseStyle close_style);
   void EmitTagBriefClose();
   void EmitDirective();

   // Emits an error message.  Takes a printf-style format string followed by
   // its arguments.
   // NOTE(review): the (2, 3) indices presumably count the implicit 'this'
   // pointer as argument 1 -- confirm against printf_format.h.
   void Warning(const char* format, ...) INSTAWEB_PRINTF_FORMAT(2, 3);

   // Takes an interned tag, and tries to find a matching HTML element on
   // the stack.  If it finds it, it pops all the intervening elements off
   // the stack, issuing warnings for each discarded tag, the matching element
   // is also popped off the stack, and returned.
   //
   // If the tag is not matched, then no mutations are done to the stack,
   // and NULL is returned.
   //
   // The tag name should be interned.
   // TODO(jmarantz): use type system
   HtmlElement* PopElementMatchingTag(Atom tag);

   // NOTE(review): presumably removes and returns the top of element_stack_
   // -- confirm in the .cc file.
   HtmlElement* PopElement();

   // NOTE(review): line_number is presumably the source line to record for
   // the close of element -- confirm in the .cc file.
   void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style,
                     int line_number);

   // Minimal i18n analysis.  With utf-8 and gb2312 we can do this
   // context-free, and thus the method can be static.  If we add
   // more encodings we may need to turn this into a non-static method.
   // Returns true when bit 0x80 of c is set.
   static inline bool IsI18nChar(char c) { return (c & 0x80) != 0; }

   // Determines whether a character can be used in a tag name as first char ...
   static inline bool IsLegalTagFirstChar(char c);
   // ... or subsequent char.
   static inline bool IsLegalTagChar(char c);

   // Determines whether a character can be used in an attribute name.
   static inline bool IsLegalAttrNameChar(char c);

   // Determines whether a character can be used in an attribute value.
   static inline bool IsLegalAttrValChar(char c);

   // The lexer is implemented as a pure state machine.  There is
   // no lookahead.  The state is understood primarily in this
   // enum, although there are a few state flavors that are managed
   // by the other member variables, notably: has_attr_value_ and
   // attr_name_.empty().  Those could be eliminated by adding
   // a few more explicit states.
   //
   // The trailing comment on each value shows example input consumed so far
   // when the lexer is in that state.
   enum State {
     START,
     TAG,                   // "<"
     TAG_CLOSE,             // "</"
     TAG_CLOSE_TERMINATE,   // "</x "
     TAG_OPEN,              // "<x"
     TAG_BRIEF_CLOSE,       // "<x/"
     TAG_BRIEF_CLOSE_ATTR,  // "<x /" or "<x y/" or "x y=/z" etc
     COMMENT_START1,        // "<!"
     COMMENT_START2,        // "<!-"
     COMMENT_BODY,          // "<!--"
     COMMENT_END1,          // "-"
     COMMENT_END2,          // "--"
     CDATA_START1,          // "<!["
     CDATA_START2,          // "<![C"
     CDATA_START3,          // "<![CD"
     CDATA_START4,          // "<![CDA"
     CDATA_START5,          // "<![CDAT"
     CDATA_START6,          // "<![CDATA"
     CDATA_BODY,            // "<![CDATA["
     CDATA_END1,            // "]"
     CDATA_END2,            // "]]"
     TAG_ATTRIBUTE,         // "<x "
     TAG_ATTR_NAME,         // "<x y"
     TAG_ATTR_NAME_SPACE,   // "<x y "
     TAG_ATTR_EQ,           // "<x y="
     TAG_ATTR_VAL,          // "<x y=x" value terminated by whitespace or >
     TAG_ATTR_VALDQ,        // '<x y="' value terminated by double-quote
     TAG_ATTR_VALSQ,        // "<x y='" value terminated by single-quote
     LITERAL_TAG,           // "<script " or "<iframe "
     DIRECTIVE              // "<!x"
   };

   HtmlParse* html_parse_;   // parser that receives the emitted events
   State state_;             // current state of the state machine
   std::string token_;       // accumulates tag names and comments
   std::string literal_;     // accumulates raw text to pass through
   std::string attr_name_;   // accumulates attribute name
   std::string attr_value_;  // accumulates attribute value
   const char* attr_quote_;  // quote string used to delimit attribute value
   bool has_attr_value_;     // distinguishes <a n=> from <a n>
   HtmlElement* element_;    // current element; used to collect attributes
   int line_;                // NOTE(review): presumably current input line
   int tag_start_line_;      // line at which we last transitioned to TAG state
   std::string id_;          // session id from StartParse; for error messages
   std::string literal_close_;  // specific tag to close, e.g. </script>

   // Tag sets consulted while lexing.
   // NOTE(review): presumably populated in the constructor -- confirm.
   AtomSet implicitly_closed_;
   AtomSet non_brief_terminated_tags_;
   AtomSet literal_tags_;
   // Stack of currently open elements; see Parent() and PopElement().
   std::vector<HtmlElement*> element_stack_;

   DISALLOW_COPY_AND_ASSIGN(HtmlLexer);
 };

 }  // namespace net_instaweb

 #endif  // NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_
	/**
	* Copyright 2010 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Author: jmarantz@google.com (Joshua Marantz)

	#ifndef NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_
	#define NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_

	#include <stdarg.h>
	#include <set>
	#include <vector>
	#include "base/basictypes.h"
	#include "net/instaweb/htmlparse/public/html_element.h"
	#include "net/instaweb/util/public/printf_format.h"
	#include <string>

	namespace net_instaweb {

	// Constructs a re-entrant HTML lexer. This lexer minimally parses tags,
	// attributes, and comments. It is intended to parse the Wild West of the
	// Web. It's designed to be tolerant of syntactic transgressions, merely
	// passing through unparseable chunks as Characters.
	//
	// TODO(jmarantz): refactor this with html_parse, so that this class owns
	// the symbol table and the event queue, and no longer needs to mutually
	// depend on HtmlParse. That will make it easier to unit-test.
	class HtmlLexer {
	 public:
	  // Creates a lexer bound to html_parse, the parser that receives the events
	  // produced by Parse().
	  // NOTE(review): html_parse is presumably borrowed rather than owned --
	  // confirm against the constructor/destructor definitions.
	  explicit HtmlLexer(HtmlParse* html_parse);
	  ~HtmlLexer();

	  // Initializes a new parse session.  id is only used for error messages.
	  void StartParse(const StringPiece& id);

	  // Parses a chunk of text, adding events to the parser by calling
	  // html_parse_->AddEvent(...).
	  // NOTE(review): size is presumably the number of chars readable at text --
	  // confirm against the definition.
	  void Parse(const char* text, int size);

	  // Completes parse, reporting any leftover text as a final
	  // HtmlCharacterEvent.
	  void FinishParse();

	  // Determines whether a tag should be terminated in HTML.
	  // NOTE(review): presumably a membership test on implicitly_closed_ --
	  // confirm against the definition.
	  bool IsImplicitlyClosedTag(Atom tag) const;

	  // Determines whether a tag can be terminated briefly (e.g. <tag/>).
	  // NOTE(review): presumably driven by non_brief_terminated_tags_ -- confirm
	  // against the definition.
	  bool TagAllowsBriefTermination(Atom tag) const;

	  // Prints the element stack to stdout (for debugging).
	  void DebugPrintStack();

	  // Returns the current lowest-level parent element in the element stack.
	  HtmlElement* Parent() const;

	 private:
	  // Per-state character handlers.  Their names parallel the values of the
	  // State enum below; presumably each one consumes a single input character
	  // while the lexer is in the corresponding state (confirm in the .cc file).
	  // Note that no handler is declared here for TAG_ATTR_NAME_SPACE.
	  inline void EvalStart(char c);
	  inline void EvalTag(char c);
	  inline void EvalTagOpen(char c);
	  inline void EvalTagClose(char c);
	  inline void EvalTagCloseTerminate(char c);
	  inline void EvalTagBriefClose(char c);
	  inline void EvalTagBriefCloseAttr(char c);
	  inline void EvalCommentStart1(char c);
	  inline void EvalCommentStart2(char c);
	  inline void EvalCommentBody(char c);
	  inline void EvalCommentEnd1(char c);
	  inline void EvalCommentEnd2(char c);
	  inline void EvalCdataStart1(char c);
	  inline void EvalCdataStart2(char c);
	  inline void EvalCdataStart3(char c);
	  inline void EvalCdataStart4(char c);
	  inline void EvalCdataStart5(char c);
	  inline void EvalCdataStart6(char c);
	  inline void EvalCdataBody(char c);
	  inline void EvalCdataEnd1(char c);
	  inline void EvalCdataEnd2(char c);
	  inline void EvalAttribute(char c);
	  inline void EvalAttrName(char c);
	  inline void EvalAttrEq(char c);
	  inline void EvalAttrVal(char c);
	  inline void EvalAttrValSq(char c);
	  inline void EvalAttrValDq(char c);
	  inline void EvalLiteralTag(char c);
	  inline void EvalDirective(char c);

	  // Helpers for building the element and attribute currently being lexed
	  // (see element_, attr_name_ and attr_value_ below).
	  // NOTE(review): has_value presumably carries the same meaning as
	  // has_attr_value_ (<a n=> versus <a n>) -- confirm in the .cc file.
	  void MakeElement();
	  void MakeAttribute(bool has_value);
	  void FinishAttribute(char c, bool has_value, bool brief_close);

	  // Emit helpers, one per kind of lexed construct.
	  // NOTE(review): presumably each turns the accumulated text into an event
	  // handed to html_parse_ -- confirm in the .cc file.
	  void EmitCdata();
	  void EmitComment();
	  void EmitLiteral();
	  void EmitTagOpen(bool allow_implicit_close);
	  void EmitTagClose(HtmlElement::CloseStyle close_style);
	  void EmitTagBriefClose();
	  void EmitDirective();

	  // Emits an error message.  Takes a printf-style format string followed by
	  // its arguments.
	  // NOTE(review): the (2, 3) indices presumably count the implicit 'this'
	  // pointer as argument 1 -- confirm against printf_format.h.
	  void Warning(const char* format, ...) INSTAWEB_PRINTF_FORMAT(2, 3);

	  // Takes an interned tag, and tries to find a matching HTML element on
	  // the stack. If it finds it, it pops all the intervening elements off
	  // the stack, issuing warnings for each discarded tag, the matching element
	  // is also popped off the stack, and returned.
	  //
	  // If the tag is not matched, then no mutations are done to the stack,
	  // and NULL is returned.
	  //
	  // The tag name should be interned.
	  // TODO(jmarantz): use type system
	  HtmlElement* PopElementMatchingTag(Atom tag);

	  // NOTE(review): presumably removes and returns the top of element_stack_
	  // -- confirm in the .cc file.
	  HtmlElement* PopElement();

	  // NOTE(review): line_number is presumably the source line to record for
	  // the close of element -- confirm in the .cc file.
	  void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style,
	                    int line_number);

	  // Minimal i18n analysis. With utf-8 and gb2312 we can do this
	  // context-free, and thus the method can be static. If we add
	  // more encodings we may need to turn this into a non-static method.
	  // Returns true when bit 0x80 of c is set.
	  static inline bool IsI18nChar(char c) { return (c & 0x80) != 0; }

	  // Determines whether a character can be used in a tag name as first char ...
	  static inline bool IsLegalTagFirstChar(char c);
	  // ... or subsequent char.
	  static inline bool IsLegalTagChar(char c);

	  // Determines whether a character can be used in an attribute name.
	  static inline bool IsLegalAttrNameChar(char c);

	  // Determines whether a character can be used in an attribute value.
	  static inline bool IsLegalAttrValChar(char c);

	  // The lexer is implemented as a pure state machine. There is
	  // no lookahead. The state is understood primarily in this
	  // enum, although there are a few state flavors that are managed
	  // by the other member variables, notably: has_attr_value_ and
	  // attr_name_.empty(). Those could be eliminated by adding
	  // a few more explicit states.
	  //
	  // The trailing comment on each value shows example input consumed so far
	  // when the lexer is in that state.
	  enum State {
	    START,
	    TAG,                   // "<"
	    TAG_CLOSE,             // "</"
	    TAG_CLOSE_TERMINATE,   // "</x "
	    TAG_OPEN,              // "<x"
	    TAG_BRIEF_CLOSE,       // "<x/"
	    TAG_BRIEF_CLOSE_ATTR,  // "<x /" or "<x y/" or "x y=/z" etc
	    COMMENT_START1,        // "<!"
	    COMMENT_START2,        // "<!-"
	    COMMENT_BODY,          // "<!--"
	    COMMENT_END1,          // "-"
	    COMMENT_END2,          // "--"
	    CDATA_START1,          // "<!["
	    CDATA_START2,          // "<![C"
	    CDATA_START3,          // "<![CD"
	    CDATA_START4,          // "<![CDA"
	    CDATA_START5,          // "<![CDAT"
	    CDATA_START6,          // "<![CDATA"
	    CDATA_BODY,            // "<![CDATA["
	    CDATA_END1,            // "]"
	    CDATA_END2,            // "]]"
	    TAG_ATTRIBUTE,         // "<x "
	    TAG_ATTR_NAME,         // "<x y"
	    TAG_ATTR_NAME_SPACE,   // "<x y "
	    TAG_ATTR_EQ,           // "<x y="
	    TAG_ATTR_VAL,          // "<x y=x" value terminated by whitespace or >
	    TAG_ATTR_VALDQ,        // '<x y="' value terminated by double-quote
	    TAG_ATTR_VALSQ,        // "<x y='" value terminated by single-quote
	    LITERAL_TAG,           // "<script " or "<iframe "
	    DIRECTIVE              // "<!x"
	  };

	  HtmlParse* html_parse_;   // parser that receives the emitted events
	  State state_;             // current state of the state machine
	  std::string token_;       // accumulates tag names and comments
	  std::string literal_;     // accumulates raw text to pass through
	  std::string attr_name_;   // accumulates attribute name
	  std::string attr_value_;  // accumulates attribute value
	  const char* attr_quote_;  // quote string used to delimit attribute value
	  bool has_attr_value_;     // distinguishes <a n=> from <a n>
	  HtmlElement* element_;    // current element; used to collect attributes
	  int line_;                // NOTE(review): presumably current input line
	  int tag_start_line_;      // line at which we last transitioned to TAG state
	  std::string id_;          // session id from StartParse; for error messages
	  std::string literal_close_;  // specific tag to close, e.g. </script>

	  // Tag sets consulted while lexing.
	  // NOTE(review): presumably populated in the constructor -- confirm.
	  AtomSet implicitly_closed_;
	  AtomSet non_brief_terminated_tags_;
	  AtomSet literal_tags_;
	  // Stack of currently open elements; see Parent() and PopElement().
	  std::vector<HtmlElement*> element_stack_;

	  DISALLOW_COPY_AND_ASSIGN(HtmlLexer);
	};

	} // namespace net_instaweb

	#endif // NET_INSTAWEB_HTMLPARSE_HTML_LEXER_H_