// src/pagespeed/kernel/html/html_parse.h - incubator-pagespeed-debian - Git at Google

 /*
  * Copyright 2010 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Author: jmarantz@google.com (Joshua Marantz)

 #ifndef PAGESPEED_KERNEL_HTML_HTML_PARSE_H_
 #define PAGESPEED_KERNEL_HTML_HTML_PARSE_H_

 #include <cstdarg>
 #include <cstddef>
 #include <list>
 #include <map>
 #include <set>
 #include <utility>
 #include <vector>

 #include "pagespeed/kernel/base/basictypes.h"
 #include "pagespeed/kernel/base/arena.h"
 #include "pagespeed/kernel/base/printf_format.h"
 #include "pagespeed/kernel/base/scoped_ptr.h"
 #include "pagespeed/kernel/base/string.h"
 #include "pagespeed/kernel/base/string_util.h"
 #include "pagespeed/kernel/base/symbol_table.h"
 #include "pagespeed/kernel/html/html_element.h"
 #include "pagespeed/kernel/html/html_name.h"
 #include "pagespeed/kernel/html/html_node.h"
 #include "pagespeed/kernel/http/content_type.h"
 #include "pagespeed/kernel/http/google_url.h"

 namespace net_instaweb {

 class DocType;
 class HtmlEvent;
 class HtmlFilter;
 class HtmlLexer;
 class MessageHandler;
 class Timer;

 // A set of pointers to HtmlEvents; the events cannot be modified through
 // the pointers stored in the set.
 typedef std::set <const HtmlEvent*> ConstHtmlEventSet;

 // Streaming Html Parser API.  Callbacks defined in HtmlFilter are
 // called on each parser token.
 //
 // Any number of filters can be added to the Html Parser; they are
 // organized in a chain.  Each filter processes a stream of SAX events
 // (HtmlEvent), interspersed by Flushes.  The filter operates on the
 // sequence of events between flushes (a flush-window), and the system
 // passes the (possibly mutated) event-stream to the next filter.
 //
 // An HTML Event is a lexical token provided by the parser, including:
 //     begin document
 //     end document
 //     begin element
 //     end element
 //     whitespace
 //     characters
 //     cdata
 //     comment
 //
 // The parser retains the sequence of events as a data structure:
 // list<HtmlEvent>.  HtmlEvents are sent to filters (HtmlFilter), as follows:
 //   foreach filter in filter-chain
 //     foreach event in flush-window
 //       apply filter to event
 //
 // Filters may mutate the event streams as they are being processed,
 // and these mutations will be seen by downstream filters.  The filters can
 // mutate any event that has not been flushed.  Supported mutations include:
 //   - Removing an HTML element whose begin/end tags are both within
 //     the flush window.  This will also remove any nested elements.
 //   - Removing other HTML events
 //   - Inserting new elements (automatically inserts begin/end events)
 //     before or after "current" event
 //   - Inserting new events, before or after "current" event
 class HtmlParse {
  public:
   explicit HtmlParse(MessageHandler* message_handler);
   virtual ~HtmlParse();

   // Application methods for parsing functions and adding filters

   // Add a new html filter to the filter-chain, without taking ownership
   // of it.
   void AddFilter(HtmlFilter* filter);

   // Initiate a chunked parsing session.  Finish with FinishParse.  The
   // url is only used to resolve relative URLs; the contents are not
   // directly fetched.  The caller must supply the text and call ParseText.
   //
   // Returns whether the URL is valid.
   bool StartParse(const StringPiece& url) {
     return StartParseWithType(url, kContentTypeHtml);
   }
   bool StartParseWithType(const StringPiece& url,
                           const ContentType& content_type) {
     return StartParseId(url, url, content_type);
   }

   // Returns whether the google_url() URL is valid.
   bool is_url_valid() const { return url_valid_; }

   // Mostly useful for file-based rewriters so that messages can reference
   // the HTML file and produce navigable errors.
   //
   // Returns whether the URL is valid.
   virtual bool StartParseId(const StringPiece& url, const StringPiece& id,
                             const ContentType& content_type);

   // Sets url() for test purposes. Normally this is done by StartParseId,
   // but sometimes tests need to set it without worrying about parse
   // state.
   void SetUrlForTesting(const StringPiece& url);

   // Parses an arbitrary block of an html file, queuing up the events.  Call
   // Flush to send the events through the Filter.
   //
   // To parse an entire file, first call StartParse(), then call
   // ParseText on the file contents (in whatever size chunks are convenient),
   // then call FinishParse().
   //
   // It is invalid to call ParseText when the StartParse* routines returned
   // false.
   // Parses |size| bytes starting at |content|.  The work is delegated to the
   // protected virtual ParseTextInternal().
   void ParseText(const char* content, int size) {
     ParseTextInternal(content, size);
   }
   // StringPiece convenience overload: unpacks |sp| into a pointer and a
   // byte count and delegates to ParseTextInternal().
   void ParseText(const StringPiece& sp) {
     const char* text = sp.data();
     // ParseTextInternal takes an int length, so the size is converted here.
     const int num_bytes = static_cast<int>(sp.size());
     ParseTextInternal(text, num_bytes);
   }

   // Flush the currently queued events through the filters.  It is desirable
   // for large web pages, particularly dynamically generated ones, to start
   // getting delivered to the browser as soon as they are ready.  On the
   // other hand, rewriting is more powerful when more of the content can
   // be considered for image/css/js spriting.  This method should be called
   // when the controlling network process wants to induce a new chunk of
   // output.  The less you call this function the better the rewriting will
   // be.
   //
   // It is invalid to call Flush when the StartParse* routines returned
   // false.
   //
   // If this is called from a Filter, the request will be deferred until after
   // currently active filters are completed.
   virtual void Flush();

   // Finish a chunked parsing session.  This also induces a Flush.
   //
   // It is invalid to call FinishParse when the StartParse* routines returned
   // false.
   virtual void FinishParse();


   // Utility methods for implementing filters

   // These "New*" functions do *not* append the new node to the parent; you
   // must do that yourself.  Also note that in the context of a filter, you
   // must add parents to the DOM in some fashion, before appending children to
   // parents.
   HtmlCdataNode* NewCdataNode(HtmlElement* parent,
                               const StringPiece& contents);
   HtmlCharactersNode* NewCharactersNode(HtmlElement* parent,
                                         const StringPiece& literal);
   HtmlCommentNode* NewCommentNode(HtmlElement* parent,
                                   const StringPiece& contents);
   HtmlDirectiveNode* NewDirectiveNode(HtmlElement* parent,
                                       const StringPiece& contents);
   HtmlIEDirectiveNode* NewIEDirectiveNode(HtmlElement* parent,
                                           const StringPiece& contents);
   void InsertScriptAfterCurrent(StringPiece text, bool external);
   void InsertScriptBeforeCurrent(StringPiece text, bool external);

   // Creates and appends an Anchor tag into the HTML, and then returns it.
   // TODO(jmaessen): refactor and use this in the relevant places.
   HtmlElement* AppendAnchor(StringPiece link, StringPiece text,
                             HtmlElement* parent);

   // DOM-manipulation methods.
   // TODO(sligocki): Find Javascript equivalents and list them or even change
   // our names to be consistent.

   // This and downstream filters will then see inserted elements but upstream
   // filters will not.

   // Note: In Javascript the first is called insertBefore and takes the arg
   // in the opposite order.
   // Note: new_node must not already be in the DOM.
   void InsertNodeBeforeNode(const HtmlNode* existing_node, HtmlNode* new_node);
   void InsertNodeAfterNode(const HtmlNode* existing_node, HtmlNode* new_node);

   // These are backwards-compatibility wrappers for use by Pagespeed Insights.
   // TODO(morlovich): Remove them after PSI is synced.
   // Forwards both arguments, unchanged, to InsertNodeBeforeNode().
   void InsertElementBeforeElement(const HtmlNode* existing_element,
                                   HtmlNode* new_element) {
     InsertNodeBeforeNode(existing_element, new_element);
   }

   // Forwards both arguments, unchanged, to InsertNodeAfterNode().
   void InsertElementAfterElement(const HtmlNode* existing_element,
                                  HtmlNode* new_element) {
     InsertNodeAfterNode(existing_element, new_element);
   }

   // Add a new child element at the beginning or end of existing_parent's
   // children. Named after Javascript's appendChild method.
   // Note: new_child must not already be in the DOM.
   void PrependChild(const HtmlElement* existing_parent, HtmlNode* new_child);
   void AppendChild(const HtmlElement* existing_parent, HtmlNode* new_child);

   // Insert a new element before the current one.  current_ remains unchanged.
   // Note: new_node must not already be in the DOM.
   void InsertNodeBeforeCurrent(HtmlNode* new_node);

   // Insert a new element after the current one, moving current_ to the new
   // element.  In a Filter, the flush-loop will advance past this on
   // the next iteration.
   // Note: new_node must not already be in the DOM.
   void InsertNodeAfterCurrent(HtmlNode* new_node);

   // Enclose element around two elements in a sequence.  The first
   // element must be the same as, or precede the last element in the
   // event-stream, and this is not checked, but the two elements do
   // not need to be adjacent.  They must have the same parent to start
   // with.
   bool AddParentToSequence(HtmlNode* first, HtmlNode* last,
                            HtmlElement* new_parent);

   // Moves current node (and all children) to an already-existing parent,
   // where they will be placed as the last elements in that parent.
   // Returns false if the operation could not be performed because either
   // the node or its parent was partially or wholly flushed.
   // Note: Will not work if called from StartElement() event.
   //
   // This differs from AppendChild() because it moves the current node,
   // which is already in the DOM, rather than adding a new node.
   bool MoveCurrentInto(HtmlElement* new_parent);

   // Moves current node (and all children) directly before existing_node.
   // Note: Will not work if called from StartElement() event.
   //
   // This differs from InsertNodeBeforeNode() because it moves the
   // current node, which is already in the DOM, rather than adding a new node.
   bool MoveCurrentBefore(HtmlNode* existing_node);

   // If the given node is rewritable, delete it and all of its children (if
   // any) and return true; otherwise, do nothing and return false.
   // Note: Javascript appears to use removeChild for this.
   bool DeleteNode(HtmlNode* node);

   // Delete a parent element, retaining any children and moving them to
   // reside under the parent's parent.  Note that an element must be
   // fully inside the flush-window for this to work.  Returns false on
   // failure.
   //
   // See also MakeElementInvisible
   bool DeleteSavingChildren(HtmlElement* element);

   // Similar in effect to DeleteSavingChildren, but this has no structural
   // effect on the DOM.  Instead it sets a bit in the HtmlElement that prevents
   // it from being rendered by HtmlWriterFilter, though all its contents will
   // be rendered.
   //
   // This fails, returning false, if the element's StartElement event has
   // already been flushed.
   bool MakeElementInvisible(HtmlElement* element);

   // Determines whether the element, in the context of its flush
   // window, has children.  If the element is not rewritable, or
   // has not been closed yet, or inserted into the DOM event stream,
   // then 'false' is returned.
   //
   // Note that the concept of the Flush Window is important because the
   // knowledge of an element's children is not limited to the current
   // event being presented to a Filter.  A Filter can call this method
   // in the StartElement of an event to see if any children are going
   // to be coming.  Of course, if the StartElement is at the end of a
   // Flush window, then we won't know about the children, but IsRewritable
   // will also be false.
   bool HasChildrenInFlushWindow(HtmlElement* element);

   // If possible, replace the existing node with the new node and return true;
   // otherwise, do nothing and return false.
   bool ReplaceNode(HtmlNode* existing_node, HtmlNode* new_node);

   // Creates another element with the same name and attributes as in_element.
   // Does not duplicate the children or insert it anywhere.
   HtmlElement* CloneElement(HtmlElement* in_element);

   // Creates an element whose tag name is given as text; the text is turned
   // into an HtmlName via MakeName() and passed to the HtmlName overload.
   HtmlElement* NewElement(HtmlElement* parent, const StringPiece& str) {
     const HtmlName tag_name = MakeName(str);
     return NewElement(parent, tag_name);
   }
   // Creates an element whose tag name is given as a keyword; the keyword is
   // turned into an HtmlName via MakeName() and passed to the HtmlName
   // overload.
   HtmlElement* NewElement(HtmlElement* parent, HtmlName::Keyword keyword) {
     const HtmlName tag_name = MakeName(keyword);
     return NewElement(parent, tag_name);
   }
   HtmlElement* NewElement(HtmlElement* parent, const HtmlName& name);

   // Adds an attribute named by |keyword| to |element|, using the
   // DOUBLE_QUOTE quote style.  For both versions of AddAttribute:
   // Pass in NULL for value to add an attribute with no value at all
   //   ex: <script data-pagespeed-no-transform>
   // Pass in "" for value if you want the value to be the empty string
   //   ex: <div style="">
   void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword,
                     const StringPiece& value) {
     const HtmlName attr_name = MakeName(keyword);
     element->AddAttribute(attr_name, value, HtmlElement::DOUBLE_QUOTE);
   }
   // Same as the keyword-based AddAttribute, except that the attribute name
   // is given as text and converted with MakeName().
   void AddAttribute(HtmlElement* element, StringPiece name,
                     const StringPiece& value) {
     const HtmlName attr_name = MakeName(name);
     element->AddAttribute(attr_name, value, HtmlElement::DOUBLE_QUOTE);
   }
   // Adds an attribute named by |keyword| to |element| by forwarding
   // |escaped_value| to HtmlElement::AddEscapedAttribute with the DOUBLE_QUOTE
   // quote style.
   void AddEscapedAttribute(HtmlElement* element, HtmlName::Keyword keyword,
                            const StringPiece& escaped_value) {
     const HtmlName attr_name = MakeName(keyword);
     element->AddEscapedAttribute(attr_name, escaped_value,
                                  HtmlElement::DOUBLE_QUOTE);
   }
   // Renames |attribute| to the name produced by MakeName(keyword).
   void SetAttributeName(HtmlElement::Attribute* attribute,
                         HtmlName::Keyword keyword) {
     const HtmlName new_name = MakeName(keyword);
     attribute->set_name(new_name);
   }

   HtmlName MakeName(const StringPiece& str);
   HtmlName MakeName(HtmlName::Keyword keyword);

   bool IsRewritable(const HtmlNode* node) const;
   // IsRewritable will return false for a node if either the open or close tag
   // has been flushed, but this is too conservative if we only want to call
   // AppendChild on that node, since we can append even if the open tag has
   // already been flushed.
   bool CanAppendChild(const HtmlNode* node) const;

   void ClearElements();

   // Log the HtmlEvent queue_ to the message_handler_ for debugging.
   void DebugLogQueue();

   // Print the HtmlEvent queue_ to stdout for debugging.
   void DebugPrintQueue();

   // Implementation helper with detailed knowledge of html parsing libraries
   friend class HtmlLexer;

   // Determines whether a tag should be terminated in HTML, e.g. <meta ..>.
   // We do not expect to see a close-tag for meta and should never insert one.
   bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const;

   // Determines whether a tag should be interpreted as a 'literal'
   // tag. That is, a tag whose contents are not parsed until a
   // corresponding matching end tag is encountered.
   static bool IsLiteralTag(HtmlName::Keyword keyword);

   // Determines whether a tag is interpreted as a 'literal' tag in
   // some user agents. Since some user agents will interpret the
   // contents of these tags, our parser never treats them as literal
   // tags. However, a filter that wants to insert new tags that should
   // be processed by all user agents should not insert those tags into
   // a tag that is sometimes parsed as a literal tag. Those filters
   // can use this method to determine if they are within such a tag.
   static bool IsSometimesLiteralTag(HtmlName::Keyword keyword);

   // An optionally closed tag ranges from <p>, which is typically not closed,
   // but we infer the closing from context.  Also consider <html>, which usually
   // is closed but not always.  E.g. www.google.com does not close its html tag.
   bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const;

   // Determines whether a tag allows brief termination in HTML, e.g. <tag/>
   bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const;

   // Returns the MessageHandler pointer held by this parser.
   MessageHandler* message_handler() const { return message_handler_; }
   // Gets the current location information; typically to help with error
   // messages.  Returns url_ as a C string.
   const char* url() const { return url_.c_str(); }
   // Gets a parsed GoogleUrl& corresponding to url().
   const GoogleUrl& google_url() const { return google_url_; }
   // Returns id_ as a C string.  This is the value the *HereV() reporting
   // methods pass as the file argument.
   const char* id() const { return id_.c_str(); }
   // Returns line_number_, the value the *HereV() reporting methods pass as
   // the line argument.
   int line_number() const { return line_number_; }
   // Returns URL (or id) and line number as a string, to be used in messages.
   GoogleString UrlLine() const {
     // Formatted as "<id>:<line>".
     const char* const location = id();
     const int line = line_number();
     return StringPrintf("%s:%d", location, line);
   }

   // Return the current assumed doctype of the document (based on the content
   // type and any HTML directives encountered so far).
   const DocType& doctype() const;

   // Interface for any caller to report an error message via the message handler
   void Info(const char* filename, int line, const char* msg, ...)
       INSTAWEB_PRINTF_FORMAT(4, 5);
   void Warning(const char* filename, int line, const char* msg, ...)
       INSTAWEB_PRINTF_FORMAT(4, 5);
   void Error(const char* filename, int line, const char* msg, ...)
       INSTAWEB_PRINTF_FORMAT(4, 5);
   void FatalError(const char* filename, int line, const char* msg, ...)
       INSTAWEB_PRINTF_FORMAT(4, 5);

   void InfoV(const char* file, int line, const char *msg, va_list args);
   void WarningV(const char* file, int line, const char *msg, va_list args);
   void ErrorV(const char* file, int line, const char *msg, va_list args);
   void FatalErrorV(const char* file, int line, const char* msg, va_list args);

   // Report error message with current parsing filename and linenumber.
   void InfoHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
   void WarningHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
   void ErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
   void FatalErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);

   // If set_log_rewrite_timing(true) has been called, logs the given message
   // at info level with a time offset from the parsing start time.
   void ShowProgress(const char* message);

   void InfoHereV(const char *msg, va_list args) {
     InfoV(id_.c_str(), line_number_, msg, args);
   }
   void WarningHereV(const char *msg, va_list args) {
     WarningV(id_.c_str(), line_number_, msg, args);
   }
   void ErrorHereV(const char *msg, va_list args) {
     ErrorV(id_.c_str(), line_number_, msg, args);
   }
   void FatalErrorHereV(const char* msg, va_list args) {
     FatalErrorV(id_.c_str(), line_number_, msg, args);
   }

   void AddElement(HtmlElement* element, int line_number);
   void CloseElement(HtmlElement* element, HtmlElement::Style style,
                     int line_number);

   // Run a filter on the current queue of parse nodes.
   void ApplyFilter(HtmlFilter* filter);

   // Provide a timer to help report the timing of each filter.  You must also
   // call set_log_rewrite_timing(true) to turn on this reporting.  Only the
   // pointer is stored by this setter.
   void set_timer(Timer* timer) { timer_ = timer; }
   // Returns the stored timer_ pointer.
   Timer* timer() const { return timer_; }
   // Stores x in log_rewrite_timing_, the flag that turns the timing reports
   // on (true) or off (false).
   void set_log_rewrite_timing(bool x) { log_rewrite_timing_ = x; }

   // Adds a filter to be called during parsing as new events are added.
   // Takes ownership of the HtmlFilter passed in.
   void add_event_listener(HtmlFilter* listener);

   // Inserts a comment before or after the current node.  The function tries to
   // pick an intelligent place depending on the document structure and
   // whether the current node is a start-element, end-element, or a leaf.
   // Returns true if it successfully added the comment, and false if it was not
   // safe for the comment to be inserted. This can happen when a comment is
   // inserted in a literal element (script or style) after the opening tag has
   // been flushed, but the closing tag has not been seen yet. In this case, the
   // caller can buffer the messages until EndElement is reached and call
   // InsertComment at that point.
   bool InsertComment(StringPiece sp);

   // Sets the limit on the maximum number of bytes that should be parsed.
   void set_size_limit(int64 x);
   // Returns whether we have exceeded the size limit.
   bool size_limit_exceeded() const;

   // For debugging purposes. If this vector is supplied, DetermineEnabledFilters
   // will populate it with the list of Filters that were disabled, plus the
   // associated reason, if supplied by the Filter. Caller retains ownership
   // of the pointer.
   void SetDynamicallyDisabledFilterList(StringVector* list) {
     dynamically_disabled_filter_list_ = list;
   }

   // Temporarily removes the current node from the parse tree.  This must
   // be run as part of a filter callback, and it is the responsibility of
   // the filter to save the node and call RestoreNode on it later.
   //
   // If current node is an HtmlElement, this must be called on the
   // StartElement event, not the EndElement event.  When an element is
   // deferred, all its children are deferred as well.
   //
   // It is fine to restore a node after a Flush.  Note that while most
   // HtmlNode objects are freed after a Flush window, a deferred one will
   // be retained until it is Restored, or until the end of the document.
   //
   // If a node is not restored at end of document, a warning will be
   // printed and the stored data cleaned up.  Functionally it will be
   // as if the filter called DeleteNode.
   //
   // Note that a filter that defers a node and never restores it will never
   // see the EndElement for that node.
   //
   // Note that if you defer a Characters node and restore it next to
   // another Characters node, they will be coalesced prior to the next
   // filter, but this filter will not see the coalesced nodes.
   // Similarly, if you defer a non-characters node that was previously
   // separating two characters nodes, that will also result in a
   // coalesce seen only by downstream filters.
   void DeferCurrentNode();

   // Restores a node, inserting it after the current event.  If the node
   // is an HtmlElement, the iteration will proceed with the first child node,
   // or, if there were no children, then the EndElement method.
   //
   // Note: you cannot restore during Flush().
   void RestoreDeferredNode(HtmlNode* deferred_node);

   // Returns whether the filter pipeline can rewrite urls.
   bool can_modify_urls() {
     return can_modify_urls_;
   }

  protected:
   typedef std::vector<HtmlFilter*> FilterVector;
   typedef std::list<HtmlFilter*> FilterList;
   typedef std::pair<HtmlNode*, HtmlEventList*> DeferredNode;
   typedef std::map<const HtmlNode*, HtmlEventList*> NodeToEventListMap;
   typedef std::map<HtmlFilter*, DeferredNode> FilterElementMap;
   typedef std::set<const HtmlNode*> NodeSet;

   // HtmlParse::FinishParse() is equivalent to the sequence of
   // BeginFinishParse(); Flush(); EndFinishParse().
   // Split up to permit asynchronous versions.
   void BeginFinishParse();
   void EndFinishParse();

   // Clears any cached state we have while this object is laying
   // around for recycling.
   void Clear();

   // Returns the number of events on the event queue.
   size_t GetEventQueueSize();

   virtual void ParseTextInternal(const char* content, int size);

   // Calls DetermineFiltersBehaviorImpl in an idempotent way: while
   // determine_filter_behavior_called_ is set, further calls do nothing.
   void DetermineFiltersBehavior() {
     if (determine_filter_behavior_called_) {
       return;
     }
     determine_filter_behavior_called_ = true;
     // Reset the flag before DetermineFiltersBehaviorImpl runs.
     can_modify_urls_ = false;
     DetermineFiltersBehaviorImpl();
   }

   void DetermineFilterListBehavior(const FilterList& list) {
     for (FilterList::const_iterator i = list.begin(); i != list.end(); ++i) {
       CheckFilterBehavior(*i);
     }
   }

   void CheckFilterBehavior(HtmlFilter* filter);

   // Call DetermineEnabled() on each filter. Should be called after
   // the property cache lookup has finished since some filters depend on
   // pcache results in their DetermineEnabled implementation. If a subclass has
   // filters that the base HtmlParse doesn't know about, it should override this
   // function and call DetermineEnabled on each of its filters, along with
   // calling the base DetermineFiltersBehaviorImpl.
   // For all enabled filters the CanModifyUrl() flag will be aggregated (or'ed)
   // and can be queried via the can_modify_urls() function.
   virtual void DetermineFiltersBehaviorImpl();

  private:
   void ApplyFilterHelper(HtmlFilter* filter);
   HtmlEventListIterator Last();  // Last element in queue
   bool IsInEventWindow(const HtmlEventListIterator& iter) const;
   void InsertNodeBeforeEvent(const HtmlEventListIterator& event,
                              HtmlNode* new_node);
   void InsertNodeAfterEvent(const HtmlEventListIterator& event,
                             HtmlNode* new_node);
   bool MoveCurrentBeforeEvent(const HtmlEventListIterator& move_to);
   bool IsDescendantOf(const HtmlNode* possible_child,
                       const HtmlNode* possible_parent);
   void SanityCheck();
   void CheckEventParent(HtmlEvent* event, HtmlElement* expect,
                         HtmlElement* actual);
   void CheckParentFromAddEvent(HtmlEvent* event);
   void FixParents(const HtmlEventListIterator& begin,
                   const HtmlEventListIterator& end_inclusive,
                   HtmlElement* new_parent);
   void CoalesceAdjacentCharactersNodes();
   void ClearEvents();
   void EmitQueue(MessageHandler* handler);
   inline void NextEvent();
   void ClearDeferredNodes();
   inline bool IsRewritableIgnoringDeferral(const HtmlNode* node) const;
   inline bool IsRewritableIgnoringEnd(const HtmlNode* node) const;
   void SetupScript(StringPiece text, bool external, HtmlElement* script);

   // Visible for testing only, via HtmlTestingPeer
   friend class HtmlTestingPeer;
   void AddEvent(HtmlEvent* event);
   void SetCurrent(HtmlNode* node);
   // Stores x in coalesce_characters_.
   // NOTE(review): presumably this enables/disables
   // CoalesceAdjacentCharactersNodes() -- confirm in the implementation file.
   void set_coalesce_characters(bool x) { coalesce_characters_ = x; }
   // Returns string_table_.string_bytes_allocated().
   size_t symbol_table_size() const {
     return string_table_.string_bytes_allocated();
   }

   // If a FLUSH occurs in the middle of a script, style, or other tag
   // whose contents can only be a Characters block, then we will buffer
   // up the start of the script tag and not emit it and the Characters block
   // until after we see the close script tag.  This function enforces that
   // right before calling the Filters.
   void DelayLiteralTag();

   FilterVector event_listeners_;
   SymbolTableSensitive string_table_;
   FilterList filters_;
   HtmlLexer* lexer_;
   Arena<HtmlNode> nodes_;
   HtmlEventList queue_;
   HtmlEventListIterator current_;
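   // NOTE(review): the following comment looks stale -- no member recording
   // whether current_ was deleted is declared in this class, and the comment
   // does not describe message_handler_.  Confirm and remove.
   // Have we deleted current? Then we shouldn't do certain manipulations to it.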
   MessageHandler* message_handler_;
   GoogleString url_;
   GoogleUrl google_url_;
   GoogleString id_;  // Per-request identifier string used in error messages.
   int line_number_;
   bool skip_increment_;
   bool determine_filter_behavior_called_;
   bool can_modify_urls_;
   bool determine_enabled_filters_called_;
   bool need_sanity_check_;
   bool coalesce_characters_;
   bool need_coalesce_characters_;
   bool url_valid_;
   bool log_rewrite_timing_;  // Should we time the speed of parsing?
   bool running_filters_;
   int64 parse_start_time_us_;
   scoped_ptr<HtmlEvent> delayed_start_literal_;
   Timer* timer_;
   HtmlFilter* current_filter_;      // Filter currently running in ApplyFilter

   // When deferring a node that spans a flush window, we present upstream
   // filters with a view of the event-stream that is not impacted by the
   // deferral.  To implement this, at the beginning of each flush window,
   // we do the queue_ mutation for any outstanding deferrals right before
   // running the filter that deferred them.
   FilterElementMap open_deferred_nodes_;

   // Keeps track of the deferred nodes that have not yet been restored.
   NodeToEventListMap deferred_nodes_;

   // We use the node-defer logic to implement DeleteNode for a node that
   // hasn't been closed yet.  The only difference is that you cannot
   // restore a deleted node, and the parser will not print a warning if
   // a deleted node is never restored.
   NodeSet deferred_deleted_nodes_;

   StringVector* dynamically_disabled_filter_list_;

   DISALLOW_COPY_AND_ASSIGN(HtmlParse);
 };

 }  // namespace net_instaweb

 #endif  // PAGESPEED_KERNEL_HTML_HTML_PARSE_H_