| /* |
| * Copyright 2010 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // Author: jmarantz@google.com (Joshua Marantz) |
| |
| #ifndef PAGESPEED_KERNEL_HTML_HTML_PARSE_H_ |
| #define PAGESPEED_KERNEL_HTML_HTML_PARSE_H_ |
| |
| #include <cstdarg> |
| #include <cstddef> |
| #include <list> |
| #include <map> |
| #include <set> |
| #include <utility> |
| #include <vector> |
| |
| #include "pagespeed/kernel/base/basictypes.h" |
| #include "pagespeed/kernel/base/arena.h" |
| #include "pagespeed/kernel/base/printf_format.h" |
| #include "pagespeed/kernel/base/scoped_ptr.h" |
| #include "pagespeed/kernel/base/string.h" |
| #include "pagespeed/kernel/base/string_util.h" |
| #include "pagespeed/kernel/base/symbol_table.h" |
| #include "pagespeed/kernel/html/html_element.h" |
| #include "pagespeed/kernel/html/html_name.h" |
| #include "pagespeed/kernel/html/html_node.h" |
| #include "pagespeed/kernel/http/content_type.h" |
| #include "pagespeed/kernel/http/google_url.h" |
| |
| namespace net_instaweb { |
| |
| // Forward declarations: this header names these classes only through |
| // pointers and references (including scoped_ptr), so no includes are needed. |
| class DocType; |
| class HtmlEvent; |
| class HtmlFilter; |
| class HtmlLexer; |
| class MessageHandler; |
| class Timer; |
| |
| // A set of pointers to const HtmlEvent objects. |
| typedef std::set<const HtmlEvent*> ConstHtmlEventSet; |
| |
| // Streaming Html Parser API. Callbacks defined in HtmlFilter are |
| // called on each parser token. |
| // |
| // Any number of filters can be added to the Html Parser; they are |
| // organized in a chain. Each filter processes a stream of SAX events |
| // (HtmlEvent), interspersed by Flushes. The filter operates on the |
| // sequence of events between flushes (a flush-window), and the system |
| // passes the (possibly mutated) event-stream to the next filter. |
| // |
| // An HTML Event is a lexical token provided by the parser, including: |
| // begin document |
| // end document |
| // begin element |
| // end element |
| // whitespace |
| // characters |
| // cdata |
| // comment |
| // |
| // The parser retains the sequence of events as a data structure: |
| // list<HtmlEvent>. HtmlEvents are sent to filters (HtmlFilter), as follows: |
| // foreach filter in filter-chain |
| // foreach event in flush-window |
| // apply filter to event |
| // |
| // Filters may mutate the event streams as they are being processed, |
| // and these mutations will be seen by downstream filters. The filters can |
| // mutate any event that has not been flushed. Supported mutations include: |
| // - Removing an HTML element whose begin/end tags are both within |
| // the flush window. This will also remove any nested elements. |
| // - Removing other HTML events |
| // - Inserting new elements (automatically inserts begin/end events) |
| // before or after "current" event |
| // - Inserting new events, before or after "current" event |
| class HtmlParse { |
| public: |
| explicit HtmlParse(MessageHandler* message_handler); |
| virtual ~HtmlParse(); |
| |
| // Application-facing methods: registering filters and driving a parse. |
| |
| // Adds a new html filter to the filter-chain. Ownership is not taken: the |
| // caller keeps ownership of 'filter'. |
| void AddFilter(HtmlFilter* filter); |
| |
| // Begins a chunked parsing session, which must be ended with FinishParse. |
| // The url is used only to resolve relative URLs; its contents are not |
| // fetched directly. The caller supplies the text by calling ParseText. |
| // |
| // Returns whether the URL is valid. |
| bool StartParse(const StringPiece& url) { |
| const bool url_ok = StartParseWithType(url, kContentTypeHtml); |
| return url_ok; |
| } |
| |
| // Same as StartParse, but with an explicit content type. The url is also |
| // passed to StartParseId as the id. |
| // |
| // Returns whether the URL is valid. |
| bool StartParseWithType(const StringPiece& url, |
| const ContentType& content_type) { |
| const StringPiece& parse_id = url; |
| return StartParseId(url, parse_id, content_type); |
| } |
| |
| // Returns whether the google_url() URL is valid. |
| bool is_url_valid() const { |
| return url_valid_; |
| } |
| |
| // Starts a parse with an explicit 'id' for messages. Mostly useful for |
| // file-based rewriters, so messages can name the HTML file (navigable errors). |
| // |
| // Returns whether the URL is valid. |
| virtual bool StartParseId(const StringPiece& url, const StringPiece& id, |
| const ContentType& content_type); |
| |
| // Sets url() for test purposes. Normally StartParseId does this, but |
| // tests sometimes need to set the url without worrying about parse |
| // state. |
| void SetUrlForTesting(const StringPiece& url); |
| |
| // Parses one arbitrary block of an html file and queues up the resulting |
| // events. Call Flush to send the queued events through the filters. |
| // |
| // A whole file is parsed by calling StartParse(), then ParseText on the |
| // file contents (in chunks of any convenient size), then FinishParse(). |
| // |
| // ParseText must not be called if the StartParse* routine returned false. |
| void ParseText(const char* content, int size) { |
| ParseTextInternal(content, size); |
| } |
| |
| // StringPiece convenience overload; the length is narrowed to int, which is |
| // what ParseTextInternal accepts. |
| void ParseText(const StringPiece& sp) { |
| const int size = static_cast<int>(sp.size()); |
| ParseText(sp.data(), size); |
| } |
| |
| // Flushes the currently queued events through the filters. Large web |
| // pages, particularly dynamically generated ones, should start being |
| // delivered to the browser as soon as they are ready. On the other hand, |
| // rewriting is more powerful when more of the content can be considered |
| // at once, e.g. for image/css/js spriting. Call this method when the |
| // controlling network process wants to induce a new chunk of output. |
| // The less often this function is called, the better the rewriting |
| // will be. |
| // |
| // Flush must not be called if the StartParse* routine that began this |
| // parse returned false. |
| // |
| // When called from within a Filter, the flush is deferred until the |
| // currently active filters have completed. |
| virtual void Flush(); |
| |
| // Finishes a chunked parsing session. This also induces a Flush. |
| // |
| // FinishParse must not be called if the StartParse* routine that began |
| // this parse returned false. |
| virtual void FinishParse(); |
| |
| |
| // Utility methods for implementing filters. |
| |
| // The "New*" functions create a node but do *not* append it to 'parent'; |
| // the caller must do that. Also note that, within a filter, a parent must |
| // itself be added to the DOM in some fashion before children are appended |
| // to it. |
| HtmlCdataNode* NewCdataNode(HtmlElement* parent, |
| const StringPiece& contents); |
| HtmlCharactersNode* NewCharactersNode(HtmlElement* parent, |
| const StringPiece& literal); |
| HtmlCommentNode* NewCommentNode(HtmlElement* parent, |
| const StringPiece& contents); |
| HtmlDirectiveNode* NewDirectiveNode(HtmlElement* parent, |
| const StringPiece& contents); |
| HtmlIEDirectiveNode* NewIEDirectiveNode(HtmlElement* parent, |
| const StringPiece& contents); |
| void InsertScriptAfterCurrent(StringPiece text, bool external); |
| void InsertScriptBeforeCurrent(StringPiece text, bool external); |
| |
| // Creates an anchor tag, appends it into the HTML, and then returns it. |
| // TODO(jmaessen): refactor and use this in the relevant places. |
| HtmlElement* AppendAnchor(StringPiece link, StringPiece text, |
| HtmlElement* parent); |
| |
| // DOM-manipulation methods. |
| // TODO(sligocki): Find Javascript equivalents and list them or even change |
| // our names to be consistent. |
| |
| // Inserted elements are seen by the inserting filter and by downstream |
| // filters, but not by upstream filters. |
| |
| // Note: Javascript calls the first of these insertBefore, and takes the |
| // args in the opposite order. |
| // Note: new_node must not already be in the DOM. |
| void InsertNodeBeforeNode(const HtmlNode* existing_node, HtmlNode* new_node); |
| void InsertNodeAfterNode(const HtmlNode* existing_node, HtmlNode* new_node); |
| |
| // Backwards-compatibility wrappers for use by Pagespeed Insights; each one |
| // simply forwards to the matching InsertNode*Node method. |
| // TODO(morlovich): Remove them after PSI is synced. |
| void InsertElementBeforeElement(const HtmlNode* existing_element, |
| HtmlNode* new_element) { |
| this->InsertNodeBeforeNode(existing_element, new_element); |
| } |
| |
| void InsertElementAfterElement(const HtmlNode* existing_element, |
| HtmlNode* new_element) { |
| this->InsertNodeAfterNode(existing_element, new_element); |
| } |
| |
| // Adds new_child at the beginning (PrependChild) or the end (AppendChild) |
| // of existing_parent's children. Named after Javascript's appendChild. |
| // Note: new_child must not already be in the DOM. |
| void PrependChild(const HtmlElement* existing_parent, HtmlNode* new_child); |
| void AppendChild(const HtmlElement* existing_parent, HtmlNode* new_child); |
| |
| // Inserts a new node before the current one. current_ remains unchanged. |
| // Note: new_node must not already be in the DOM. |
| void InsertNodeBeforeCurrent(HtmlNode* new_node); |
| |
| // Inserts a new node after the current one and moves current_ to the new |
| // node. In a Filter, the flush-loop will advance past the new node on |
| // the next iteration. |
| // Note: new_node must not already be in the DOM. |
| void InsertNodeAfterCurrent(HtmlNode* new_node); |
| |
| // Encloses a sequence of nodes, from 'first' through 'last', in |
| // new_parent. 'first' must be the same as, or precede, 'last' in the |
| // event-stream; this is not checked. The two nodes need not be |
| // adjacent, but they must have the same parent to start with. |
| // NOTE(review): the meaning of the bool result is not documented here. |
| bool AddParentToSequence(HtmlNode* first, HtmlNode* last, |
| HtmlElement* new_parent); |
| |
| // Moves the current node (and all children) to an already-existing parent, |
| // where they become the last elements of that parent. |
| // Returns false if the operation could not be performed because either |
| // the node or its parent was partially or wholly flushed. |
| // Note: Will not work if called from a StartElement() event. |
| // |
| // Unlike AppendChild(), this moves the current node, which is already |
| // in the DOM, rather than adding a new node. |
| bool MoveCurrentInto(HtmlElement* new_parent); |
| |
| // Moves the current node (and all children) directly before existing_node. |
| // Note: Will not work if called from a StartElement() event. |
| // |
| // Unlike InsertNodeBeforeNode(), this moves the current node, which is |
| // already in the DOM, rather than adding a new node. |
| bool MoveCurrentBefore(HtmlNode* existing_node); |
| |
| // If the given node is rewritable, deletes it and all of its children (if |
| // any) and returns true; otherwise, does nothing and returns false. |
| // Note: Javascript appears to use removeChild for this. |
| bool DeleteNode(HtmlNode* node); |
| |
| // Deletes a parent element, retaining any children and moving them to |
| // reside under the parent's parent. Note that the element must be |
| // fully inside the flush-window for this to work. Returns false on |
| // failure. |
| // |
| // See also MakeElementInvisible. |
| bool DeleteSavingChildren(HtmlElement* element); |
| |
| // Similar in effect to DeleteSavingChildren, but with no structural |
| // effect on the DOM. Instead it sets a bit in the HtmlElement that prevents |
| // the element from being rendered by HtmlWriterFilter, though all of its |
| // contents will still be rendered. |
| // |
| // This fails, returning false, if the element's StartElement event has |
| // already been flushed. |
| bool MakeElementInvisible(HtmlElement* element); |
| |
| // Determines whether the element, in the context of its flush |
| // window, has children. If the element is not rewritable, or has not |
| // been closed yet, or has not been inserted into the DOM event stream, |
| // then 'false' is returned. |
| // |
| // Note that the concept of the Flush Window is important because the |
| // knowledge of an element's children is not limited to the current |
| // event being presented to a Filter. A Filter can call this method |
| // in the StartElement of an event to see whether any children are |
| // coming. Of course, if the StartElement is at the end of a Flush |
| // window, then we won't know about the children, but IsRewritable |
| // will also be false. |
| bool HasChildrenInFlushWindow(HtmlElement* element); |
| |
| // If possible, replaces the existing node with the new node and returns |
| // true; otherwise, does nothing and returns false. |
| bool ReplaceNode(HtmlNode* existing_node, HtmlNode* new_node); |
| |
| // Creates another element with the same name and attributes as in_element. |
| // Does not duplicate the children or insert the new element anywhere. |
| HtmlElement* CloneElement(HtmlElement* in_element); |
| |
| // Creates a new element for the given parent. The string and keyword |
| // overloads convert their argument with MakeName and then forward to the |
| // HtmlName overload declared last. |
| HtmlElement* NewElement(HtmlElement* parent, const StringPiece& str) { |
| const HtmlName name = MakeName(str); |
| return NewElement(parent, name); |
| } |
| HtmlElement* NewElement(HtmlElement* parent, HtmlName::Keyword keyword) { |
| const HtmlName name = MakeName(keyword); |
| return NewElement(parent, name); |
| } |
| HtmlElement* NewElement(HtmlElement* parent, const HtmlName& name); |
| |
| // Attribute helpers. Each builds the attribute name with MakeName and adds |
| // the attribute to 'element' using HtmlElement::DOUBLE_QUOTE. |
| // |
| // For both versions of AddAttribute: |
| // Pass in NULL for value to add an attribute with no value at all |
| // ex: <script data-pagespeed-no-transform> |
| // Pass in "" for value if you want the value to be the empty string |
| // ex: <div style=""> |
| void AddAttribute(HtmlElement* element, HtmlName::Keyword keyword, |
| const StringPiece& value) { |
| element->AddAttribute(MakeName(keyword), value, HtmlElement::DOUBLE_QUOTE); |
| } |
| void AddAttribute(HtmlElement* element, StringPiece name, |
| const StringPiece& value) { |
| element->AddAttribute(MakeName(name), value, HtmlElement::DOUBLE_QUOTE); |
| } |
| |
| // Like the keyword form of AddAttribute, but forwards to |
| // HtmlElement::AddEscapedAttribute with the given escaped_value. |
| void AddEscapedAttribute(HtmlElement* element, HtmlName::Keyword keyword, |
| const StringPiece& escaped_value) { |
| element->AddEscapedAttribute(MakeName(keyword), escaped_value, |
| HtmlElement::DOUBLE_QUOTE); |
| } |
| |
| // Renames 'attribute' to the name corresponding to 'keyword'. |
| void SetAttributeName(HtmlElement::Attribute* attribute, |
| HtmlName::Keyword keyword) { |
| const HtmlName new_name = MakeName(keyword); |
| attribute->set_name(new_name); |
| } |
| |
| HtmlName MakeName(const StringPiece& str); |
| HtmlName MakeName(HtmlName::Keyword keyword); |
| |
| bool IsRewritable(const HtmlNode* node) const; |
| // IsRewritable returns false for a node if either its open or close tag |
| // has been flushed. That is too conservative when we only want to call |
| // AppendChild on the node, because appending remains possible even after |
| // the open tag has been flushed; use CanAppendChild for that case. |
| bool CanAppendChild(const HtmlNode* node) const; |
| |
| void ClearElements(); |
| |
| // Logs the HtmlEvent queue_ to the message_handler_, for debugging. |
| void DebugLogQueue(); |
| |
| // Prints the HtmlEvent queue_ to stdout, for debugging. |
| void DebugPrintQueue(); |
| |
| // Implementation helper with detailed knowledge of html parsing libraries. |
| friend class HtmlLexer; |
| |
| // Determines whether a tag is implicitly closed in HTML, e.g. <meta ..>. |
| // We do not expect to see a close-tag for meta and should never insert one. |
| bool IsImplicitlyClosedTag(HtmlName::Keyword keyword) const; |
| |
| // Determines whether a tag should be interpreted as a 'literal' |
| // tag, that is, a tag whose contents are not parsed until a |
| // corresponding matching end tag is encountered. |
| static bool IsLiteralTag(HtmlName::Keyword keyword); |
| |
| // Determines whether a tag is interpreted as a 'literal' tag in |
| // some user agents. Since some user agents will interpret the |
| // contents of these tags, our parser never treats them as literal |
| // tags. However, a filter that wants to insert new tags that should |
| // be processed by all user agents should not insert those tags into |
| // a tag that is sometimes parsed as a literal tag. Such filters |
| // can use this method to determine whether they are within such a tag. |
| static bool IsSometimesLiteralTag(HtmlName::Keyword keyword); |
| |
| // Optionally closed tags range from <p>, which is typically not closed |
| // (we infer the closing from context), to <html>, which usually is closed |
| // but not always. E.g. www.google.com does not close its html tag. |
| bool IsOptionallyClosedTag(HtmlName::Keyword keyword) const; |
| |
| // Determines whether a tag allows brief termination in HTML, e.g. <tag/>. |
| bool TagAllowsBriefTermination(HtmlName::Keyword keyword) const; |
| |
| // Returns message_handler_. |
| MessageHandler* message_handler() const { |
| return message_handler_; |
| } |
| |
| // Current location information, typically used to help with error |
| // messages. |
| const char* url() const { |
| return url_.c_str(); |
| } |
| // Returns the parsed GoogleUrl corresponding to url(). |
| const GoogleUrl& google_url() const { |
| return google_url_; |
| } |
| const char* id() const { |
| return id_.c_str(); |
| } |
| int line_number() const { |
| return line_number_; |
| } |
| |
| // Returns the id and the line number formatted as "id:line", to be used in |
| // messages. |
| GoogleString UrlLine() const { |
| return StringPrintf("%s:%d", id_.c_str(), line_number_); |
| } |
| |
| // Returns the current assumed doctype of the document (based on the content |
| // type and any HTML directives encountered so far). |
| const DocType& doctype() const; |
| |
| // Interface for any caller to report a message via the message handler. |
| void Info(const char* filename, int line, const char* msg, ...) |
| INSTAWEB_PRINTF_FORMAT(4, 5); |
| void Warning(const char* filename, int line, const char* msg, ...) |
| INSTAWEB_PRINTF_FORMAT(4, 5); |
| void Error(const char* filename, int line, const char* msg, ...) |
| INSTAWEB_PRINTF_FORMAT(4, 5); |
| void FatalError(const char* filename, int line, const char* msg, ...) |
| INSTAWEB_PRINTF_FORMAT(4, 5); |
| |
| void InfoV(const char* file, int line, const char *msg, va_list args); |
| void WarningV(const char* file, int line, const char *msg, va_list args); |
| void ErrorV(const char* file, int line, const char *msg, va_list args); |
| void FatalErrorV(const char* file, int line, const char* msg, va_list args); |
| |
| // Reports a message at the current parsing file (id) and line number. |
| void InfoHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); |
| void WarningHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); |
| void ErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); |
| void FatalErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3); |
| |
| // If set_log_rewrite_timing(true) has been called, logs the given message |
| // at info level with a time offset from the parsing start time. |
| void ShowProgress(const char* message); |
| |
| // va_list forms of the *Here functions: each forwards to the matching *V |
| // function, supplying the current id and line number as the location. |
| void InfoHereV(const char* msg, va_list args) { |
| InfoV(id(), line_number(), msg, args); |
| } |
| void WarningHereV(const char* msg, va_list args) { |
| WarningV(id(), line_number(), msg, args); |
| } |
| void ErrorHereV(const char* msg, va_list args) { |
| ErrorV(id(), line_number(), msg, args); |
| } |
| void FatalErrorHereV(const char* msg, va_list args) { |
| FatalErrorV(id(), line_number(), msg, args); |
| } |
| |
| void AddElement(HtmlElement* element, int line_number); |
| void CloseElement(HtmlElement* element, HtmlElement::Style style, |
| int line_number); |
| |
| // Runs a filter on the current queue of parse nodes. |
| void ApplyFilter(HtmlFilter* filter); |
| |
| // Provides a timer to help report the timing of each filter. You must also |
| // call set_log_rewrite_timing(true) to turn on this reporting. |
| void set_timer(Timer* timer) { timer_ = timer; } |
| Timer* timer() const { return timer_; } |
| void set_log_rewrite_timing(bool x) { log_rewrite_timing_ = x; } |
| |
| // Adds a filter to be called during parsing, as new events are added. |
| // Takes ownership of the HtmlFilter passed in. |
| void add_event_listener(HtmlFilter* listener); |
| |
| // Inserts a comment before or after the current node. The function tries to |
| // pick an intelligent place depending on the document structure and on |
| // whether the current node is a start-element, end-element, or a leaf. |
| // Returns true if the comment was added, and false if it was not safe to |
| // insert the comment. The latter can happen when a comment is inserted in |
| // a literal element (script or style) after the opening tag has been |
| // flushed, but before the closing tag has been seen. In that case, the |
| // caller can buffer the messages until EndElement is reached and call |
| // InsertComment at that point. |
| bool InsertComment(StringPiece sp); |
| |
| // Sets the limit on the maximum number of bytes that should be parsed. |
| void set_size_limit(int64 x); |
| // Returns whether we have exceeded the size limit. |
| bool size_limit_exceeded() const; |
| |
| // For debugging purposes. If this vector is supplied, it will be populated |
| // with the list of Filters that were disabled, plus the associated reason, |
| // if supplied by the Filter (done by "DetermineEnabledFilters", a name not |
| // declared in this class -- TODO confirm). Caller retains ownership. |
| void SetDynamicallyDisabledFilterList(StringVector* list) { |
| dynamically_disabled_filter_list_ = list; |
| } |
| |
| // Temporarily removes the current node from the parse tree. This must |
| // be run as part of a filter callback, and it is the responsibility of |
| // the filter to save the node and call RestoreDeferredNode on it later. |
| // |
| // If the current node is an HtmlElement, this must be called on the |
| // StartElement event, not the EndElement event. When an element is |
| // deferred, all its children are deferred as well. |
| // |
| // It is fine to restore a node after a Flush. Note that while most |
| // HtmlNode objects are freed after a Flush window, a deferred one will |
| // be retained until it is restored, or until the end of the document. |
| // |
| // If a node is not restored by the end of the document, a warning will |
| // be printed and the stored data cleaned up. Functionally it will be |
| // as if the filter had called DeleteNode. |
| // |
| // Note that a filter that defers a node and never restores it will never |
| // see the EndElement for that node. |
| // |
| // Note that if you defer a Characters node and restore it next to |
| // another Characters node, they will be coalesced prior to the next |
| // filter, but this filter will not see the coalesced nodes. |
| // Similarly, if you defer a non-characters node that was previously |
| // separating two characters nodes, that will also result in a |
| // coalesce seen only by downstream filters. |
| void DeferCurrentNode(); |
| |
| // Restores a node, inserting it after the current event. If the node |
| // is an HtmlElement, the iteration will proceed with its first child node, |
| // or, if there were no children, with the EndElement method. |
| // |
| // Note: you cannot restore during Flush(). |
| void RestoreDeferredNode(HtmlNode* deferred_node); |
| |
| // Returns whether the filter pipeline can rewrite urls, i.e. the current |
| // value of can_modify_urls_. Declared const, like the other read-only |
| // accessors of this class, so that it can be called through a const |
| // HtmlParse reference or pointer. |
| bool can_modify_urls() const { |
| return can_modify_urls_; |
| } |
| |
| protected: |
| typedef std::vector<HtmlFilter*> FilterVector; |
| typedef std::list<HtmlFilter*> FilterList; |
| typedef std::pair<HtmlNode*, HtmlEventList*> DeferredNode; |
| typedef std::map<const HtmlNode*, HtmlEventList*> NodeToEventListMap; |
| typedef std::map<HtmlFilter*, DeferredNode> FilterElementMap; |
| typedef std::set<const HtmlNode*> NodeSet; |
| |
| // HtmlParse::FinishParse() is equivalent to the sequence |
| // BeginFinishParse(); Flush(); EndFinishParse(). |
| // It is split up to permit asynchronous versions. |
| void BeginFinishParse(); |
| void EndFinishParse(); |
| |
| // Clears any cached state we have while this object is lying |
| // around for recycling. |
| void Clear(); |
| |
| // Returns the number of events on the event queue. |
| size_t GetEventQueueSize(); |
| |
| virtual void ParseTextInternal(const char* content, int size); |
| |
| // Idempotent wrapper around DetermineFiltersBehaviorImpl: the first call |
| // sets determine_filter_behavior_called_, resets can_modify_urls_ to false, |
| // and runs the Impl; later calls return immediately while the flag is set. |
| void DetermineFiltersBehavior() { |
| if (determine_filter_behavior_called_) { |
| return; |
| } |
| determine_filter_behavior_called_ = true; |
| can_modify_urls_ = false; |
| DetermineFiltersBehaviorImpl(); |
| } |
| |
| // Calls CheckFilterBehavior on every filter in 'list', in list order. |
| void DetermineFilterListBehavior(const FilterList& list) { |
| FilterList::const_iterator filter_iter = list.begin(); |
| while (filter_iter != list.end()) { |
| CheckFilterBehavior(*filter_iter); |
| ++filter_iter; |
| } |
| } |
| |
| void CheckFilterBehavior(HtmlFilter* filter); |
| |
| // Calls DetermineEnabled() on each filter. Should be called after |
| // the property cache lookup has finished since some filters depend on |
| // pcache results in their DetermineEnabled implementation. If a subclass has |
| // filters that the base HtmlParse doesn't know about, it should override this |
| // function and call DetermineEnabled on each of its filters, along with |
| // calling the base DetermineFiltersBehaviorImpl. |
| // For all enabled filters the CanModifyUrl() flag will be aggregated (or'ed) |
| // and can be queried via the can_modify_urls() function. |
| virtual void DetermineFiltersBehaviorImpl(); |
| |
| private: |
| void ApplyFilterHelper(HtmlFilter* filter); |
| HtmlEventListIterator Last(); // Iterator to the last element in the queue |
| bool IsInEventWindow(const HtmlEventListIterator& iter) const; |
| void InsertNodeBeforeEvent(const HtmlEventListIterator& event, |
| HtmlNode* new_node); |
| void InsertNodeAfterEvent(const HtmlEventListIterator& event, |
| HtmlNode* new_node); |
| bool MoveCurrentBeforeEvent(const HtmlEventListIterator& move_to); |
| bool IsDescendantOf(const HtmlNode* possible_child, |
| const HtmlNode* possible_parent); |
| void SanityCheck(); |
| void CheckEventParent(HtmlEvent* event, HtmlElement* expect, |
| HtmlElement* actual); |
| void CheckParentFromAddEvent(HtmlEvent* event); |
| void FixParents(const HtmlEventListIterator& begin, |
| const HtmlEventListIterator& end_inclusive, |
| HtmlElement* new_parent); |
| void CoalesceAdjacentCharactersNodes(); |
| void ClearEvents(); |
| void EmitQueue(MessageHandler* handler); |
| inline void NextEvent(); |
| void ClearDeferredNodes(); |
| inline bool IsRewritableIgnoringDeferral(const HtmlNode* node) const; |
| inline bool IsRewritableIgnoringEnd(const HtmlNode* node) const; |
| void SetupScript(StringPiece text, bool external, HtmlElement* script); |
| |
| // The members below are exposed for testing only, via HtmlTestingPeer. |
| friend class HtmlTestingPeer; |
| void AddEvent(HtmlEvent* event); |
| void SetCurrent(HtmlNode* node); |
| void set_coalesce_characters(bool x) { coalesce_characters_ = x; } |
| size_t symbol_table_size() const { |
| return string_table_.string_bytes_allocated(); |
| } |
| |
| // If a FLUSH occurs in the middle of a script, style, or other tag |
| // whose contents can only be a Characters block, then we buffer up the |
| // start of that tag and do not emit it, or the Characters block, until |
| // after we see the matching close tag. This function enforces that |
| // right before calling the Filters. |
| void DelayLiteralTag(); |
| |
| FilterVector event_listeners_; |
| SymbolTableSensitive string_table_; |
| FilterList filters_; |
| HtmlLexer* lexer_; |
| Arena<HtmlNode> nodes_; |
| HtmlEventList queue_; |
| HtmlEventListIterator current_; |
| // NOTE(review): stale? Described a "deleted current" flag; none is declared here. |
| MessageHandler* message_handler_; |
| GoogleString url_; |
| GoogleUrl google_url_; |
| GoogleString id_; // Per-request identifier string used in error messages. |
| int line_number_; |
| bool skip_increment_; |
| bool determine_filter_behavior_called_; |
| bool can_modify_urls_; |
| bool determine_enabled_filters_called_; |
| bool need_sanity_check_; |
| bool coalesce_characters_; |
| bool need_coalesce_characters_; |
| bool url_valid_; |
| bool log_rewrite_timing_; // Should we time the speed of parsing? |
| bool running_filters_; |
| int64 parse_start_time_us_; |
| scoped_ptr<HtmlEvent> delayed_start_literal_; |
| Timer* timer_; |
| HtmlFilter* current_filter_; // Filter currently running in ApplyFilter. |
| |
| // When deferring a node that spans a flush window, we present upstream |
| // filters with a view of the event-stream that is not impacted by the |
| // deferral. To implement this, at the beginning of each flush window we |
| // perform the queue_ mutation for any outstanding deferrals right before |
| // running the filter that deferred them. |
| FilterElementMap open_deferred_nodes_; |
| |
| // Tracks the deferred nodes that have not yet been restored. |
| NodeToEventListMap deferred_nodes_; |
| |
| // We use the node-defer logic to implement DeleteNode for a node that |
| // hasn't been closed yet. The only differences are that a deleted node |
| // cannot be restored, and that the parser does not print a warning if |
| // a deleted node is never restored. |
| NodeSet deferred_deleted_nodes_; |
| |
| StringVector* dynamically_disabled_filter_list_; |
| |
| DISALLOW_COPY_AND_ASSIGN(HtmlParse); |
| }; |
| |
| } // namespace net_instaweb |
| |
| #endif // PAGESPEED_KERNEL_HTML_HTML_PARSE_H_ |