// blob: 45dc52630f6cea51e4604979756f0964fd24feec [file] [log] [blame]
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmarantz@google.com (Joshua Marantz)
#ifndef NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_
#define NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_
#include <stdarg.h>
#include <set>
#include <vector>
#include "base/basictypes.h"
#include "net/instaweb/htmlparse/public/html_element.h"
#include "net/instaweb/htmlparse/public/html_node.h"
#include "net/instaweb/htmlparse/public/html_parser_types.h"
#include "net/instaweb/util/public/google_url.h"
#include "net/instaweb/util/public/printf_format.h"
#include <string>
#include "net/instaweb/util/public/string_util.h"
#include "net/instaweb/util/public/symbol_table.h"
namespace net_instaweb {
// Forward declaration only: within this header Timer is used solely
// through a pointer (HtmlParse::set_timer and the timer_ member).
class Timer;
class HtmlParse {
 public:
  explicit HtmlParse(MessageHandler* message_handler);
  ~HtmlParse();

  // ------------------------------------------------------------------
  // Application methods: registering filters and driving a parse.
  // ------------------------------------------------------------------

  // Adds a new html filter to the filter-chain.
  void AddFilter(HtmlFilter* filter);

  // Begins a chunked parsing session; end it with FinishParse.  The url
  // serves only to resolve relative URLs -- its contents are not fetched
  // directly.  The caller must supply the text and call ParseText.
  void StartParse(const StringPiece& url) {
    // The url is passed as the error-message id as well.
    StartParseId(url, url);
  }

  // Same as StartParse, but with an error-message id that is distinct
  // from the url.  Mostly useful for testing.
  void StartParseId(const StringPiece& url, const StringPiece& id);

  // Parses an arbitrary block of an html file and queues up the
  // resulting events.  Call Flush to send the events through the Filter.
  //
  // Whole-file recipe: call StartParse(), then ParseText() on the file
  // contents (in whatever size chunks are convenient), then FinishParse().
  void ParseText(const char* content, int size);

  // Convenience overload forwarding the StringPiece's data and size.
  void ParseText(const StringPiece& sp) {
    ParseText(sp.data(), sp.size());
  }

  // Sends the currently queued events through the filters.
  //
  // For large web pages, particularly dynamically generated ones, it is
  // desirable to start delivering content to the browser as soon as it
  // is ready.  On the other hand, rewriting is more powerful when more
  // of the content can be considered for image/css/js spriting.  Call
  // this when the controlling network process wants to induce a new
  // chunk of output; the less often it is called, the better the
  // rewriting will be.
  void Flush();

  // Ends a chunked parsing session.  This also induces a Flush.
  void FinishParse();

  // ------------------------------------------------------------------
  // Utility methods for implementing filters.
  // ------------------------------------------------------------------

  HtmlCdataNode* NewCdataNode(HtmlElement* parent,
                              const StringPiece& contents);
  HtmlCharactersNode* NewCharactersNode(HtmlElement* parent,
                                        const StringPiece& literal);
  HtmlCommentNode* NewCommentNode(HtmlElement* parent,
                                  const StringPiece& contents);
  HtmlDirectiveNode* NewDirectiveNode(HtmlElement* parent,
                                      const StringPiece& contents);
  HtmlIEDirectiveNode* NewIEDirectiveNode(HtmlElement* parent,
                                          const StringPiece& contents);

  // ------------------------------------------------------------------
  // DOM-manipulation methods.
  // TODO(sligocki): Find Javascript equivalents and list them or even
  // change our names to be consistent.
  // ------------------------------------------------------------------

  // TODO(mdsteele): Rename these methods to e.g. InsertNodeBeforeNode.
  //
  // This and downstream filters will then see inserted elements but
  // upstream filters will not.
  // Note: In Javascript the first is called insertBefore and takes the
  // arg in the opposite order.
  bool InsertElementBeforeElement(const HtmlNode* existing_node,
                                  HtmlNode* new_node);
  bool InsertElementAfterElement(const HtmlNode* existing_node,
                                 HtmlNode* new_node);

  // Adds a child element at the beginning or end of existing_parent's
  // children.  Named after Javascript's appendChild method.
  bool PrependChild(const HtmlElement* existing_parent, HtmlNode* new_child);
  bool AppendChild(const HtmlElement* existing_parent, HtmlNode* new_child);

  // Inserts an element before the current one.  current_ remains
  // unchanged.
  bool InsertElementBeforeCurrent(HtmlNode* node);

  // Inserts an element after the current one and moves current_ to the
  // new element.  In a Filter, the flush-loop will advance past this on
  // the next iteration.
  bool InsertElementAfterCurrent(HtmlNode* node);

  // Encloses a sequence delimited by two elements in a new parent.
  // Requirements on the two elements:
  //   - 'first' must be the same as, or precede, 'last' in the
  //     event-stream (this is not checked);
  //   - they need not be adjacent;
  //   - they must have the same parent to start with.
  //
  // This differs from MoveSequenceToParent in that the new parent is
  // not yet in the DOM tree, and will be inserted around the elements.
  // NOTE(review): no MoveSequenceToParent is declared in this class;
  // the counterpart is presumably MoveCurrentInto -- confirm.
  bool AddParentToSequence(HtmlNode* first, HtmlNode* last,
                           HtmlElement* new_parent);

  // Moves a node-sequence to an already-existing parent, where they
  // will be placed as the last elements in that parent.  Returns false
  // if the operation could not be performed because either the node or
  // its parent was partially or wholly flushed.
  // NOTE(review): going by the method name this presumably operates on
  // the current node rather than an arbitrary sequence -- confirm.
  //
  // This differs from AddParentToSequence in that the parent is already
  // in the DOM-tree.
  bool MoveCurrentInto(HtmlElement* new_parent);

  // Deletes the given node together with all of its children (if any)
  // and returns true, provided the node is rewritable; otherwise does
  // nothing and returns false.
  // Note: Javascript appears to use removeChild for this.
  bool DeleteElement(HtmlNode* node);

  // Deletes a parent element while retaining its children, which are
  // moved to reside under the parent's parent.
  bool DeleteSavingChildren(HtmlElement* element);

  // Replaces the existing node with the new node and returns true if
  // possible; otherwise does nothing and returns false.
  bool ReplaceNode(HtmlNode* existing_node, HtmlNode* new_node);

  HtmlElement* NewElement(HtmlElement* parent, Atom tag);

  bool IsRewritable(const HtmlNode* node) const;

  void ClearElements();

  // Prints the queue (for debugging).
  void DebugPrintQueue();

  // Interns a name in this parser's string table.
  Atom Intern(const std::string& name) { return string_table_.Intern(name); }
  Atom Intern(const char* name) { return string_table_.Intern(name); }

  // Implementation helper with detailed knowledge of html parsing
  // libraries.
  friend class HtmlLexer;

  // Determines whether a tag should be terminated in HTML.
  bool IsImplicitlyClosedTag(Atom tag) const;

  // Determines whether a tag allows brief termination in HTML,
  // e.g. <tag/>.
  bool TagAllowsBriefTermination(Atom tag) const;

  MessageHandler* message_handler() const {
    return message_handler_;
  }

  // Current location information; typically used to help with error
  // messages.
  const char* url() const {
    return url_.c_str();
  }
  // A parsed GURL& corresponding to url().
  const GURL& gurl() const {
    return gurl_;
  }
  const char* id() const {
    return id_.c_str();
  }
  int line_number() const {
    return line_number_;
  }

  // ------------------------------------------------------------------
  // Message reporting.
  // ------------------------------------------------------------------

  // Interface for any caller to report an error message via the message
  // handler.  The format indices are (4, 5) rather than (3, 4) because
  // these are member functions.
  void Info(const char* filename, int line, const char* msg, ...)
      INSTAWEB_PRINTF_FORMAT(4, 5);
  void Warning(const char* filename, int line, const char* msg, ...)
      INSTAWEB_PRINTF_FORMAT(4, 5);
  void Error(const char* filename, int line, const char* msg, ...)
      INSTAWEB_PRINTF_FORMAT(4, 5);
  void FatalError(const char* filename, int line, const char* msg, ...)
      INSTAWEB_PRINTF_FORMAT(4, 5);

  // va_list flavors of the reporting methods above.
  void InfoV(const char* file, int line, const char* msg, va_list args);
  void WarningV(const char* file, int line, const char* msg, va_list args);
  void ErrorV(const char* file, int line, const char* msg, va_list args);
  void FatalErrorV(const char* file, int line, const char* msg, va_list args);

  // Report an error message with the current parsing filename and
  // line number.
  void InfoHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
  void WarningHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
  void ErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);
  void FatalErrorHere(const char* msg, ...) INSTAWEB_PRINTF_FORMAT(2, 3);

  // va_list flavors of the *Here methods: each forwards to the matching
  // *V method, passing id() as the file and line_number() as the line.
  void InfoHereV(const char* msg, va_list args) {
    InfoV(id(), line_number(), msg, args);
  }
  void WarningHereV(const char* msg, va_list args) {
    WarningV(id(), line_number(), msg, args);
  }
  void ErrorHereV(const char* msg, va_list args) {
    ErrorV(id(), line_number(), msg, args);
  }
  void FatalErrorHereV(const char* msg, va_list args) {
    FatalErrorV(id(), line_number(), msg, args);
  }

  void AddElement(HtmlElement* element, int line_number);
  void CloseElement(HtmlElement* element, HtmlElement::CloseStyle close_style,
                    int line_number);

  // Runs a filter on the current queue of parse nodes.  This is visible
  // for testing.
  void ApplyFilter(HtmlFilter* filter);

  // Provides a timer to help report the timing of each filter.  In the
  // absence of a timer, reporting will be suppressed.
  void set_timer(Timer* timer) {
    timer_ = timer;
  }

 private:
  // Last element in queue.
  HtmlEventListIterator Last();
  bool IsInEventWindow(const HtmlEventListIterator& iter) const;
  bool InsertElementBeforeEvent(const HtmlEventListIterator& event,
                                HtmlNode* new_node);
  bool InsertElementAfterEvent(const HtmlEventListIterator& event,
                               HtmlNode* new_node);
  void SanityCheck();
  void CheckEventParent(HtmlEvent* event, HtmlElement* expect,
                        HtmlElement* actual);
  void CheckParentFromAddEvent(HtmlEvent* event);
  void FixParents(const HtmlEventListIterator& begin,
                  const HtmlEventListIterator& end_inclusive,
                  HtmlElement* new_parent);
  void CoalesceAdjacentCharactersNodes();
  void ShowProgress(const char* message);

  // Visible for testing only, via HtmlTestingPeer.
  friend class HtmlTestingPeer;
  void AddEvent(HtmlEvent* event);
  void SetCurrent(HtmlNode* node);
  void set_coalesce_characters(bool x) {
    coalesce_characters_ = x;
  }

  // Data members.  Declaration order is kept exactly as it was, since it
  // fixes the member initialization order and the object layout.
  SymbolTableInsensitive string_table_;
  std::vector<HtmlFilter*> filters_;
  HtmlLexer* lexer_;
  int sequence_;
  std::set<HtmlNode*> nodes_;
  HtmlEventList queue_;
  HtmlEventListIterator current_;
  // Set once current has been deleted, in which case certain
  // manipulations must not be applied to it.
  bool deleted_current_;
  MessageHandler* message_handler_;
  std::string url_;
  GURL gurl_;
  // Per-request identifier string used in error messages.
  std::string id_;
  int line_number_;
  bool need_sanity_check_;
  bool coalesce_characters_;
  bool need_coalesce_characters_;
  int64 parse_start_time_us_;
  // Assigned through set_timer().
  Timer* timer_;

  DISALLOW_COPY_AND_ASSIGN(HtmlParse);
};
} // namespace net_instaweb
#endif // NET_INSTAWEB_HTMLPARSE_PUBLIC_HTML_PARSE_H_