blob: 4ef1896d7f31e712e7b9b0493b733f959c5697cc [file] [log] [blame]
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmarantz@google.com (Joshua Marantz)
#include "net/instaweb/htmlparse/public/html_parse.h"
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <utility> // for std::pair
#include "base/logging.h"
#include "net/instaweb/htmlparse/html_event.h"
#include "net/instaweb/htmlparse/html_lexer.h"
#include "net/instaweb/htmlparse/public/html_element.h"
#include "net/instaweb/htmlparse/public/html_escape.h"
#include "net/instaweb/htmlparse/public/html_filter.h"
#include "net/instaweb/util/public/google_url.h"
#include "net/instaweb/util/public/message_handler.h"
#include <string>
#include "net/instaweb/util/public/timer.h"
namespace net_instaweb {
HtmlParse::HtmlParse(MessageHandler* message_handler)
: lexer_(NULL), // Can't initialize here, since "this" should not be used
// in the initializer list (it generates an error in
// Visual Studio builds).
sequence_(0),
current_(queue_.end()),
deleted_current_(false),
message_handler_(message_handler),
line_number_(1),
need_sanity_check_(false),
coalesce_characters_(true),
need_coalesce_characters_(false),
parse_start_time_us_(0),
timer_(NULL) {
lexer_ = new HtmlLexer(this);
HtmlEscape::Init();
}
HtmlParse::~HtmlParse() {
delete lexer_;
ClearElements();
}
void HtmlParse::AddFilter(HtmlFilter* html_filter) {
filters_.push_back(html_filter);
}
HtmlEventListIterator HtmlParse::Last() {
HtmlEventListIterator p = queue_.end();
--p;
return p;
}
// Checks that the parent provided when creating the event's Node is
// consistent with the position in the list. An alternative approach
// here is to use this code and remove the explicit specification of
// parents when constructing nodes.
//
// A complexity we will run into with that approach is that the queue_ is
// cleared on a Flush, so we cannot reliably derive the correct parent
// from the queue. However, the Lexer keeps an element stack across
// flushes, and therefore can keep correct parent pointers. So we have
// to inject pessimism in this process.
//
// Note that we also have sanity checks that run after each filter.
void HtmlParse::CheckParentFromAddEvent(HtmlEvent* event) {
HtmlNode* node = event->GetNode();
if (node != NULL) {
message_handler_->Check(lexer_->Parent() == node->parent(),
"lexer_->Parent() != node->parent()");
}
}
// Testing helper method
void HtmlParse::AddEvent(HtmlEvent* event) {
CheckParentFromAddEvent(event);
queue_.push_back(event);
need_sanity_check_ = true;
need_coalesce_characters_ = true;
// If this is a leaf-node event, we need to set the iterator of the
// corresponding leaf node to point to this event's position in the queue.
// If this is an element event, then the iterators of the element will get
// set in HtmlParse::AddElement and HtmlParse::CloseElement, so there's no
// need to do it here. If this is some other kind of event, there are no
// iterators to set.
HtmlLeafNode* leaf = event->GetLeafNode();
if (leaf != NULL) {
leaf->set_iter(Last());
message_handler_->Check(IsRewritable(leaf), "!IsRewritable(leaf)");
}
}
// Testing helper method
void HtmlParse::SetCurrent(HtmlNode* node) {
current_ = node->begin();
}
HtmlCdataNode* HtmlParse::NewCdataNode(HtmlElement* parent,
const StringPiece& contents) {
HtmlCdataNode* cdata = new HtmlCdataNode(parent, contents, queue_.end());
nodes_.insert(cdata);
return cdata;
}
HtmlCharactersNode* HtmlParse::NewCharactersNode(HtmlElement* parent,
const StringPiece& literal) {
HtmlCharactersNode* characters =
new HtmlCharactersNode(parent, literal, queue_.end());
nodes_.insert(characters);
return characters;
}
HtmlCommentNode* HtmlParse::NewCommentNode(HtmlElement* parent,
const StringPiece& contents) {
HtmlCommentNode* comment = new HtmlCommentNode(parent, contents,
queue_.end());
nodes_.insert(comment);
return comment;
}
HtmlIEDirectiveNode* HtmlParse::NewIEDirectiveNode(
HtmlElement* parent, const StringPiece& contents) {
HtmlIEDirectiveNode* directive =
new HtmlIEDirectiveNode(parent, contents, queue_.end());
nodes_.insert(directive);
return directive;
}
HtmlDirectiveNode* HtmlParse::NewDirectiveNode(HtmlElement* parent,
const StringPiece& contents) {
HtmlDirectiveNode* directive = new HtmlDirectiveNode(parent, contents,
queue_.end());
nodes_.insert(directive);
return directive;
}
HtmlElement* HtmlParse::NewElement(HtmlElement* parent, Atom tag) {
HtmlElement* element = new HtmlElement(parent, tag, queue_.end(),
queue_.end());
nodes_.insert(element);
element->set_sequence(sequence_++);
return element;
}
void HtmlParse::AddElement(HtmlElement* element, int line_number) {
HtmlStartElementEvent* event =
new HtmlStartElementEvent(element, line_number);
AddEvent(event);
element->set_begin(Last());
element->set_begin_line_number(line_number);
}
void HtmlParse::StartParseId(const StringPiece& url, const StringPiece& id) {
url.CopyToString(&url_);
GURL gurl(url_);
// TODO(jmaessen): warn and propagate upwards. This will require
// major changes to the callers.
message_handler_->Check(gurl.is_valid(), "Invalid url %s", url_.c_str());
gurl_.Swap(&gurl);
line_number_ = 1;
id.CopyToString(&id_);
if (timer_ != NULL) {
parse_start_time_us_ = timer_->NowUs();
InfoHere("HtmlParse::StartParse");
}
AddEvent(new HtmlStartDocumentEvent(line_number_));
lexer_->StartParse(id);
}
void HtmlParse::ShowProgress(const char* message) {
if (timer_ != NULL) {
long delta = static_cast<long>(timer_->NowUs() - parse_start_time_us_);
InfoHere("%ldus: HtmlParse::%s", delta, message);
}
}
void HtmlParse::FinishParse() {
lexer_->FinishParse();
AddEvent(new HtmlEndDocumentEvent(line_number_));
Flush();
ClearElements();
ShowProgress("FinishParse");
}
void HtmlParse::ParseText(const char* text, int size) {
lexer_->Parse(text, size);
}
// This is factored out of Flush() for testing purposes.
void HtmlParse::ApplyFilter(HtmlFilter* filter) {
if (coalesce_characters_ && need_coalesce_characters_) {
CoalesceAdjacentCharactersNodes();
need_coalesce_characters_ = false;
}
ShowProgress(StrCat("ApplyFilter:", filter->Name()).c_str());
for (current_ = queue_.begin(); current_ != queue_.end(); ) {
HtmlEvent* event = *current_;
line_number_ = event->line_number();
event->Run(filter);
deleted_current_ = false;
++current_;
}
filter->Flush();
if (need_sanity_check_) {
SanityCheck();
need_sanity_check_ = false;
}
}
void HtmlParse::CoalesceAdjacentCharactersNodes() {
ShowProgress("CoalesceAdjacentCharactersNodes");
HtmlCharactersNode* prev = NULL;
for (current_ = queue_.begin(); current_ != queue_.end(); ) {
HtmlEvent* event = *current_;
HtmlCharactersNode* node = event->GetCharactersNode();
if ((node != NULL) && (prev != NULL)) {
prev->Append(node->contents());
current_ = queue_.erase(current_); // returns element after erased
delete event;
node->MarkAsDead(queue_.end());
need_sanity_check_ = true;
} else {
++current_;
prev = node;
}
}
}
void HtmlParse::CheckEventParent(HtmlEvent* event, HtmlElement* expect,
HtmlElement* actual) {
if ((expect != NULL) && (actual != expect)) {
std::string actual_buf, expect_buf, event_buf;
if (actual != NULL) {
actual->ToString(&actual_buf);
} else {
actual_buf = "(null)";
}
expect->ToString(&expect_buf);
event->ToString(&event_buf);
FatalErrorHere("HtmlElement Parents of %s do not match:\n"
"Actual: %s\n"
"Expected: %s\n",
event_buf.c_str(), actual_buf.c_str(), expect_buf.c_str());
}
}
void HtmlParse::SanityCheck() {
ShowProgress("SanityCheck");
// Sanity check that the Node parent-pointers are consistent with the
// begin/end-element events. This is done in a second pass to avoid
// confusion when the filter mutates the event-stream. Also note that
// a mid-HTML call to HtmlParse::Flush means that we may pop out beyond
// the stack we can detect in this event stream. This is represented
// here by an empty stack.
std::vector<HtmlElement*> element_stack;
HtmlElement* expect_parent = NULL;
for (current_ = queue_.begin(); current_ != queue_.end(); ++current_) {
HtmlEvent* event = *current_;
// Determine whether the event we are looking at is a StartElement,
// EndElement, or a leaf. We manipulate our temporary stack when
// we see StartElement and EndElement, and we always test to make
// sure the elements have the expected parent based on context, when
// we can figure out what the expected parent is.
HtmlElement* start_element = event->GetStartElement();
HtmlElement* actual_parent = NULL;
if (start_element != NULL) {
CheckEventParent(event, expect_parent, start_element->parent());
message_handler_->Check(start_element->begin() == current_,
"start_element->begin() != current_");
message_handler_->Check(start_element->live(), "!start_element->live()");
element_stack.push_back(start_element);
expect_parent = start_element;
} else {
HtmlElement* end_element = event->GetEndElement();
if (end_element != NULL) {
message_handler_->Check(end_element->end() == current_,
"end_element->end() != current_");
message_handler_->Check(end_element->live(),
"!end_element->live()");
if (!element_stack.empty()) {
// The element stack can be empty on End can happen due
// to this sequence:
// <tag1>
// FLUSH
// </tag1> <!-- tag1 close seen with empty stack -->
message_handler_->Check(element_stack.back() == end_element,
"element_stack.back() != end_element");
element_stack.pop_back();
}
actual_parent = end_element->parent();
expect_parent = element_stack.empty() ? NULL : element_stack.back();
CheckEventParent(event, expect_parent, end_element->parent());
} else {
// We only know for sure what the parents are once we have seen
// a start_element.
HtmlLeafNode* leaf_node = event->GetLeafNode();
if (leaf_node != NULL) { // Start/EndDocument are not leaf nodes
message_handler_->Check(leaf_node->live(), "!leaf_node->live()");
message_handler_->Check(leaf_node->end() == current_,
"leaf_node->end() != current_");
CheckEventParent(event, expect_parent, leaf_node->parent());
}
}
}
}
}
void HtmlParse::Flush() {
ShowProgress("Flush");
for (size_t i = 0; i < filters_.size(); ++i) {
HtmlFilter* filter = filters_[i];
ApplyFilter(filter);
}
// Detach all the elements from their events, as we are now invalidating
// the events, but not the elements.
for (current_ = queue_.begin(); current_ != queue_.end(); ++current_) {
HtmlEvent* event = *current_;
line_number_ = event->line_number();
HtmlElement* element = event->GetStartElement();
if (element != NULL) {
element->set_begin(queue_.end());
} else {
element = event->GetEndElement();
if (element != NULL) {
element->set_end(queue_.end());
} else {
HtmlLeafNode* leaf_node = event->GetLeafNode();
if (leaf_node != NULL) {
leaf_node->set_iter(queue_.end());
}
}
}
delete event;
}
queue_.clear();
need_sanity_check_ = false;
need_coalesce_characters_ = false;
}
bool HtmlParse::InsertElementBeforeElement(const HtmlNode* existing_node,
HtmlNode* new_node) {
// begin() == queue_.end() -> this is an invalid element.
// TODO(sligocki): Rather than checks, we should probably return this as
// the status.
message_handler_->Check(existing_node->begin() != queue_.end(),
"InsertElementBeforeElement: existing_node invalid");
new_node->set_parent(existing_node->parent());
return InsertElementBeforeEvent(existing_node->begin(), new_node);
}
bool HtmlParse::InsertElementAfterElement(const HtmlNode* existing_node,
HtmlNode* new_node) {
message_handler_->Check(existing_node->end() != queue_.end(),
"InsertElementAfterElement: existing_node invalid");
new_node->set_parent(existing_node->parent());
return InsertElementAfterEvent(existing_node->end(), new_node);
}
bool HtmlParse::PrependChild(const HtmlElement* existing_parent,
HtmlNode* new_child) {
message_handler_->Check(existing_parent->begin() != queue_.end(),
"PrependChild: existing_parent invalid");
new_child->set_parent(const_cast<HtmlElement*>(existing_parent));
return InsertElementAfterEvent(existing_parent->begin(), new_child);
}
bool HtmlParse::AppendChild(const HtmlElement* existing_parent,
HtmlNode* new_child) {
message_handler_->Check(existing_parent->end() != queue_.end(),
"AppendChild: existing_parent invalid");
new_child->set_parent(const_cast<HtmlElement*>(existing_parent));
return InsertElementBeforeEvent(existing_parent->end(), new_child);
}
bool HtmlParse::InsertElementBeforeCurrent(HtmlNode* new_node) {
if (deleted_current_) {
FatalErrorHere("InsertElementBeforeCurrent after current has been "
"deleted.");
}
if ((new_node->parent() == NULL) && (current_ != queue_.end())) {
// Add a parent if one was not provided in new_node. We figure out
// what the parent should be by looking at current_. If that's an
// EndElement event, then that means that we are adding a new child
// of that element. In all other cases, we are adding a sibling.
HtmlEvent* current_event = *current_;
HtmlElement* end_element = current_event->GetEndElement();
if (end_element != NULL) {
// The node pointed to by Current will be our new parent.
new_node->set_parent(end_element);
} else {
// The node pointed to by Current will be our new sibling, so
// we should grab its parent.
HtmlNode* node = current_event->GetNode();
message_handler_->Check(node, "Cannot compute parent for new node");
new_node->set_parent(node->parent());
}
}
return InsertElementBeforeEvent(current_, new_node);
}
bool HtmlParse::InsertElementBeforeEvent(const HtmlEventListIterator& event,
HtmlNode* new_node) {
need_sanity_check_ = true;
need_coalesce_characters_ = true;
new_node->SynthesizeEvents(event, &queue_);
// TODO(jmarantz): make this routine return void.
return true;
}
bool HtmlParse::InsertElementAfterEvent(const HtmlEventListIterator& event,
HtmlNode* new_node) {
message_handler_->Check(event != queue_.end(), "event == queue_.end()");
HtmlEventListIterator next_event = event;
++next_event;
return InsertElementBeforeEvent(next_event, new_node);
}
bool HtmlParse::InsertElementAfterCurrent(HtmlNode* new_node) {
if (deleted_current_) {
FatalErrorHere("InsertElementAfterCurrent after current has been "
"deleted.");
}
if (current_ == queue_.end()) {
FatalErrorHere("InsertElementAfterCurrent called with queue at end.");
}
++current_;
bool ret = InsertElementBeforeEvent(current_, new_node);
// We want to leave current_ pointing to the newly created element.
--current_;
message_handler_->Check((*current_)->GetNode() == new_node,
"(*current_)->GetNode() != new_node");
return ret;
}
bool HtmlParse::AddParentToSequence(HtmlNode* first, HtmlNode* last,
HtmlElement* new_parent) {
bool added = false;
HtmlElement* original_parent = first->parent();
if (IsRewritable(first) && IsRewritable(last) &&
(last->parent() == original_parent) &&
(new_parent->begin() == queue_.end()) &&
(new_parent->end() == queue_.end()) &&
InsertElementBeforeEvent(first->begin(), new_parent)) {
// This sequence of checks culminated in inserting the parent's begin
// and end before 'first'. Now we must mutate new_parent's end pointer
// to insert it after the last->end(). list::insert(iter) inserts
// *before* the iter, so we'll increment last->end().
HtmlEvent* end_element_event = *new_parent->end();
queue_.erase(new_parent->end());
HtmlEventListIterator p = last->end();
++p;
new_parent->set_end(queue_.insert(p, end_element_event));
FixParents(first->begin(), last->end(), new_parent);
added = true;
need_sanity_check_ = true;
need_coalesce_characters_ = true;
}
return added;
}
void HtmlParse::FixParents(const HtmlEventListIterator& begin,
const HtmlEventListIterator& end_inclusive,
HtmlElement* new_parent) {
HtmlEvent* event = *begin;
HtmlNode* first = event->GetNode();
HtmlElement* original_parent = first->parent();
// Loop over all the nodes from begin to end, inclusive,
// and set the parent pointer for the node, if there is one. A few
// event types don't have HtmlNodes, such as Comments and IEDirectives.
message_handler_->Check(end_inclusive != queue_.end(),
"end_inclusive == queue_.end()");
HtmlEventListIterator end = end_inclusive;
++end;
for (HtmlEventListIterator p = begin; p != end; ++p) {
HtmlNode* node = (*p)->GetNode();
if ((node != NULL) && (node->parent() == original_parent)) {
node->set_parent(new_parent);
}
}
}
bool HtmlParse::MoveCurrentInto(HtmlElement* new_parent) {
bool moved = false;
HtmlNode* node = (*current_)->GetNode();
if ((node != NULL) && (node != new_parent) &&
IsRewritable(node) && IsRewritable(new_parent)) {
HtmlEventListIterator begin = node->begin();
HtmlEventListIterator end = node->end();
++end; // splice is non-inclusive for the 'end' iterator.
// Manipulate current_ so that when Flush() iterates it lands
// you on object after current_'s original position, rather
// than re-iterating over the new_parent's EndElement event.
current_ = end;
queue_.splice(new_parent->end(), queue_, begin, end);
--current_;
// TODO(jmarantz): According to
// http://www.cplusplus.com/reference/stl/list/splice/
// the moved iterators are no longer valid, and we
// are retaining them in the HtmlNode, so we need to fix them.
//
// However, in practice they appear to remain valid. And
// I can't think of a reason they should be invalidated,
// as the iterator is a pointer to a node structure with
// next/prev pointers. splice can mutate the next/prev pointers
// in place.
//
// See http://stackoverflow.com/questions/143156
FixParents(node->begin(), node->end(), new_parent);
moved = true;
need_sanity_check_ = true;
need_coalesce_characters_ = true;
}
return moved;
}
bool HtmlParse::DeleteElement(HtmlNode* node) {
bool deleted = false;
if (IsRewritable(node)) {
bool done = false;
// If node is an HtmlLeafNode, then begin() and end() might be the same.
for (HtmlEventListIterator p = node->begin(); !done; ) {
// We want to include end, so once p == end we still have to do one more
// iteration.
done = (p == node->end());
// Clean up any nested elements/leaves as we get to their 'end' event.
HtmlEvent* event = *p;
HtmlNode* nested_node = event->GetEndElement();
if (nested_node == NULL) {
nested_node = event->GetLeafNode();
}
if (nested_node != NULL) {
std::set<HtmlNode*>::iterator iter = nodes_.find(nested_node);
message_handler_->Check(iter != nodes_.end(), "iter == nodes_.end()");
message_handler_->Check(nested_node->live(), "!nested_node->live()");
nested_node->MarkAsDead(queue_.end());
}
// Check if we're about to delete the current event.
bool move_current = (p == current_);
p = queue_.erase(p);
if (move_current) {
current_ = p; // p is the event *after* the old current.
--current_; // Go to *previous* event so that we don't skip p.
deleted_current_ = true;
line_number_ = (*current_)->line_number();
}
delete event;
}
// Our iteration should have covered the passed-in element as well.
message_handler_->Check(!node->live(), "node->live()");
deleted = true;
need_sanity_check_ = true;
need_coalesce_characters_ = true;
}
return deleted;
}
bool HtmlParse::DeleteSavingChildren(HtmlElement* element) {
bool deleted = false;
if (IsRewritable(element)) {
HtmlElement* new_parent = element->parent();
HtmlEventListIterator first = element->begin();
++first;
HtmlEventListIterator last = element->end();
if (first != last) {
--last;
FixParents(first, last, new_parent);
queue_.splice(element->begin(), queue_, first, element->end());
need_sanity_check_ = true;
need_coalesce_characters_ = true;
}
deleted = DeleteElement(element);
}
return deleted;
}
bool HtmlParse::ReplaceNode(HtmlNode* existing_node, HtmlNode* new_node) {
bool replaced = false;
if (IsRewritable(existing_node)) {
replaced = InsertElementBeforeElement(existing_node, new_node);
message_handler_->Check(replaced, "!replaced");
replaced = DeleteElement(existing_node);
message_handler_->Check(replaced, "!replaced");
}
return replaced;
}
bool HtmlParse::IsRewritable(const HtmlNode* node) const {
return IsInEventWindow(node->begin()) && IsInEventWindow(node->end());
}
bool HtmlParse::IsInEventWindow(const HtmlEventListIterator& iter) const {
return iter != queue_.end();
}
void HtmlParse::ClearElements() {
for (std::set<HtmlNode*>::iterator p = nodes_.begin(),
e = nodes_.end(); p != e; ++p) {
HtmlNode* node = *p;
delete node;
}
nodes_.clear();
}
void HtmlParse::DebugPrintQueue() {
for (HtmlEventList::iterator p = queue_.begin(), e = queue_.end();
p != e; ++p) {
std::string buf;
HtmlEvent* event = *p;
event->ToString(&buf);
long node_ptr = reinterpret_cast<long>(event->GetNode());
if (p == current_) {
fprintf(stdout, "* %s (0x%lx)\n", buf.c_str(), node_ptr);
} else {
fprintf(stdout, " %s (0x%lx)\n", buf.c_str(), node_ptr);
}
}
fflush(stdout);
}
bool HtmlParse::IsImplicitlyClosedTag(Atom tag) const {
return lexer_->IsImplicitlyClosedTag(tag);
}
bool HtmlParse::TagAllowsBriefTermination(Atom tag) const {
return lexer_->TagAllowsBriefTermination(tag);
}
void HtmlParse::InfoV(
const char* file, int line, const char *msg, va_list args) {
message_handler_->InfoV(file, line, msg, args);
}
void HtmlParse::WarningV(
const char* file, int line, const char *msg, va_list args) {
message_handler_->WarningV(file, line, msg, args);
}
void HtmlParse::ErrorV(
const char* file, int line, const char *msg, va_list args) {
message_handler_->ErrorV(file, line, msg, args);
}
void HtmlParse::FatalErrorV(
const char* file, int line, const char* msg, va_list args) {
message_handler_->FatalErrorV(file, line, msg, args);
}
void HtmlParse::Info(const char* file, int line, const char* msg, ...) {
va_list args;
va_start(args, msg);
InfoV(file, line, msg, args);
va_end(args);
}
void HtmlParse::Warning(const char* file, int line, const char* msg, ...) {
va_list args;
va_start(args, msg);
WarningV(file, line, msg, args);
va_end(args);
}
void HtmlParse::Error(const char* file, int line, const char* msg, ...) {
va_list args;
va_start(args, msg);
ErrorV(file, line, msg, args);
va_end(args);
}
void HtmlParse::FatalError(const char* file, int line, const char* msg, ...) {
va_list args;
va_start(args, msg);
FatalErrorV(file, line, msg, args);
va_end(args);
}
void HtmlParse::InfoHere(const char* msg, ...) {
va_list args;
va_start(args, msg);
InfoHereV(msg, args);
va_end(args);
}
void HtmlParse::WarningHere(const char* msg, ...) {
va_list args;
va_start(args, msg);
WarningHereV(msg, args);
va_end(args);
}
void HtmlParse::ErrorHere(const char* msg, ...) {
va_list args;
va_start(args, msg);
ErrorHereV(msg, args);
va_end(args);
}
void HtmlParse::FatalErrorHere(const char* msg, ...) {
va_list args;
va_start(args, msg);
FatalErrorHereV(msg, args);
va_end(args);
}
void HtmlParse::CloseElement(
HtmlElement* element, HtmlElement::CloseStyle close_style,
int line_number) {
HtmlEndElementEvent* end_event =
new HtmlEndElementEvent(element, line_number);
element->set_close_style(close_style);
AddEvent(end_event);
element->set_end(Last());
element->set_end_line_number(line_number);
}
} // namespace net_instaweb