blob: 0b6457563cef48e8b54a3cbb918964d3b0120490 [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmarantz@google.com (Joshua Marantz)
#include "pagespeed/kernel/html/html_writer_filter.h"
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/message_handler.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/base/writer.h"
#include "pagespeed/kernel/html/html_element.h"
#include "pagespeed/kernel/html/html_name.h"
#include "pagespeed/kernel/html/html_node.h"
#include "pagespeed/kernel/html/html_parse.h"
namespace net_instaweb {
namespace {
// Default value for max_column_.  StartElement() only considers wrapping an
// attribute list when max_column_ is positive, so this default leaves
// line-wrapping switched off.
const int kDefaultMaxColumn = -1;
}  // namespace
// Builds a writer filter bound to html_parse.  The filter starts with no
// output writer attached (writer_ is NULL), the default column limit, and
// case folding of names turned off.  The remaining per-document state
// (pending lazy-close element, column, error count) is set up by Clear().
HtmlWriterFilter::HtmlWriterFilter(HtmlParse* html_parse)
    : html_parse_(html_parse),
      writer_(NULL),
      max_column_(kDefaultMaxColumn),
      case_fold_(false) {
  Clear();
}
// Nothing to release explicitly here; the destructor body is intentionally
// empty.
HtmlWriterFilter::~HtmlWriterFilter() {}
void HtmlWriterFilter::Clear() {
lazy_close_element_ = NULL;
column_ = 0;
write_errors_ = 0;
}
// If a start tag is still waiting for its closing delimiter (see
// StartElement's handling of BRIEF_CLOSE), finishes it with a plain ">" and
// advances the column by one.  Does nothing when no element is pending.
void HtmlWriterFilter::TerminateLazyCloseElement() {
  if (lazy_close_element_ == NULL) {
    return;
  }

  // Clear the pending marker before writing so the tag is closed only once.
  lazy_close_element_ = NULL;
  const bool wrote_ok = writer_->Write(">", html_parse_->message_handler());
  if (!wrote_ok) {
    ++write_errors_;
  }
  // The column advances whether or not the write succeeded.
  ++column_;
}
void HtmlWriterFilter::EmitBytes(const StringPiece& str) {
TerminateLazyCloseElement();
// Search backward from the end for the last occurrence of a newline.
column_ += str.size(); // if there are no newlines, bump up column counter.
for (int i = str.size() - 1; i >= 0; --i) {
if (str[i] == '\n') {
column_ = str.size() - i - 1; // found a newline; so reset the column.
break;
}
}
if (!writer_->Write(str, html_parse_->message_handler())) {
++write_errors_;
}
}
// Emits a tag or attribute name.  When case folding is enabled the name is
// copied into the reusable case_fold_buffer_ member, lower-cased there, and
// that copy is written; otherwise the name is written exactly as stored.
void HtmlWriterFilter::EmitName(const HtmlName& name) {
  if (!case_fold_) {
    EmitBytes(name.value());
    return;
  }

  name.value().CopyToString(&case_fold_buffer_);
  LowerString(&case_fold_buffer_);
  EmitBytes(case_fold_buffer_);
}
// Writes the start tag for element: "<", the tag name, then every attribute
// as ` name` or ` name=<quote>value<quote>`.  Elements whose style resolves
// to INVISIBLE produce no output at all.  For BRIEF_CLOSE elements the
// closing delimiter is withheld so that EndElement can still choose "/>".
void HtmlWriterFilter::StartElement(HtmlElement* element) {
  const HtmlElement::Style style = GetElementStyle(element);
  if (style == HtmlElement::INVISIBLE) {
    return;
  }

  EmitBytes("<");
  EmitName(element->name());

  const HtmlElement::AttributeList& attribute_list = element->attributes();
  for (HtmlElement::AttributeConstIterator iter(attribute_list.begin());
       iter != attribute_list.end(); ++iter) {
    const HtmlElement::Attribute& attr = *iter;
    // NULL means the attribute has no value (e.g. a bare `disabled`).
    const char* const escaped = attr.escaped_value();

    // When a column limit is set and this attribute would run past it, break
    // the line first.  It's always safe to insert whitespace in the middle of
    // tag parameters.
    if (max_column_ > 0) {
      // Estimated width: a separating space plus the name, plus "=" and the
      // value when one is present.  Quote characters are not counted.
      int width = 1 + attr.name_str().size();
      if (escaped != NULL) {
        width += 1 + strlen(escaped);
      }
      if ((column_ + width) > max_column_) {
        EmitBytes("\n");
      }
    }

    EmitBytes(" ");
    EmitName(attr.name());
    if (escaped != NULL) {
      const StringPiece quote = attr.quote_str();
      EmitBytes("=");
      EmitBytes(quote);
      EmitBytes(escaped);
      EmitBytes(quote);
    }
  }

  // Attempt to briefly terminate any legal tag that was explicitly terminated
  // in the input.  A rewrite pass might have injected events between the
  // begin/end of an element that was closed briefly in the input html, in
  // which case it cannot be closed briefly any more.  So for BRIEF_CLOSE we
  // only remember the element here; the next emitted bytes will finish the
  // tag with ">" unless EndElement gets there first and writes "/>".
  //
  // TODO(jmarantz): Add a rewrite pass that morphs EXPLICIT_CLOSE into 'brief'
  // when legal.  Such a change will introduce textual diffs between
  // input and output html that would cause htmlparse unit tests to require
  // a regold.  But the changes could be validated with the normalizer.
  if (style != HtmlElement::BRIEF_CLOSE) {
    EmitBytes(">");
  } else {
    lazy_close_element_ = element;
  }
}
// Resolves the tag-closing style to use when serializing element.  Any style
// other than AUTO_CLOSE is returned unchanged.  AUTO_CLOSE is resolved from
// the element's keyword, so this function never returns AUTO_CLOSE:
//   - implicitly or optionally closed tags  -> IMPLICIT_CLOSE
//   - tags that allow brief termination     -> BRIEF_CLOSE
//   - everything else                       -> EXPLICIT_CLOSE
HtmlElement::Style HtmlWriterFilter::GetElementStyle(HtmlElement* element) {
  const HtmlElement::Style declared_style = element->style();
  if (declared_style != HtmlElement::AUTO_CLOSE) {
    return declared_style;
  }

  const HtmlName::Keyword keyword = element->keyword();

  // Avoid writing a closing tag when the original HTML was <li>1<li>2.  We
  // want the correct structure in our API, but do not want to emit a more
  // verbose form than the original HTML had when the browser will interpret
  // it correctly as is.
  //
  // NOTE(review): the original author's comment states that programmatically
  // inserted tags for which IsOptionallyClosedTag is true will be explicitly
  // closed by default -- presumably because such elements do not carry
  // AUTO_CLOSE; confirm against how HtmlElement styles are assigned.
  if (html_parse_->IsImplicitlyClosedTag(keyword) ||
      html_parse_->IsOptionallyClosedTag(keyword)) {
    return HtmlElement::IMPLICIT_CLOSE;
  }
  if (html_parse_->TagAllowsBriefTermination(keyword)) {
    return HtmlElement::BRIEF_CLOSE;
  }
  return HtmlElement::EXPLICIT_CLOSE;
}
// Writes whatever is needed to close element, based on its resolved style:
// "/>" for a brief close that is still pending, "</name>" for an explicit
// close, and nothing for implicitly closed, unclosed, or invisible elements.
void HtmlWriterFilter::EndElement(HtmlElement* element) {
  HtmlElement::Style close_style = GetElementStyle(element);

  // Even if the element is briefly closeable, it can only end with "/>" while
  // it is still the pending lazy-close element.  If anything was emitted
  // after its start tag, the ">" has already been written and the element
  // must be closed explicitly instead.
  if ((close_style == HtmlElement::BRIEF_CLOSE) &&
      (lazy_close_element_ != element)) {
    close_style = HtmlElement::EXPLICIT_CLOSE;
  }

  switch (close_style) {
    case HtmlElement::AUTO_CLOSE:
      // This cannot happen because GetElementStyle won't return AUTO_CLOSE.
      html_parse_->message_handler()->FatalError(
          __FILE__, __LINE__,
          "GetElementStyle should never return AUTO_CLOSE.");
      break;

    case HtmlElement::BRIEF_CLOSE:
      // Drop the pending marker first so the EmitBytes calls below do not
      // finish the start tag with a plain ">".
      lazy_close_element_ = NULL;
      // If the last attribute was unquoted, or lacked a value, then we need
      // a space here to ensure that HTML parsers don't interpret the '/' in
      // the '/>' as part of the attribute.
      if (!element->attributes().IsEmpty()) {
        const HtmlElement::Attribute& last_attr =
            *element->attributes().Last();
        if ((last_attr.escaped_value() == NULL) ||
            (last_attr.quote_style() == HtmlElement::NO_QUOTE)) {
          EmitBytes(" ");
        }
      }
      EmitBytes("/>");
      break;

    case HtmlElement::EXPLICIT_CLOSE:
      EmitBytes("</");
      EmitName(element->name());
      EmitBytes(">");
      break;

    case HtmlElement::IMPLICIT_CLOSE:
    case HtmlElement::UNCLOSED:
    case HtmlElement::INVISIBLE:
      // Nothing to write at the end of these elements.  For IMPLICIT_CLOSE
      // and UNCLOSED the ">" was already written in StartElement; for
      // INVISIBLE, StartElement wrote nothing either.
      break;
  }
}
// Writes the contents of a characters node verbatim; no escaping or
// transformation is applied here.
void HtmlWriterFilter::Characters(HtmlCharactersNode* chars) {
  EmitBytes(chars->contents());
}
// Writes a CDATA section: the node's contents wrapped in "<![CDATA[" and
// "]]>" delimiters.
void HtmlWriterFilter::Cdata(HtmlCdataNode* cdata) {
  EmitBytes("<![CDATA[");
  EmitBytes(cdata->contents());
  EmitBytes("]]>");
}
// Writes an HTML comment: the node's contents wrapped in "<!--" and "-->".
void HtmlWriterFilter::Comment(HtmlCommentNode* comment) {
  EmitBytes("<!--");
  EmitBytes(comment->contents());
  EmitBytes("-->");
}
// Writes an IE directive node.  It is serialized with the same "<!--" and
// "-->" delimiters as an ordinary comment, around the node's contents.
void HtmlWriterFilter::IEDirective(HtmlIEDirectiveNode* directive) {
  EmitBytes("<!--");
  EmitBytes(directive->contents());
  EmitBytes("-->");
}
// Writes a directive node: the node's contents wrapped in "<!" and ">".
void HtmlWriterFilter::Directive(HtmlDirectiveNode* directive) {
  EmitBytes("<!");
  EmitBytes(directive->contents());
  EmitBytes(">");
}
// Begins a new document by resetting the column, the write-error count, and
// any pending lazily-closed element via Clear().
void HtmlWriterFilter::StartDocument() {
  Clear();
}
// Finishes the document.  Emitting an empty string writes no content of its
// own, but EmitBytes first completes any start tag that is still waiting for
// its closing ">", so nothing is left dangling at the end of the output.
void HtmlWriterFilter::EndDocument() {
  EmitBytes("");
}
// Flushes the underlying writer, counting a failure in write_errors_.  Note
// that this does not complete a pending lazily-closed start tag; only the
// writer itself is flushed.
void HtmlWriterFilter::Flush() {
  const bool flushed_ok = writer_->Flush(html_parse_->message_handler());
  if (!flushed_ok) {
    ++write_errors_;
  }
}
// Unconditionally marks this filter as enabled.  disabled_reason is never
// written to, since the filter is never disabled here.
void HtmlWriterFilter::DetermineEnabled(GoogleString* disabled_reason) {
  set_is_enabled(true);
}
} // namespace net_instaweb