// blob: d12520a171361ca6cfc4230ab26ec757f0b683c5 [file] [log] [blame]
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: mdsteele@google.com (Matthew D. Steele)
#include "net/instaweb/rewriter/public/collapse_whitespace_filter.h"
#include "base/logging.h"
#include "net/instaweb/htmlparse/public/html_parse.h"
#include "net/instaweb/htmlparse/public/html_element.h"
#include "net/instaweb/htmlparse/public/html_node.h"
#include <string>
namespace {
// Names of the elements whose text content must be left untouched by
// whitespace collapsing.  (This list is similar to, but deliberately not
// identical to, kLiteralTags in html_lexer.cc.)
const char* const kSensitiveTags[] = {
  "pre",
  "script",
  "style",
  "textarea"
};
// Returns true iff |ch| is one of the ASCII characters this filter treats as
// HTML whitespace: space, tab, line feed, form feed or carriage return.
// See http://www.w3.org/TR/html401/struct/text.html#h-9.1
bool IsHtmlWhiteSpace(char ch) {
  switch (ch) {
    case ' ':
    case '\t':
    case '\n':
    case '\f':
    case '\r':
      return true;
    default:
      return false;
  }
}
// Marker used by CollapseWhitespace to mean "not currently inside a run of
// whitespace".
const char kNotInWhitespace = '\0';

// Appends |input| to |*output|, squeezing every maximal run of HTML whitespace
// down to a single character.  The character kept is the first one of the run,
// unless the run contains a newline, in which case a newline is kept instead.
// Existing contents of |*output| are preserved; the result is appended.
void CollapseWhitespace(const std::string& input, std::string* output) {
  // The character that will be emitted for the whitespace run currently being
  // scanned, or kNotInWhitespace when the scan is outside any run.
  char pending = kNotInWhitespace;
  const std::string::size_type length = input.size();
  for (std::string::size_type pos = 0; pos < length; ++pos) {
    const char current = input[pos];
    if (!IsHtmlWhiteSpace(current)) {
      // A non-whitespace character ends the run (if any): flush the single
      // representative first, then copy the character through unchanged.
      if (pending != kNotInWhitespace) {
        output->push_back(pending);
        pending = kNotInWhitespace;
      }
      output->push_back(current);
      continue;
    }
    // Inside a run: remember its first character, but let a newline override
    // whatever was remembered (purely for nicer-looking output).
    if (current == '\n' || pending == kNotInWhitespace) {
      pending = current;
    }
  }
  // The input may end in the middle of a run; emit its representative too.
  if (pending != kNotInWhitespace) {
    output->push_back(pending);
  }
}
} // namespace
namespace net_instaweb {
// Builds a filter bound to |html_parse| and fills sensitive_tags_ with the
// interned form of every name listed in kSensitiveTags.
CollapseWhitespaceFilter::CollapseWhitespaceFilter(HtmlParse* html_parse)
    : html_parse_(html_parse) {
  const char* const* const names_end =
      kSensitiveTags + arraysize(kSensitiveTags);
  for (const char* const* name = kSensitiveTags; name != names_end; ++name) {
    sensitive_tags_.insert(html_parse->Intern(*name));
  }
}
// Empties atom_stack_ (the stack of sensitive tags pushed by StartElement and
// popped by EndElement) so that no entries carry over into the document that
// is starting.
void CollapseWhitespaceFilter::StartDocument() {
atom_stack_.clear();
}
// Called when an element opens.  If its tag is one of the whitespace-sensitive
// tags, the tag is pushed onto atom_stack_; Characters() leaves text alone
// whenever that stack is non-empty.
void CollapseWhitespaceFilter::StartElement(HtmlElement* element) {
  const Atom opened = element->tag();
  if (sensitive_tags_.count(opened) == 0) {
    return;  // Not a sensitive tag: nothing to track.
  }
  atom_stack_.push_back(opened);
}
// Called when an element closes.  If its tag matches the sensitive tag on top
// of atom_stack_, that entry is popped.  Otherwise nothing is popped, and a
// debug check asserts that the closing tag is not a sensitive one.
void CollapseWhitespaceFilter::EndElement(HtmlElement* element) {
  const Atom tag = element->tag();
  const bool closes_top_of_stack =
      !atom_stack_.empty() && tag == atom_stack_.back();
  if (!closes_top_of_stack) {
    DCHECK(sensitive_tags_.count(tag) == 0);
    return;
  }
  atom_stack_.pop_back();
}
// Called for each run of character data.  While any sensitive tag is open
// (atom_stack_ non-empty) the node is left as is; otherwise it is replaced by
// a new characters node, under the same parent, holding the
// whitespace-collapsed text.
void CollapseWhitespaceFilter::Characters(HtmlCharactersNode* characters) {
  if (!atom_stack_.empty()) {
    return;  // Inside a sensitive element: whitespace must be preserved.
  }
  std::string collapsed;
  CollapseWhitespace(characters->contents(), &collapsed);
  html_parse_->ReplaceNode(
      characters,
      html_parse_->NewCharactersNode(characters->parent(), collapsed));
}
} // namespace net_instaweb