blob: a6469ebd47c20e1c0fa2029103d5c6081a623e8a [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: mdsteele@google.com (Matthew D. Steele)
#include "pagespeed/kernel/html/collapse_whitespace_filter.h"
#include <algorithm>
#include <cstddef>
#include <vector>
#include "base/logging.h"
#include "pagespeed/kernel/html/html_element.h"
#include "pagespeed/kernel/html/html_name.h"
#include "pagespeed/kernel/html/html_node.h"
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/string.h"
namespace net_instaweb {
class HtmlParse;
namespace {
// Elements whose text must be left exactly as written; whitespace inside them
// is never collapsed.  (This list is similar to, but not the same as,
// kLiteralTags in html_lexer.cc.)
//
// The entries must stay in strictly ascending Keyword order: membership is
// tested with std::binary_search, and the filter's constructor DCHECKs the
// ordering.
const HtmlName::Keyword kSensitiveTags[] = {
  HtmlName::kPre,
  HtmlName::kScript,
  HtmlName::kStyle,
  HtmlName::kTextarea,
};
// Returns true if |keyword| is one of the entries of kSensitiveTags.
// Relies on kSensitiveTags being sorted, since it uses a binary search.
bool IsSensitiveKeyword(HtmlName::Keyword keyword) {
  return std::binary_search(kSensitiveTags,
                            kSensitiveTags + arraysize(kSensitiveTags),
                            keyword);
}
} // namespace
// Stores |html_parse| in html_parse_ and, in builds where DCHECK is active,
// verifies that kSensitiveTags is in strictly ascending order, which the
// binary search in IsSensitiveKeyword depends on.
CollapseWhitespaceFilter::CollapseWhitespaceFilter(HtmlParse* html_parse)
    : html_parse_(html_parse) {
  const size_t num_tags = arraysize(kSensitiveTags);
  size_t i = 1;
  while (i < num_tags) {
    // Each entry must compare strictly greater than its predecessor.
    DCHECK(kSensitiveTags[i - 1] < kSensitiveTags[i]);
    ++i;
  }
}
// Empty destructor body: nothing is released explicitly here, and the
// html_parse_ pointer stored by the constructor is not deleted.
CollapseWhitespaceFilter::~CollapseWhitespaceFilter() {}
// Resets per-document state by emptying keyword_stack_, the stack of
// whitespace-sensitive element keywords pushed by StartElement.
void CollapseWhitespaceFilter::StartDocument() {
keyword_stack_.clear();
}
// Records entry into a whitespace-sensitive element by pushing its keyword
// onto keyword_stack_.  Elements with any other keyword are ignored.
void CollapseWhitespaceFilter::StartElement(HtmlElement* element) {
  const HtmlName::Keyword tag = element->keyword();
  if (!IsSensitiveKeyword(tag)) {
    return;
  }
  keyword_stack_.push_back(tag);
}
// Records exit from a whitespace-sensitive element: if the closing element's
// keyword matches the top of keyword_stack_, that entry is popped.  Otherwise
// the stack is left alone, and we DCHECK that the closing element is not a
// sensitive one.
void CollapseWhitespaceFilter::EndElement(HtmlElement* element) {
  const HtmlName::Keyword keyword = element->keyword();
  const bool closes_innermost =
      !keyword_stack_.empty() && (keyword_stack_.back() == keyword);
  if (!closes_innermost) {
    DCHECK(!IsSensitiveKeyword(keyword));
    return;
  }
  keyword_stack_.pop_back();
}
// Collapses each run of whitespace in |characters| down to one character,
// rewriting the node's contents in place.  Nothing is done while
// keyword_stack_ is non-empty, i.e. while inside an element whose keyword
// StartElement recorded as whitespace-sensitive.
//
// The character kept for a run is the run's first whitespace character,
// unless the run contains a newline, in which case a single '\n' is kept.
// Whitespace is the set from
// http://www.w3.org/TR/html401/struct/text.html#h-9.1
// (space, tab, carriage return, form feed) plus newline.
void CollapseWhitespaceFilter::Characters(HtmlCharactersNode* characters) {
  if (!keyword_stack_.empty()) {
    return;
  }

  // The string is compacted in place.  That is safe because the output can
  // only be as long as or shorter than the input, so the write cursor never
  // overtakes the read cursor.
  GoogleString* text = characters->mutable_contents();
  char* const begin = &(*text)[0];
  const char* const limit = begin + text->size();
  char* out = begin;

  // True when the most recently emitted character was whitespace.
  bool last_was_space = false;
  for (const char* in = begin; in != limit; ++in) {
    const char c = *in;
    if (c == '\n') {
      // A newline takes precedence over whatever whitespace character the
      // current run has emitted so far: step back so it gets overwritten.
      if (last_was_space) {
        --out;
      }
      *out++ = c;
      last_was_space = true;
    } else if (c == ' ' || c == '\t' || c == '\r' || c == '\f') {
      // Only the first character of a run is emitted; it may later be
      // replaced by a newline from the same run.
      if (!last_was_space) {
        *out++ = c;
        last_was_space = true;
      }
    } else {
      // Non-whitespace is copied through and ends the current run.
      *out++ = c;
      last_was_space = false;
    }
  }

  // Drop the now-unused tail of the buffer.
  text->resize(out - text->data());
}
} // namespace net_instaweb