blob: 1296a2fcfbe6eef63544562c13d8fc37dd562a40 [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmaessen@google.com (Jan Maessen)
#include "pagespeed/kernel/html/html_attribute_quote_removal.h"
#include "pagespeed/kernel/html/doctype.h"
#include "pagespeed/kernel/html/html_element.h"
#include "pagespeed/kernel/html/html_parse.h"
#include "pagespeed/kernel/base/string.h"
namespace {
// Every byte that may appear in an attribute value without forcing the value
// to stay quoted.  The element type is unsigned char on purpose: each entry
// is used directly as an index into a 0-based lookup table, so it must never
// be negative.
const unsigned char kNoQuoteChars[] =
    "abcdefghijklmnopqrstuvwxyz"  // lower-case letters
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"  // upper-case letters
    "0123456789"                  // digits
    "-._:";                       // permitted punctuation
}  // namespace
// Compile-time switch for diagnostics: when true, StartElement reports the
// number of attributes whose quotes it removed through HtmlParse::InfoHere.
// It is hard-wired to false here, so that message is never emitted.
// TODO(jmaessen): Make adjustable.
const bool kLogQuoteRemoval = false;
namespace net_instaweb {
// Remove quotes; see description in .h file.
HtmlAttributeQuoteRemoval::HtmlAttributeQuoteRemoval(HtmlParse* html_parse)
    : total_quotes_removed_(0),
      html_parse_(html_parse) {
  // Build the per-byte lookup table used by NeedsQuotes: clear every entry,
  // then flag each byte listed in kNoQuoteChars as safe to leave unquoted.
  // TODO(jmarantz): put this in a static Initialize method to avoid
  // per-request construction costs.
  memset(&needs_no_quotes_, 0, sizeof(needs_no_quotes_));
  for (const unsigned char* safe = kNoQuoteChars; *safe != '\0'; ++safe) {
    needs_no_quotes_[*safe] = true;
  }
  // All 8-bit characters can remain unquoted.
  // TODO(jmarantz): uncomment in a follow-up.  This should be fine.
  // for (int i = 128; i < 256; ++i) {
  //   needs_no_quotes_[i] = true;
  // }
}
// Destructor: the body is empty, so no explicit cleanup is performed here.
HtmlAttributeQuoteRemoval::~HtmlAttributeQuoteRemoval() {}
bool HtmlAttributeQuoteRemoval::NeedsQuotes(const char *val) {
bool needs_quotes = false;
int i = 0;
if (val != NULL) {
for (; val[i] != '\0'; ++i) {
// Explicit cast to unsigned char ensures that our offset
// into needs_no_quotes_ is positive.
needs_quotes = !needs_no_quotes_[static_cast<unsigned char>(val[i])];
if (needs_quotes) {
break;
}
}
}
// Note that due to inconsistencies in empty attribute parsing between Firefox
// and Chrome (Chrome seems to parse the next thing it sees after whitespace
// as the attribute value) we leave empty attributes intact.
return needs_quotes || i == 0;
}
// Drops the quotes from every attribute of `element` whose escaped value
// NeedsQuotes() reports as safe, and adds the number of attributes changed
// to total_quotes_removed_.  Does nothing when the document's doctype is
// XHTML.
void HtmlAttributeQuoteRemoval::StartElement(HtmlElement* element) {
  // TODO(jmarantz): switch to using mimetype.  To do that we need to have
  // access to the RewriteDriver* to get the response-headers, and so this
  // is not compatible with PageSpeed Insights that uses this filter for
  // HTML minification.
  if (html_parse_->doctype().IsXhtml()) {
    // XHTML doctypes require quotes, so every attribute is left as-is.
    return;
  }

  HtmlElement::AttributeList* attributes = element->mutable_attributes();
  int num_unquoted = 0;
  for (HtmlElement::AttributeIterator it(attributes->begin());
       it != attributes->end(); ++it) {
    HtmlElement::Attribute& attribute = *it;
    if (attribute.quote_style() == HtmlElement::NO_QUOTE) {
      continue;  // Already unquoted; nothing to do.
    }
    if (NeedsQuotes(attribute.escaped_value())) {
      continue;  // Value is not safe to emit without quotes.
    }
    attribute.set_quote_style(HtmlElement::NO_QUOTE);
    ++num_unquoted;
  }

  if (num_unquoted <= 0) {
    return;
  }
  total_quotes_removed_ += num_unquoted;
  if (kLogQuoteRemoval) {
    const char* suffix = "s";
    if (num_unquoted == 1) {
      suffix = "";
    }
    html_parse_->InfoHere("Scrubbed quotes from %d attribute%s",
                          num_unquoted, suffix);
  }
}
} // namespace net_instaweb