blob: 54ff31a33d6ae1173d27fabe29d23343653f565e [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmarantz@google.com (Joshua Marantz)
#include "pagespeed/kernel/html/html_keywords.h"
#include <algorithm>
#include <cstddef>
#include <map>
#include <utility>
#include "base/logging.h"
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/base/writer.h"
#include "pagespeed/kernel/html/html_name.h"
namespace net_instaweb {
namespace {
struct HtmlKeywordsSequence {
const char* sequence;
const unsigned char value[3];
};
// TODO(jmarantz): the multi-byte sequences are not working yet.
HtmlKeywordsSequence kHtmlKeywordsSequences[] = {
{ "AElig", {0xC6, 0x0} },
{ "Aacute", {0xC1, 0x0} },
{ "Acirc", {0xC2, 0x0} },
{ "Agrave", {0xC0, 0x0} },
{ "Aring", {0xC5, 0x0} },
{ "Atilde", {0xC3, 0x0} },
{ "Auml", {0xC4, 0x0} },
{ "Ccedil", {0xC7, 0x0} },
{ "ETH", {0xD0, 0x0} },
{ "Eacute", {0xC9, 0x0} },
{ "Ecirc", {0xCA, 0x0} },
{ "Egrave", {0xC8, 0x0} },
{ "Euml", {0xCB, 0x0} },
{ "Iacute", {0xCD, 0x0} },
{ "Icirc", {0xCE, 0x0} },
{ "Igrave", {0xCC, 0x0} },
{ "Iuml", {0xCF, 0x0} },
{ "Ntilde", {0xD1, 0x0} },
{ "Oacute", {0xD3, 0x0} },
{ "Ocirc", {0xD4, 0x0} },
{ "Ograve", {0xD2, 0x0} },
{ "Oslash", {0xD8, 0x0} },
{ "Otilde", {0xD5, 0x0} },
{ "Ouml", {0xD6, 0x0} },
{ "THORN", {0xDE, 0x0} },
{ "Uacute", {0xDA, 0x0} },
{ "Ucirc", {0xDB, 0x0} },
{ "Ugrave", {0xD9, 0x0} },
{ "Uuml", {0xDC, 0x0} },
{ "Yacute", {0xDD, 0x0} },
{ "aacute", {0xE1, 0x0} },
{ "acirc", {0xE2, 0x0} },
{ "acute", {0xB4, 0x0} },
{ "aelig", {0xE6, 0x0} },
{ "agrave", {0xE0, 0x0} },
{ "amp", {0x26, 0x0} },
{ "aring", {0xE5, 0x0} },
{ "atilde", {0xE3, 0x0} },
{ "auml", {0xE4, 0x0} },
{ "brvbar", {0xA6, 0x0} },
{ "ccedil", {0xE7, 0x0} },
{ "cedil", {0xB8, 0x0} },
{ "cent", {0xA2, 0x0} },
{ "copy", {0xA9, 0x0} },
{ "curren", {0xA4, 0x0} },
{ "deg", {0xB0, 0x0} },
{ "divide", {0xF7, 0x0} },
{ "eacute", {0xE9, 0x0} },
{ "ecirc", {0xEA, 0x0} },
{ "egrave", {0xE8, 0x0} },
{ "eth", {0xF0, 0x0} },
{ "euml", {0xEB, 0x0} },
{ "frac12", {0xBD, 0x0} },
{ "frac14", {0xBC, 0x0} },
{ "frac34", {0xBE, 0x0} },
{ "gt", {0x3E, 0x0} },
{ "iacute", {0xED, 0x0} },
{ "icirc", {0xEE, 0x0} },
{ "iexcl", {0xA1, 0x0} },
{ "igrave", {0xEC, 0x0} },
{ "iquest", {0xBF, 0x0} },
{ "iuml", {0xEF, 0x0} },
{ "laquo", {0xAB, 0x0} },
{ "lt", {0x3C, 0x0} },
{ "macr", {0xAF, 0x0} },
{ "micro", {0xB5, 0x0} },
{ "middot", {0xB7, 0x0} },
{ "nbsp", {0xA0, 0x0} },
{ "not", {0xAC, 0x0} },
{ "ntilde", {0xF1, 0x0} },
{ "oacute", {0xF3, 0x0} },
{ "ocirc", {0xF4, 0x0} },
{ "ograve", {0xF2, 0x0} },
{ "ordf", {0xAA, 0x0} },
{ "ordm", {0xBA, 0x0} },
{ "oslash", {0xF8, 0x0} },
{ "otilde", {0xF5, 0x0} },
{ "ouml", {0xF6, 0x0} },
{ "para", {0xB6, 0x0} },
{ "plusmn", {0xB1, 0x0} },
{ "pound", {0xA3, 0x0} },
{ "quot", {0x22, 0x0} },
{ "raquo", {0xBB, 0x0} },
{ "reg", {0xAE, 0x0} },
{ "sect", {0xA7, 0x0} },
{ "shy", {0xAD, 0x0} },
{ "sup1", {0xB9, 0x0} },
{ "sup2", {0xB2, 0x0} },
{ "sup3", {0xB3, 0x0} },
{ "szlig", {0xDF, 0x0} },
{ "thorn", {0xFE, 0x0} },
{ "times", {0xD7, 0x0} },
{ "uacute", {0xFA, 0x0} },
{ "ucirc", {0xFB, 0x0} },
{ "ugrave", {0xF9, 0x0} },
{ "uml", {0xA8, 0x0} },
{ "uuml", {0xFC, 0x0} },
{ "yacute", {0xFD, 0x0} },
{ "yen", {0xA5, 0x0} },
{ "yuml", {0xFF, 0x0} }
};
// http://www.w3.org/TR/html4/sgml/entities.html contains a list of multi-byte
// codes. When we see any of these in an HTML attribute, we cannot currently
// unescape it, because we have no general strategy for multi-byte encoding.
//
// TODO(jmarantz): 1. We could adopt UTF-8 as our in-memory representation for
// manipulating attribute values, independent of the source document encoding.
//
// TODO(jmarantz): 2. We could sense UTF-8 encoding on documents based
// on Content-Type &/or meta-tags & thereby allow parsing of HTML
// attributes with utf-8 encodings. This is not strictly necessary
// for URL attributes where this is not allowed. See
// http://www.w3.org/TR/html401/appendix/notes.html#h-B.2.
//
//
// See also: http://www.w3.org/TR/2011/WD-html5-20110113/
// named-character-references.html#named-character-references
// which appears to contradict other references in the validity of &apos; and
// likely other details. This may be issue of HTML5 vs HTML4.
struct HtmlMultiByteSequence {
const char* sequence;
int value; // ignored for now.
};
struct HtmlMultiByteSequenceCompare {
// TODO(jmarantz): Write a version of StringCaseCompare based on const char*
// to avoid extra calls to strlen.
bool operator()(const HtmlMultiByteSequence& a, const char* b) const {
return StringCaseCompare(a.sequence, b) < 0;
}
bool operator()(const char* a, const HtmlMultiByteSequence& b) const {
return StringCaseCompare(a, b.sequence) < 0;
}
// Seems to be needed by MSVC in debug build.
bool operator()(const HtmlMultiByteSequence& a,
const HtmlMultiByteSequence& b) const {
return StringCaseCompare(a.sequence, b.sequence) < 0;
}
};
HtmlMultiByteSequence kHtmlKeywordsSequencesMultiByte[] = {
{ "alefsym", 8501 },
{ "Alpha", 913 },
{ "alpha", 945 },
{ "and", 8743 },
{ "ang", 8736 },
{ "asymp", 8776 },
{ "bdquo", 8222 },
{ "Beta", 914 },
{ "beta", 946 },
{ "bull", 8226 },
{ "cap", 8745 },
{ "Chi", 935 },
{ "chi", 967 },
{ "circ", 710 },
{ "clubs", 9827 },
{ "cong", 8773 },
{ "crarr", 8629 },
{ "cup", 8746 },
{ "dagger", 8224 },
{ "Dagger", 8225 },
{ "darr", 8595 },
{ "dArr", 8659 },
{ "Delta", 916 },
{ "delta", 948 },
{ "diams", 9830 },
{ "empty", 8709 },
{ "emsp", 8195 },
{ "ensp", 8194 },
{ "Epsilon", 917 },
{ "epsilon", 949 },
{ "equiv", 8801 },
{ "Eta", 919 },
{ "eta", 951 },
{ "euro", 8364 },
{ "exist", 8707 },
{ "fnof", 402 },
{ "forall", 8704 },
{ "frasl", 8260 },
{ "Gamma", 915 },
{ "gamma", 947 },
{ "ge", 8805 },
{ "harr", 8596 },
{ "hArr", 8660 },
{ "hearts", 9829 },
{ "hellip", 8230 },
{ "image", 8465 },
{ "infin", 8734 },
{ "int", 8747 },
{ "Iota", 921 },
{ "iota", 953 },
{ "isin", 8712 },
{ "Kappa", 922 },
{ "kappa", 954 },
{ "Lambda", 923 },
{ "lambda", 955 },
{ "lang", 9001 },
{ "larr", 8592 },
{ "lArr", 8656 },
{ "lceil", 8968 },
{ "ldquo", 8220 },
{ "le", 8804 },
{ "lfloor", 8970 },
{ "lowast", 8727 },
{ "loz", 9674 },
{ "lrm", 8206 },
{ "lsaquo", 8249 },
{ "lsquo", 8216 },
{ "mdash", 8212 },
{ "minus", 8722 },
{ "Mu", 924 },
{ "mu", 956 },
{ "nabla", 8711 },
{ "ndash", 8211 },
{ "ne", 8800 },
{ "ni", 8715 },
{ "notin", 8713 },
{ "nsub", 8836 },
{ "Nu", 925 },
{ "nu", 957 },
{ "OElig", 338 },
{ "oelig", 339 },
{ "oline", 8254 },
{ "Omega", 937 },
{ "omega", 969 },
{ "Omicron", 927 },
{ "omicron", 959 },
{ "oplus", 8853 },
{ "or", 8744 },
{ "otimes", 8855 },
{ "part", 8706 },
{ "permil", 8240 },
{ "perp", 8869 },
{ "Phi", 934 },
{ "phi", 966 },
{ "Pi", 928 },
{ "pi", 960 },
{ "piv", 982 },
{ "prime", 8242 },
{ "Prime", 8243 },
{ "prod", 8719 },
{ "prop", 8733 },
{ "Psi", 936 },
{ "psi", 968 },
{ "radic", 8730 },
{ "rang", 9002 },
{ "rarr", 8594 },
{ "rArr", 8658 },
{ "rceil", 8969 },
{ "rdquo", 8221 },
{ "real", 8476 },
{ "rfloor", 8971 },
{ "Rho", 929 },
{ "rho", 961 },
{ "rlm", 8207 },
{ "rsaquo", 8250 },
{ "rsquo", 8217 },
{ "sbquo", 8218 },
{ "Scaron", 352 },
{ "scaron", 353 },
{ "sdot", 8901 },
{ "Sigma", 931 },
{ "sigma", 963 },
{ "sigmaf", 962 },
{ "sim", 8764 },
{ "spades", 9824 },
{ "sub", 8834 },
{ "sube", 8838 },
{ "sum", 8721 },
{ "sup", 8835 },
{ "supe", 8839 },
{ "Tau", 932 },
{ "tau", 964 },
{ "there4", 8756 },
{ "Theta", 920 },
{ "theta", 952 },
{ "thetasym", 977 },
{ "thinsp", 8201 },
{ "tilde", 732 },
{ "trade", 8482 },
{ "uarr", 8593 },
{ "uArr", 8657 },
{ "upsih", 978 },
{ "Upsilon", 933 },
{ "upsilon", 965 },
{ "weierp", 8472 },
{ "Xi", 926 },
{ "xi", 958 },
{ "Yuml", 376 },
{ "Zeta", 918 },
{ "zeta", 950 },
{ "zwj", 8205 },
{ "zwnj", 8204 },
};
// String constants used to populate maps at initialization time.
// These are a little more expressive than static arrays of keywords.
// The penalty for this expressiveness is lack of compile-time checking,
// and startup time. But the compile-time checking is replaced by
// debug-only init checks.
// Tables are a 4-level hierarchy:
// table > [thead tbody tfoot] > tr > [td th]
//
// Note: we use trailing spaces in all these strings so that they can
// be concatenated more easily. Note that we use 'omit_empty_strings'
// when we interpret via SplitStringPieceToVector.
const char kTableLeaves[] = "td th ";
const char kTableSections[] = "tbody tfoot thead ";
const char kTableElements[] = "td th tbody tfoot thead table tr ";
// TODO(jmarantz): consider caption, col, colgroup.
// Formatting elements are terminated by many other tags.
const char kFormattingElements[] =
"b i em font strong small s cite q dfn abbr time code var "
"samp kbd sub u mark bdi bdo ";
// TODO(jmarantz): consider ins and del & potentially lots more.
const char kListElements[] = "li ol ul ";
const char kDeclarationElements[] = "dl dt dd ";
const char kParagraphTerminators[] =
"address article aside blockquote dir div dl fieldset "
"footer form h1 h2 h3 h4 h5 h6 header hgroup hr menu nav ol p "
"pre section table ul";
// TODO(jmarantz): handle & test Ruby containment.
// const char kRubyElements[] = "ruby rt rp ";
} // namespace
HtmlKeywords* HtmlKeywords::singleton_ = NULL;
HtmlKeywords::HtmlKeywords() {
InitEscapeSequences();
InitAutoClose();
InitContains();
InitOptionallyClosedKeywords();
}
void HtmlKeywords::InitEscapeSequences() {
unescape_insensitive_map_.set_deleted_key("");
unescape_sensitive_map_.set_deleted_key("");
escape_map_.set_deleted_key("");
StringSetInsensitive case_sensitive_symbols;
for (size_t i = 0; i < arraysize(kHtmlKeywordsSequences); ++i) {
// Put all symbols in the case-sensitive map
const HtmlKeywordsSequence& seq = kHtmlKeywordsSequences[i];
unescape_sensitive_map_[seq.sequence] =
reinterpret_cast<const char*>(seq.value);
// Don't populate the case-insensitive map for symbols that we've
// already determined are case-sensitive.
if (case_sensitive_symbols.find(seq.sequence) ==
case_sensitive_symbols.end()) {
// If this symbol is already present in the insensitive map, then it
// must be case-sensitive. E.g. &AElig; and &aelig; are distinct.
StringStringSparseHashMapInsensitive::iterator p =
unescape_insensitive_map_.find(seq.sequence);
if (p != unescape_insensitive_map_.end()) {
// As this symbol is case-sensitive, we must remove it from the
// case-insensitive map. This way we will report an error for
// &Aelig;, rather than &AElig; or &aelig; unpredictably.
unescape_insensitive_map_.erase(p);
case_sensitive_symbols.insert(seq.sequence);
} else {
unescape_insensitive_map_[seq.sequence] =
reinterpret_cast<const char*>(seq.value);
}
// For now, we will only generate symbolic escaped-names for
// single-byte sequences
if (strlen(reinterpret_cast<const char*>(seq.value)) == 1) {
escape_map_[reinterpret_cast<const char*>(seq.value)] = seq.sequence;
}
}
}
// Initialialize the keywords from HtmlName into a reverse table. This could
// have been generated by gperf, but it isn't. It's easy enough to build it
// given an iterator.
keyword_vector_.resize(HtmlName::num_keywords());
for (HtmlName::Iterator iter; !iter.AtEnd(); iter.Next()) {
DCHECK_GE(HtmlName::num_keywords(), iter.keyword());
keyword_vector_[iter.keyword()] = StringPiece(iter.name());
}
// Make sure the multi-byte sequences are in order (case-insensitive). Ties
// are OK to account for case differences.
for (size_t i = 1; i < arraysize(kHtmlKeywordsSequencesMultiByte); ++i) {
DCHECK(StringCaseCompare(kHtmlKeywordsSequencesMultiByte[i - 1].sequence,
kHtmlKeywordsSequencesMultiByte[i].sequence)
<= 0);
}
}
void HtmlKeywords::Init() {
if (singleton_ == NULL) {
singleton_ = new HtmlKeywords();
}
}
void HtmlKeywords::ShutDown() {
if (singleton_ != NULL) {
delete singleton_;
singleton_ = NULL;
}
}
StringPiece HtmlKeywords::UnescapeHelper(const StringPiece& escaped,
GoogleString* buf,
bool* decoding_error) const {
if (escaped.empty()) {
*decoding_error = false;
return escaped;
}
*decoding_error = true;
// It would be nice to short-circuit the loop below via a memchr
// looking for "&". Unfortunately we must at least also scan for
// 8-bit characters, as we cannot unescape those in a manner that's
// bidirectionally safe. Consider CanonicalAttributesTest.Spanish,
// where a non-utf8 multi-byte 8-bit character is present. If we
// short-circuit looking for "&" we'll wind up escaping each piece
// of the multi-byte sequence individually and that will not reverse
// properly.
// if (memchr(escaped.data(), '&', escaped.size()) == NULL) {
// return escaped;
// }
buf->clear();
// Attribute values may have HTML escapes in them, e.g.
// href="host.com/path?v1&amp;v2"
// Un-escape the attribute value here before populating the
// attribute data structure.
GoogleString escape;
uint32 numeric_value = 0;
bool accumulate_numeric_code = false;
bool hex_mode = false;
bool in_escape = false;
bool found_ampersand = false;
for (size_t i = 0; i < escaped.size(); ++i) {
uint8 ch = static_cast<uint8>(escaped[i]);
if (!in_escape) {
if (ch == '&') {
if (!found_ampersand) {
found_ampersand = true;
buf->append(escaped.data(), i);
}
in_escape = true;
escape.clear();
numeric_value = 0;
accumulate_numeric_code = false;
hex_mode = false;
} else if (ch > 127) {
return StringPiece(NULL, 0);
} else if (found_ampersand) {
*buf += ch;
}
} else if (escape.empty() && (ch == '#')) {
escape += ch;
accumulate_numeric_code = true;
if (((i + 1) < escaped.size()) && (UpperChar(escaped[i + 1]) == 'X')) {
hex_mode = true;
++i;
}
} else if (ch == ';') {
if (!TryUnescape(accumulate_numeric_code, numeric_value, escape,
true, buf)) {
return StringPiece(NULL, 0);
}
in_escape = false;
} else if (ch > 127) {
return StringPiece(NULL, 0);
} else {
// Accumulate an escape code based on the current mode (symbolic, decimal,
// or hex). If the character is not plausible to accumulate into the
// escape sequence, then consider it improperly terminated and try to
// look it up.
bool improperly_terminated = false;
if (accumulate_numeric_code) {
improperly_terminated =
((hex_mode && !AccumulateHexValue(ch, &numeric_value)) ||
(!hex_mode && !AccumulateDecimalValue(ch, &numeric_value)));
} else if (!IsAsciiAlphaNumeric(ch)) {
improperly_terminated = true;
}
if (improperly_terminated) {
if (TryUnescape(accumulate_numeric_code, numeric_value, escape, false,
buf)) {
DCHECK_LT(static_cast<size_t>(0), i);
--i; // re-examine ch now that we've unescaped what came before it.
in_escape = false;
} else {
return StringPiece(NULL, 0);
}
} else {
escape += ch;
}
}
}
if (!found_ampersand) {
*decoding_error = false;
return escaped;
}
if (in_escape) {
if (escape.empty()) {
buf->push_back('&');
} else if (!TryUnescape(accumulate_numeric_code, numeric_value,
escape, false, buf)) {
return StringPiece(NULL, 0);
}
}
*decoding_error = false;
return StringPiece(*buf);
}
bool HtmlKeywords::TryUnescape(bool accumulate_numeric_code,
uint32 numeric_value,
const GoogleString& escape,
bool was_terminated,
GoogleString* buf) const {
if (accumulate_numeric_code && (escape.size() > 1)) {
// See the table in the Description in
// http://en.wikipedia.org/wiki/UTF-8#Codepage_layout .
if (numeric_value <= 0xff) {
*buf += static_cast<char>(numeric_value);
} else {
return false;
}
} else {
// Some symbols are case-sensitive (AElig vs aelig are different
// code-points) whereas some are case-insensitive (&quot; and
// &QUOT; both work. So do the case-sensitive lookup first, and
// if that fails, do an insensitive lookup.
StringStringSparseHashMapSensitive::const_iterator p =
unescape_sensitive_map_.find(escape);
if (p != unescape_sensitive_map_.end()) {
*buf += p->second;
} else {
// The sensitive lookup failed, but allow, for example, &QUOT; to work
// in place of &quot;. However, note that "yuml" is single
// byte and "Yuml" is multi-byte, and we first look in our multi-byte
// set before allowing doing the case-insensitive lookup in our
// single-byte encodings list.
if (std::binary_search(
kHtmlKeywordsSequencesMultiByte,
(kHtmlKeywordsSequencesMultiByte +
arraysize(kHtmlKeywordsSequencesMultiByte)),
escape.c_str(), HtmlMultiByteSequenceCompare())) {
// Reject known multi-byte
// sequences e.g. (&hellip;), as we would wind up adding an extra &amp;
// e.g. (&amp;&hellip;).
return false;
} else {
// Handle the case the user has used the wrong case for a
// valid escape sequence, e.g. QUOT;, but there is no
// multi-byte match (e.g. Yuml;). We can allow sloppy
// interpretation with a case insensitive lookup here.
StringStringSparseHashMapInsensitive::const_iterator q =
unescape_insensitive_map_.find(escape);
if (q != unescape_insensitive_map_.end()) {
*buf += q->second;
} else {
// &apos; is a special case. It is *not* legal HTML but many
// web designers think it is. It does not work on IE. So
// recognize that & fix it.
// http://fishbowl.pastiche.org/2003/07/01/the_curse_of_apos/
//
// This is special-cased here because we want to unescape &apos;
// into a single-quote, but do not want to re-escape ' into &apos;,
// but instead into &#39;. If we put that in the keyword map
// above then we'd get the reverse mapping we don't want.
//
// TODO(jmarantz): consider allowing ' to &apos; for XHTML.
if (StringCaseEqual(escape, "apos")) {
*buf += '\'';
} else {
// Let random words through (e.g. accept a&b literal, which we would
// re-escape & "improve" as a&amp;b).
*buf += "&";
*buf += escape;
if (was_terminated) {
*buf += ";";
}
}
}
}
}
}
return true;
}
StringPiece HtmlKeywords::EscapeHelper(const StringPiece& unescaped,
GoogleString* buf) const {
if (unescaped.data() == NULL) {
return unescaped;
}
buf->clear();
GoogleString char_to_escape;
for (size_t i = 0; i < unescaped.size(); ++i) {
int ch = static_cast<unsigned char>(unescaped[i]);
// According to http://www.htmlescape.net/htmlescape_tool.html,
// single-quote does not need to be escaped. However, input HTML
// might have used single-quote to quote attribute values, in
// which case we better escape any single-quotes in the value.
//
// This function, unfortunately, does not know what quoting was used.
// TODO(jmarantz): in remove_quotes filter, switch between ' and " for
// quoting based on whatever is in the attr value.
if (!IsHtmlSpace(ch) &&
((ch > 127) || (ch < 32) || (ch == '"') || (ch == '\'') ||
(ch == '&') || (ch == '<') || (ch == '>'))) {
char_to_escape.clear();
char_to_escape += ch;
StringStringSparseHashMapSensitive::const_iterator p =
escape_map_.find(char_to_escape);
if (p == escape_map_.end()) {
StringAppendF(buf, "&#%02d;", static_cast<int>(ch));
} else {
*buf += '&';
*buf += p->second;
*buf += ';';
}
} else {
*buf += unescaped[i];
}
}
return StringPiece(*buf);
}
void HtmlKeywords::AddCrossProduct(const StringPiece& k1_list,
const StringPiece& k2_list,
KeywordPairVec* kmap) {
StringPieceVector v1, v2;
SplitStringPieceToVector(k1_list, " ", &v1, true);
SplitStringPieceToVector(k2_list, " ", &v2, true);
for (int i = 0, n1 = v1.size(); i < n1; ++i) {
HtmlName::Keyword k1 = HtmlName::Lookup(v1[i]);
DCHECK_NE(HtmlName::kNotAKeyword, k1) << v1[i];
for (int j = 0, n2 = v2.size(); j < n2; ++j) {
HtmlName::Keyword k2 = HtmlName::Lookup(v2[j]);
DCHECK_NE(HtmlName::kNotAKeyword, k2) << v2[j];
KeywordPair k1_k2 = MakeKeywordPair(k1, k2);
kmap->push_back(k1_k2);
}
}
}
void HtmlKeywords::AddToSet(const StringPiece& klist, KeywordVec* kset) {
StringPieceVector v;
SplitStringPieceToVector(klist, " ", &v, true);
for (int i = 0, n = v.size(); i < n; ++i) {
HtmlName::Keyword k = HtmlName::Lookup(v[i]);
DCHECK_NE(HtmlName::kNotAKeyword, k) << v[i];
kset->push_back(k);
}
}
namespace {
// Sorts the passed-in vector to enable binary_search. The vector is
// sorted by T::operator<. If in the future a binary search requires
// a custom comparator then this function should also be changed to
// take that comparator.
//
// vec must not be empty.
template<class T>
void PrepareForBinarySearch(std::vector<T>* vec) {
CHECK(!vec->empty());
std::sort(vec->begin(), vec->end());
// Make sure there are no duplicates
#ifndef NDEBUG
typename std::vector<T>::iterator p = std::unique(vec->begin(), vec->end());
if (p != vec->end()) {
T duplicate_value = *p;
LOG(DFATAL) << "Duplicate set element " << duplicate_value;
}
#endif
}
} // namespace
void HtmlKeywords::InitAutoClose() {
// The premise of our lookup machinery is that HtmlName::Keyword can
// be represented in a 16-bit int, so that we can make a pair using
// SHIFT+OR.
DCHECK_EQ(HtmlName::num_keywords(), HtmlName::num_keywords() & 0xffff);
// TODO(jmarantz): these deserve another pass through the HTML5 spec.
// Note that http://www.w3.org/TR/html5/syntax.html#optional-tags
// covers many of these cases, but omits the general situation that
// formatting elements should be automatically closed when they
// hit most other tags.
//
// However, there is discussion of relevance in and around:
// http://www.w3.org/TR/html5/the-end.html#misnested-tags:-b-i-b-i
AddAutoClose(kTableLeaves, kTableLeaves);
AddAutoClose(kTableLeaves, "tr");
AddAutoClose("tr", kTableSections);
AddAutoClose("tr", "tr");
AddAutoClose(kTableSections, kTableSections);
AddAutoClose("p", kParagraphTerminators);
AddAutoClose("li", "li");
AddAutoClose("dd dt", "dd dt");
AddAutoClose("rp rt", "rp rt");
AddAutoClose("optgroup", "optgroup");
AddAutoClose("option", "optgroup option");
AddAutoClose(kFormattingElements, StrCat("tr ", kListElements,
kDeclarationElements));
PrepareForBinarySearch(&auto_close_);
}
void HtmlKeywords::InitContains() {
// TODO(jmarantz): these deserve another pass through the HTML5 spec. Note
// that the HTML5 spec doesn't have a 'containment' section but there is
// discussion of the context in which tags can reside in the doc for each
// tag, and discussion of relevance in and around:
// http://www.w3.org/TR/html5/the-end.html#misnested-tags:-b-i-b-i
//
// Also see http://www.whatwg.org/specs/web-apps/current-work
// /multipage/syntax.html#optional-tags which describes auto-closing
// elements whose parents have no more content.
AddContained(kTableLeaves, "table");
AddContained("tr", "table");
AddContained(kTableSections, "table");
AddContained("li", "ul ol");
AddContained("dd dt", "dl");
AddContained("rt rp", "ruby");
AddContained(kFormattingElements, "td th");
PrepareForBinarySearch(&contained_);
}
// These tags do not need to be explicitly closed, but can be. See
// http://www.w3.org/TR/html5/syntax.html#optional-tags . Note that
// this is *not* consistent with
// http://www.w3schools.com/tags/tag_p.asp which claims that the <p>
// tag works the same in XHTML as HTML. This is clearly wrong since
// real XHTML has XML syntax which requires explicit closing tags.
//
// Note that we will close any of these tags without warning.
void HtmlKeywords::InitOptionallyClosedKeywords() {
AddToSet(kFormattingElements, &optionally_closed_);
AddToSet("body colgroup dd dt html optgroup option p", &optionally_closed_);
AddToSet(kListElements, &optionally_closed_);
AddToSet(kTableElements, &optionally_closed_);
PrepareForBinarySearch(&optionally_closed_);
}
bool HtmlKeywords::WritePre(StringPiece str, StringPiece style,
Writer* writer, MessageHandler* handler) {
GoogleString tag, escaped;
if (style.empty()) {
tag = "<pre>";
} else {
tag = StrCat("<pre style=\"", style, "\">");
}
bool ret = writer->Write(tag, handler);
ret &= writer->Write(HtmlKeywords::Escape(str, &escaped), handler);
ret &= writer->Write("</pre>\n", handler);
return ret;
}
} // namespace net_instaweb