blob: f6327bf75557fabfac5b945464c75c7bddf107eb [file] [log] [blame]
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Copyright 2006, Google Inc. All rights reserved.
// Author: mec@google.com (Michael Chastain)
#include <ctype.h>
#include <string.h>
#include "webutil/html/htmltagindex.h"
#include "base/basictypes.h"
#include "base/logging.h"
#include "base/paranoid.h"
#include "strings/ascii_ctype.h"
#include "util/gtl/dense_hash_map.h"
#include "util/hash/hash.h"
// Assert character properties that the fast path depends on.
// (Uppercase letter | 0x20) == lowercase letter
// All digits have 0x20 bit set
// Several special chars have 0x20 bit set
COMPILE_ASSERT(('A'|0x20)=='a', A_a);
COMPILE_ASSERT(('B'|0x20)=='b', B_b);
COMPILE_ASSERT(('C'|0x20)=='c', C_c);
COMPILE_ASSERT(('D'|0x20)=='d', D_d);
COMPILE_ASSERT(('E'|0x20)=='e', E_e);
COMPILE_ASSERT(('F'|0x20)=='f', F_f);
COMPILE_ASSERT(('G'|0x20)=='g', G_g);
COMPILE_ASSERT(('H'|0x20)=='h', H_h);
COMPILE_ASSERT(('I'|0x20)=='i', I_i);
COMPILE_ASSERT(('J'|0x20)=='j', J_j);
COMPILE_ASSERT(('K'|0x20)=='k', K_k);
COMPILE_ASSERT(('L'|0x20)=='l', L_l);
COMPILE_ASSERT(('M'|0x20)=='m', M_m);
COMPILE_ASSERT(('N'|0x20)=='n', N_n);
COMPILE_ASSERT(('O'|0x20)=='o', O_o);
COMPILE_ASSERT(('P'|0x20)=='p', P_p);
COMPILE_ASSERT(('Q'|0x20)=='q', Q_q);
COMPILE_ASSERT(('R'|0x20)=='r', R_r);
COMPILE_ASSERT(('S'|0x20)=='s', S_s);
COMPILE_ASSERT(('T'|0x20)=='t', T_t);
COMPILE_ASSERT(('U'|0x20)=='u', U_u);
COMPILE_ASSERT(('V'|0x20)=='v', V_v);
COMPILE_ASSERT(('W'|0x20)=='w', W_w);
COMPILE_ASSERT(('X'|0x20)=='x', X_x);
COMPILE_ASSERT(('Y'|0x20)=='y', Y_y);
COMPILE_ASSERT(('Z'|0x20)=='z', Z_z);
COMPILE_ASSERT('0'&0x20, d0);
COMPILE_ASSERT('1'&0x20, d1);
COMPILE_ASSERT('2'&0x20, d2);
COMPILE_ASSERT('3'&0x20, d3);
COMPILE_ASSERT('4'&0x20, d4);
COMPILE_ASSERT('5'&0x20, d5);
COMPILE_ASSERT('6'&0x20, d6);
COMPILE_ASSERT('7'&0x20, d7);
COMPILE_ASSERT('8'&0x20, d8);
COMPILE_ASSERT('9'&0x20, d9);
COMPILE_ASSERT('!'&0x20, cBang);
COMPILE_ASSERT('-'&0x20, cDash);
COMPILE_ASSERT('?'&0x20, cQuestion);
// Ctor.
HtmlTagIndex::HtmlTagIndex()
: case_sensitive_fixed_(false),
index_max_(kHtmlTagBuiltinMax),
custom_tag_map_(NULL) {
this->SetCaseSensitive(false);
}
// Dtor.
HtmlTagIndex::~HtmlTagIndex() {
}
// Case sensitivity.
void HtmlTagIndex::SetCaseSensitive(bool case_sensitive) {
CHECK(!case_sensitive_fixed_);
case_sensitive_ = SanitizeBool(case_sensitive);
if (case_sensitive_) {
case_mask_1_ = case_mask_2_ = case_mask_3_ = case_mask_4_ = 0;
case_mask_5_ = case_mask_6_ = case_mask_7_ = case_mask_8_ = 0;
} else {
case_mask_1_ = 0x20;
case_mask_2_ = 0x2020;
case_mask_3_ = 0x202020;
case_mask_4_ = 0x20202020;
case_mask_5_ = 0x2020202020ull;
case_mask_6_ = 0x202020202020ull;
case_mask_7_ = 0x20202020202020ull;
case_mask_8_ = 0x2020202020202020ull;
}
}
// Return a case-aware string. If case-sensitive, this is the same string.
// If case-insensitive, this is the lower-case version.
static string CaseAwareString(bool case_sensitive,
const char* tag,
int length) {
CHECK_GE(length, 0);
string str;
if (case_sensitive) {
str.assign(tag, length);
} else {
for (int i = 0; i < length; ++i) {
str += ascii_tolower(tag[i]);
}
}
return str;
}
// Add a tag.
int HtmlTagIndex::AddHtmlTag(const char* tag, int length) {
// No more changing the case sensitivity.
case_sensitive_fixed_ = true;
// Look for existing tag.
const int tag_id = FindHtmlTag(tag, length);
if (tag_id != kHtmlTagUnknown)
return tag_id;
// Add to the custom table.
if (custom_tag_map_.get() == NULL) {
custom_tag_map_.reset(new CustomTagMap);
custom_tag_map_->set_empty_key(string(""));
}
string tag_copy(CaseAwareString(case_sensitive_, tag, length));
(*custom_tag_map_)[tag_copy] = index_max_;
return index_max_++;
}
// Find a tag.
// This is hard-wired to go fast.
int HtmlTagIndex::FindHtmlTag(const char* tag, int length) const {
// These macros convert a string to a uint32 or uint64
#define S1(s) ((s)[0])
#define S2(s) (S1(s) | ((s)[1] << 8))
#define S3(s) (S2(s) | ((s)[2] << 16))
#define S4(s) (S3(s) | ((s)[3] << 24))
#define S5(s) (S4(s) | (static_cast<uint64>((s)[4]) << 32))
#define S6(s) (S5(s) | (static_cast<uint64>((s)[5]) << 40))
#define S7(s) (S6(s) | (static_cast<uint64>((s)[6]) << 48))
#define S8(s) (S7(s) | (static_cast<uint64>((s)[7]) << 56))
// These macros convert a sequence of characters to a uint32 or uint64
#define C1(c0) (c0)
#define C2(c0, c1) (C1(c0) | ((c1) << 8))
#define C3(c0, c1, c2) (C2(c0, c1) | ((c2) << 16))
#define C4(c0, c1, c2, c3) (C3(c0, c1, c2) | ((c3) << 24))
#define C5(c0, c1, c2, c3, c4) \
(C4(c0, c1, c2, c3) | (static_cast<uint64>(c4) << 32))
#define C6(c0, c1, c2, c3, c4, c5) \
(C5(c0, c1, c2, c3, c4) | (static_cast<uint64>(c5) << 40))
#define C7(c0, c1, c2, c3, c4, c5, c6) \
(C6(c0, c1, c2, c3, c4, c5) | (static_cast<uint64>(c6) << 48))
#define C8(c0, c1, c2, c3, c4, c5, c6, c7) \
(C7(c0, c1, c2, c3, c4, c5, c6) | (static_cast<uint64>(c7) << 56))
switch (length) {
case 0:
// Empty tag
return kHtmlTagZeroLength;
case 1:
{
switch (S1(tag)|case_mask_1_) {
default: break;
// From html 4.01 spec
case C1('a'): return kHtmlTagA;
case C1('b'): return kHtmlTagB;
case C1('i'): return kHtmlTagI;
case C1('p'): return kHtmlTagP;
case C1('q'): return kHtmlTagQ;
case C1('s'): return kHtmlTagS;
case C1('u'): return kHtmlTagU;
}
}
break;
case 2:
{
switch (S2(tag)|case_mask_2_) {
default: break;
// From html 4.01 spec
case C2('b','r'): return kHtmlTagBr;
case C2('d','d'): return kHtmlTagDd;
case C2('d','l'): return kHtmlTagDl;
case C2('d','t'): return kHtmlTagDt;
case C2('e','m'): return kHtmlTagEm;
// Beware of matching "h\x11" or "H\x11" in case-insensitive mode.
case C2('h','1'): if (tag[1] == '1') return kHtmlTagH1; else break;
case C2('h','2'): if (tag[1] == '2') return kHtmlTagH2; else break;
case C2('h','3'): if (tag[1] == '3') return kHtmlTagH3; else break;
case C2('h','4'): if (tag[1] == '4') return kHtmlTagH4; else break;
case C2('h','5'): if (tag[1] == '5') return kHtmlTagH5; else break;
case C2('h','6'): if (tag[1] == '6') return kHtmlTagH6; else break;
case C2('h','r'): return kHtmlTagHr;
case C2('l','i'): return kHtmlTagLi;
case C2('o','l'): return kHtmlTagOl;
case C2('t','d'): return kHtmlTagTd;
case C2('t','h'): return kHtmlTagTh;
case C2('t','r'): return kHtmlTagTr;
case C2('t','t'): return kHtmlTagTt;
case C2('u','l'): return kHtmlTagUl;
}
}
break;
case 3:
{
switch (S3(tag)|case_mask_3_) {
default: break;
// From html 4.01 spec
case C3('b','d','o'): return kHtmlTagBdo;
case C3('b','i','g'): return kHtmlTagBig;
case C3('c','o','l'): return kHtmlTagCol;
case C3('d','e','l'): return kHtmlTagDel;
case C3('d','i','r'): return kHtmlTagDir;
case C3('d','i','v'): return kHtmlTagDiv;
case C3('d','f','n'): return kHtmlTagDfn;
case C3('i','m','g'): return kHtmlTagImg;
case C3('i','n','s'): return kHtmlTagIns;
case C3('k','b','d'): return kHtmlTagKbd;
case C3('m','a','p'): return kHtmlTagMap;
case C3('p','r','e'): return kHtmlTagPre;
case C3('s','u','b'): return kHtmlTagSub;
case C3('s','u','p'): return kHtmlTagSup;
case C3('v','a','r'): return kHtmlTagVar;
case C3('w','b','r'): return kHtmlTagWbr;
case C3('x','m','p'): return kHtmlTagXmp;
// Used in repository/lexer/html_lexer.cc
case C3('!','-','-'):
// These chars have no upper-lower case form so I haev to rematch.
if (S3(tag) == C3('!','-','-'))
return kHtmlTagBangDashDash;
break;
}
}
break;
case 4:
{
switch (S4(tag)|case_mask_4_) {
default: break;
// From html 4.01 spec
case C4('a','b','b','r'): return kHtmlTagAbbr;
case C4('a','r','e','a'): return kHtmlTagArea;
case C4('b','a','s','e'): return kHtmlTagBase;
case C4('b','o','d','y'): return kHtmlTagBody;
case C4('c','i','t','e'): return kHtmlTagCite;
case C4('c','o','d','e'): return kHtmlTagCode;
case C4('f','o','n','t'): return kHtmlTagFont;
case C4('f','o','r','m'): return kHtmlTagForm;
case C4('h','e','a','d'): return kHtmlTagHead;
case C4('h','t','m','l'): return kHtmlTagHtml;
case C4('l','i','n','k'): return kHtmlTagLink;
case C4('m','e','n','u'): return kHtmlTagMenu;
case C4('m','e','t','a'): return kHtmlTagMeta;
case C4('s','a','m','p'): return kHtmlTagSamp;
case C4('s','p','a','n'): return kHtmlTagSpan;
case C4('n','o','b','r'): return kHtmlTagNobr;
}
}
break;
case 5:
{
switch (S5(tag)|case_mask_5_) {
default: break;
// From html 4.01 spec
case C5('f','r','a','m','e'): return kHtmlTagFrame;
case C5('i','n','p','u','t'): return kHtmlTagInput;
case C5('l','a','b','e','l'): return kHtmlTagLabel;
case C5('p','a','r','a','m'): return kHtmlTagParam;
case C5('s','m','a','l','l'): return kHtmlTagSmall;
case C5('s','t','y','l','e'): return kHtmlTagStyle;
case C5('t','a','b','l','e'): return kHtmlTagTable;
case C5('t','b','o','d','y'): return kHtmlTagTbody;
case C5('t','f','o','o','t'): return kHtmlTagTfoot;
case C5('t','h','e','a','d'): return kHtmlTagThead;
case C5('t','i','t','l','e'): return kHtmlTagTitle;
// Used in repository/lexer/html_lexer.cc
case C5('b','l','i','n','k'): return kHtmlTagBlink;
// Used in repository/parsers/base/handler-parser.cc
case C5('e','m','b','e','d'): return kHtmlTagEmbed;
case C5('i','m','a','g','e'): return kHtmlTagImage;
// From Netscape Navigator 4.0
case C5('l','a','y','e','r'): return kHtmlTagLayer;
}
}
break;
case 6:
{
switch (S6(tag)|case_mask_6_) {
default: break;
// From html 4.01 spec
case C6('a','p','p','l','e','t'): return kHtmlTagApplet;
case C6('b','u','t','t','o','n'): return kHtmlTagButton;
case C6('c','e','n','t','e','r'): return kHtmlTagCenter;
case C6('i','f','r','a','m','e'): return kHtmlTagIframe;
case C6('l','e','g','e','n','d'): return kHtmlTagLegend;
case C6('o','b','j','e','c','t'): return kHtmlTagObject;
case C6('o','p','t','i','o','n'): return kHtmlTagOption;
case C6('s','c','r','i','p','t'): return kHtmlTagScript;
case C6('s','e','l','e','c','t'): return kHtmlTagSelect;
case C6('s','t','r','i','k','e'): return kHtmlTagStrike;
case C6('s','t','r','o','n','g'): return kHtmlTagStrong;
case C6('s','p','a','c','e','r'): return kHtmlTagSpacer;
// From Netscape Navigator 4.0
case C6('i','l','a','y','e','r'): return kHtmlTagIlayer;
case C6('k','e','y','g','e','n'): return kHtmlTagKeygen;
case C6('s','e','r','v','e','r'): return kHtmlTagServer;
}
}
break;
case 7:
{
switch (S7(tag)|case_mask_7_) {
default: break;
// From html 4.01 spec
case C7('a','c','r','o','n','y','m'): return kHtmlTagAcronym;
case C7('a','d','d','r','e','s','s'): return kHtmlTagAddress;
case C7('c','a','p','t','i','o','n'): return kHtmlTagCaption;
case C7('i','s','i','n','d','e','x'): return kHtmlTagIsindex;
// Used in repository/parsers/base/handler-parser.cc
case C7('m','a','r','q','u','e','e'): return kHtmlTagMarquee;
case C7('b','g','s','o','u','n','d'): return kHtmlTagBgsound;
case C7('l','i','s','t','i','n','g'): return kHtmlTagListing;
case C7('n','o','e','m','b','e','d'): return kHtmlTagNoembed;
// From Netscape Navigator 4.0
case C7('n','o','l','a','y','e','r'): return kHtmlTagNolayer;
// Legacy tag used mostly by Russian sites
case C7('n','o','i','n','d','e','x'): return kHtmlTagNoindex;
}
}
break;
case 8:
{
switch (S8(tag)|case_mask_8_) {
default: break;
// From html 4.01 spec
case C8('b','a','s','e','f','o','n','t'): return kHtmlTagBasefont;
case C8('c','o','l','g','r','o','u','p'): return kHtmlTagColgroup;
case C8('f','i','e','l','d','s','e','t'): return kHtmlTagFieldset;
case C8('f','r','a','m','e','s','e','t'): return kHtmlTagFrameset;
case C8('n','o','f','r','a','m','e','s'): return kHtmlTagNoframes;
case C8('n','o','s','c','r','i','p','t'): return kHtmlTagNoscript;
case C8('o','p','t','g','r','o','u','p'): return kHtmlTagOptgroup;
case C8('t','e','x','t','a','r','e','a'): return kHtmlTagTextarea;
// From Netscape Navigator 4.0
case C8('m','u','l','t','i','c','o','l'): return kHtmlTagMulticol;
}
}
break;
case 9:
{
if ((S4(tag+0)|case_mask_4_) == C4('p','l','a','i') &&
(S4(tag+4)|case_mask_4_) == C4('n','t','e','x') &&
(S1(tag+8)|case_mask_1_) == C1('t'))
return kHtmlTagPlaintext;
}
break;
case 10:
{
// From html 4.01 spec
if ((S4(tag+0)|case_mask_4_) == C4('b','l','o','c') &&
(S4(tag+4)|case_mask_4_) == C4('k','q','u','o') &&
(S2(tag+8)|case_mask_2_) == C2('t','e'))
return kHtmlTagBlockquote;
}
break;
}
// !doctype is special. Any tag name starting with !doctype (no need for
// exact match) is considered !doctype. Tested on IE 7.0 and Firefox 2.0.
// You can also refer to (ongoing work):
// http://whatwg.org/specs/web-apps/current-work/#markup
if (length >= 8 && S1(tag) == C1('!') &&
(S8(tag)|case_mask_8_) == C8('!','d','o','c','t','y','p','e'))
return kHtmlTagBangDoctype;
// Otherwise, !blahblah and ?blahblah are comments.
if (S1(tag) == C1('!') || S1(tag) == C1('?'))
return kHtmlTagBogusComment;
#undef C8
#undef C7
#undef C6
#undef C5
#undef C4
#undef C3
#undef C2
#undef C1
#undef S8
#undef S7
#undef S6
#undef S5
#undef S4
#undef S3
#undef S2
#undef S1
// Look in the custom table.
if (custom_tag_map_.get() != NULL) {
string tag_copy(CaseAwareString(case_sensitive_, tag, length));
CustomTagMap::const_iterator it = custom_tag_map_->find(tag_copy);
if (it != custom_tag_map_->end()) {
return it->second;
}
}
// Unknown tag.
return kHtmlTagUnknown;
}