| // Copyright 2006-2008, The Google Caja project. |
| // Modifications Copyright 2009 The Closure Library Authors. All Rights Reserved. |
| // All Rights Reserved |
| |
| /** |
| * @license Portions of this code are from the google-caja project, received by |
| * Google under the Apache license (http://code.google.com/p/google-caja/). |
| * All other code is Copyright 2009 Google, Inc. All Rights Reserved. |
| |
| // Copyright (C) 2006 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| */ |
| |
/**
 * @fileoverview An HTML SAX parser.
 *
 * Examples of usage of the {@code goog.string.html.HtmlParser}:
 * <pre>
 *   var handler = new MyCustomHtmlVisitorHandlerThatExtendsHtmlSaxHandler();
 *   var parser = new goog.string.html.HtmlParser();
 *   parser.parse(handler, '<html><a href="google.com">link found!</a></html>');
 * </pre>
 *
 * TODO(user, msamuel): validate sanitizer regex against the HTML5 grammar at
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
 * http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html
 *
 * @supported IE6, IE7, IE8, FF1.5, FF2, FF3, Chrome 3.0, Safari and Opera 10.
 */
| |
| goog.provide('goog.string.html.HtmlParser'); |
| goog.provide('goog.string.html.HtmlParser.EFlags'); |
| goog.provide('goog.string.html.HtmlParser.Elements'); |
| goog.provide('goog.string.html.HtmlParser.Entities'); |
| goog.provide('goog.string.html.HtmlSaxHandler'); |
| |
| |
/**
 * SAX-style HTML parser. Pass a {@code goog.string.html.HtmlSaxHandler} and an
 * HTML string to {@code parse}; the handler's callbacks are invoked as the
 * string is tokenized. The constructor itself initializes nothing.
 *
 * @constructor
 */
goog.string.html.HtmlParser = function() {};
| |
| |
/**
 * Named HTML entities that the parser can decode. Keys are the lower-case
 * entity names (the text between '&' and ';'); values are the characters they
 * decode to.
 * TODO(user): use {@code goog.string.htmlEncode} instead.
 * @enum {string}
 */
goog.string.html.HtmlParser.Entities = {
  lt: '<',
  gt: '>',
  amp: '&',
  // U+00A0 NO-BREAK SPACE. Spelled as a unicode escape because the legacy
  // octal form '\240' is a syntax error in strict-mode and module code.
  nbsp: '\u00a0',
  quot: '"',
  apos: '\''
};
| |
| |
/**
 * Per-element bit flags used internally by the parser. Each value is a single
 * bit so that {@code goog.string.html.HtmlParser.Elements} can OR several of
 * them together. Within this file, {@code parse} only tests {@code CDATA} and
 * {@code RCDATA}.
 * @enum {number}
 */
goog.string.html.HtmlParser.EFlags = {
  OPTIONAL_ENDTAG: 0x01,
  EMPTY: 0x02,
  CDATA: 0x04,
  RCDATA: 0x08,
  UNSAFE: 0x10,
  FOLDABLE: 0x20
};
| |
| |
/**
 * Whitelist of known elements. Maps a lower-case element name to the bitwise
 * OR of its {@code goog.string.html.HtmlParser.EFlags} (0 when it has none).
 * {@code parse} reports start/end tags only for names that are own properties
 * of this map.
 * @type {Object}
 */
goog.string.html.HtmlParser.Elements = (function() {
  // Short local aliases keep the table below readable.
  var flags = goog.string.html.HtmlParser.EFlags;
  var OPTIONAL_ENDTAG = flags.OPTIONAL_ENDTAG;
  var EMPTY = flags.EMPTY;
  var CDATA = flags.CDATA;
  var RCDATA = flags.RCDATA;
  var UNSAFE = flags.UNSAFE;
  var FOLDABLE = flags.FOLDABLE;

  // Flag combinations that occur more than once in the table.
  var EMPTY_UNSAFE = EMPTY | UNSAFE;
  var UNSAFE_CDATA = UNSAFE | CDATA;
  var OPTIONAL_UNSAFE_FOLDABLE = OPTIONAL_ENDTAG | UNSAFE | FOLDABLE;

  return {
    'a': 0,
    'abbr': 0,
    'acronym': 0,
    'address': 0,
    'applet': UNSAFE,
    'area': EMPTY,
    'b': 0,
    'base': EMPTY_UNSAFE,
    'basefont': EMPTY_UNSAFE,
    'bdo': 0,
    'big': 0,
    'blockquote': 0,
    'body': OPTIONAL_UNSAFE_FOLDABLE,
    'br': EMPTY,
    'button': 0,
    'caption': 0,
    'center': 0,
    'cite': 0,
    'code': 0,
    'col': EMPTY,
    'colgroup': OPTIONAL_ENDTAG,
    'dd': OPTIONAL_ENDTAG,
    'del': 0,
    'dfn': 0,
    'dir': 0,
    'div': 0,
    'dl': 0,
    'dt': OPTIONAL_ENDTAG,
    'em': 0,
    'fieldset': 0,
    'font': 0,
    'form': 0,
    'frame': EMPTY_UNSAFE,
    'frameset': UNSAFE,
    'h1': 0,
    'h2': 0,
    'h3': 0,
    'h4': 0,
    'h5': 0,
    'h6': 0,
    'head': OPTIONAL_UNSAFE_FOLDABLE,
    'hr': EMPTY,
    'html': OPTIONAL_UNSAFE_FOLDABLE,
    'i': 0,
    'iframe': UNSAFE_CDATA,
    'img': EMPTY,
    'input': EMPTY,
    'ins': 0,
    'isindex': EMPTY_UNSAFE,
    'kbd': 0,
    'label': 0,
    'legend': 0,
    'li': OPTIONAL_ENDTAG,
    'link': EMPTY_UNSAFE,
    'map': 0,
    'menu': 0,
    'meta': EMPTY_UNSAFE,
    'noframes': UNSAFE_CDATA,
    'noscript': UNSAFE_CDATA,
    'object': UNSAFE,
    'ol': 0,
    'optgroup': 0,
    'option': OPTIONAL_ENDTAG,
    'p': OPTIONAL_ENDTAG,
    'param': EMPTY_UNSAFE,
    'pre': 0,
    'q': 0,
    's': 0,
    'samp': 0,
    'script': UNSAFE_CDATA,
    'select': 0,
    'small': 0,
    'span': 0,
    'strike': 0,
    'strong': 0,
    'style': UNSAFE_CDATA,
    'sub': 0,
    'sup': 0,
    'table': 0,
    'tbody': OPTIONAL_ENDTAG,
    'td': OPTIONAL_ENDTAG,
    'textarea': RCDATA,
    'tfoot': OPTIONAL_ENDTAG,
    'th': OPTIONAL_ENDTAG,
    'thead': OPTIONAL_ENDTAG,
    'title': RCDATA | UNSAFE,
    'tr': OPTIONAL_ENDTAG,
    'tt': 0,
    'u': 0,
    'ul': 0,
    'var': 0
  };
})();
| |
| |
/**
 * Matches every '&' character.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.AMP_RE_ = /&/g;


/**
 * Matches a "loose" '&', i.e. one that does not start something shaped like
 * an entity: it is followed by neither a letter nor '#', or by a '#' that is
 * not followed by a digit or 'x', or by '#x' that is not followed by a hex
 * digit, or it sits at the end of the input. Whatever was consumed after the
 * '&' is captured in group 1 so a replacement can put it back.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.LOOSE_AMP_RE_ =
    /&([^a-z#]|#(?:[^0-9x]|x(?:[^0-9a-f]|$)|$)|$)/gi;


/**
 * Matches every '<' character.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.LT_RE_ = /</g;


/**
 * Matches every '>' character.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.GT_RE_ = />/g;


/**
 * Matches every double-quote character.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.QUOTE_RE_ = /\"/g;


/**
 * Matches every '=' character.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.EQUALS_RE_ = /=/g;


/**
 * Matches every NUL (U+0000) character.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.NULL_RE_ = /\0/g;


/**
 * Matches every entity reference: decimal ('&#123;'), hexadecimal ('&#x7b;')
 * or named ('&amp;'). Group 1 holds the text between the '&' and the ';'.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.ENTITY_RE_ = /&(#\d+|#x[0-9A-Fa-f]+|\w+);/g;


/**
 * Matches an entity name that is a decimal character reference ('#123').
 * Group 1 holds the digits.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.DECIMAL_ESCAPE_RE_ = /^#(\d+)$/;


/**
 * Matches an entity name that is a hexadecimal character reference ('#x7b').
 * Group 1 holds the hex digits.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.HEX_ESCAPE_RE_ = /^#x([0-9A-Fa-f]+)$/;
| |
| |
/**
 * Matches the next token while the parser is inside a tag, i.e. after the tag
 * name and before the closing '>'. Capture groups:
 *   1: attribute name;
 *   2: the whole '= value' part, set only when the attribute has a value;
 *   3: the attribute value, still quoted if it was quoted in the source;
 *   4: the end of the tag, '>' or '/>'.
 * Anything else that matches is cruft and is not captured.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.INSIDE_TAG_TOKEN_ = new RegExp(
    [
      // Skip leading whitespace without capturing it.
      '^\\s*(?:',
      // First alternative: an attribute, name in group 1 and value in group 3.
      // Group 2 records that a value was present at all, since interpreters
      // are inconsistent in whether a group that matches nothing is null,
      // undefined, or the empty string.
      '(?:',
      '([a-z][a-z-]*)',  // Group 1: attribute name.
      '(',               // Group 2: optional '= value'.
      '\\s*=\\s*',
      '(',               // Group 3: the value itself.
      // A double quoted string.
      '\"[^\"]*\"',
      // A single quoted string.
      '|\'[^\']*\'',
      // Positive lookahead so that in <foo bar= baz=boo> the value of bar is
      // blank rather than "baz=boo".
      '|(?=[a-z][a-z-]*\\s*=)',
      // An unquoted value. It cannot be an attribute name, because the
      // zero-width alternative above would have matched first.
      '|[^>\"\'\\s]*',
      ')',
      ')',
      '?',
      ')',
      // Second alternative: end of tag, captured in group 4.
      '|(/?>)',
      // Third alternative: cruft, not captured.
      '|[^a-z\\s>]+)'
    ].join(''),
    'i');
| |
| |
/**
 * Matches the next token while the parser is outside a tag. Capture groups:
 *   1: entity name (text between '&' and ';');
 *   2: '/' when the token opens a close tag;
 *   3: tag name;
 *   4: a run of plain text;
 *   5: a stray '<', '&' or '>'.
 * Comments, doctypes and processing instructions match without capturing.
 * @type {RegExp}
 * @private
 */
goog.string.html.HtmlParser.OUTSIDE_TAG_TOKEN_ = new RegExp(
    [
      '^(?:',
      // Entity, name captured in group 1.
      '&(\\#[0-9]+|\\#[x][0-9a-f]+|\\w+);',
      // Comments, doctypes and processing instructions; nothing captured.
      '|<[!]--[\\s\\S]*?-->|<!\\w[^>]*>|<\\?[^>*]*>',
      // Start of a tag: '/' in group 2 for close tags, name in group 3.
      '|<(/)?([a-z][a-z0-9]*)',
      // Text, captured in group 4.
      '|([^<&>]+)',
      // Cruft, captured in group 5.
      '|([<&>]))'
    ].join(''),
    'i');
| |
| |
/**
 * Given a SAX-like {@code goog.string.html.HtmlSaxHandler}, tokenizes
 * {@code htmlText} and reports its structure to the {@code handler} while
 * visiting the nodes.
 *
 * {@code startDoc}, {@code pcdata} and {@code endDoc} are always invoked;
 * {@code startTag}, {@code endTag}, {@code cdata} and {@code rcdata} are
 * invoked only when the handler defines them. Tag events are reported only
 * for tag names present in {@code goog.string.html.HtmlParser.Elements}.
 * Text handed to {@code pcdata} is in escaped form: entities are passed as
 * written and stray '<', '>' and '&' are passed as '&lt;', '&gt;' and
 * '&amp;'.
 *
 * The attribute array handed to {@code startTag} is reused and emptied after
 * the call, so a handler that wants to keep it must copy it.
 *
 * @param {goog.string.html.HtmlSaxHandler} handler The HtmlSaxHandler that will
 *     receive the events.
 * @param {string} htmlText The html text.
 */
goog.string.html.HtmlParser.prototype.parse = function(handler, htmlText) {
  var htmlLower = null;  // Lower-cased input, computed on first CDATA/RCDATA.
  var inTag = false;     // True iff we're currently processing a tag.
  var attribs = [];      // Accumulates attribute names and values.
  var tagName;           // The name of the tag currently being processed.
  var eflags;            // The element flags for the current tag.
  var openTag;           // True if the current tag is an open tag.

  // Lets the handler know that we are starting to parse the document.
  handler.startDoc();

  // Consumes tokens from the htmlText and stops once all tokens are processed.
  while (htmlText) {
    var regex = inTag ?
        goog.string.html.HtmlParser.INSIDE_TAG_TOKEN_ :
        goog.string.html.HtmlParser.OUTSIDE_TAG_TOKEN_;
    // Gets the next token
    var m = htmlText.match(regex);
    if (!m) {
      // The outside-tag pattern matches any non-empty input, so this only
      // happens inside an unterminated tag whose remaining input is nothing
      // but whitespace (e.g. '<a '). There is nothing left to report, so stop
      // instead of dereferencing null.
      break;
    }
    // And removes it from the string
    htmlText = htmlText.substring(m[0].length);

    // TODO(goto): cleanup this code breaking it into separate methods.
    if (inTag) {
      if (m[1]) {  // Attribute.
        // SetAttribute with uppercase names doesn't work on IE6.
        var attribName = goog.string.html.toLowerCase(m[1]);
        var decodedValue;
        if (m[2]) {
          var encodedValue = m[3];
          switch (encodedValue.charCodeAt(0)) {  // Strip quotes.
            case 34: case 39:
              encodedValue = encodedValue.substring(
                  1, encodedValue.length - 1);
              break;
          }
          decodedValue = this.unescapeEntities_(this.stripNULs_(encodedValue));
        } else {
          // Use name as value for valueless attribs, so
          //   <input type=checkbox checked>
          // gets attributes ['type', 'checkbox', 'checked', 'checked']
          decodedValue = attribName;
        }
        attribs.push(attribName, decodedValue);
      } else if (m[4]) {  // End of tag: '>' or '/>'.
        if (eflags !== void 0) {  // False if not in whitelist.
          if (openTag) {
            if (handler.startTag) {
              handler.startTag(/** @type {string} */ (tagName), attribs);
            }
          } else {
            if (handler.endTag) {
              handler.endTag(/** @type {string} */ (tagName));
            }
          }
        }

        // The content of CDATA/RCDATA elements is not tokenized: everything
        // up to the matching close tag (or the end of input) is handed over
        // in one piece.
        if (openTag && (eflags &
            (goog.string.html.HtmlParser.EFlags.CDATA |
             goog.string.html.HtmlParser.EFlags.RCDATA))) {
          if (htmlLower === null) {
            htmlLower = goog.string.html.toLowerCase(htmlText);
          } else {
            // Trim the cached copy to the same tail as htmlText. This relies
            // on lower-casing not having changed the length of the string.
            htmlLower = htmlLower.substring(
                htmlLower.length - htmlText.length);
          }
          var dataEnd = htmlLower.indexOf('</' + tagName);
          if (dataEnd < 0) {
            dataEnd = htmlText.length;
          }
          if (eflags & goog.string.html.HtmlParser.EFlags.CDATA) {
            if (handler.cdata) {
              handler.cdata(htmlText.substring(0, dataEnd));
            }
          } else if (handler.rcdata) {
            handler.rcdata(
                this.normalizeRCData_(htmlText.substring(0, dataEnd)));
          }
          htmlText = htmlText.substring(dataEnd);
        }

        tagName = eflags = openTag = void 0;
        attribs.length = 0;
        inTag = false;
      }
      // Any other match inside a tag is cruft and is dropped.
    } else {
      if (m[1]) {  // Entity.
        handler.pcdata(m[0]);
      } else if (m[3]) {  // Tag.
        openTag = !m[2];
        inTag = true;
        tagName = goog.string.html.toLowerCase(m[3]);
        eflags = goog.string.html.HtmlParser.Elements.hasOwnProperty(tagName) ?
            goog.string.html.HtmlParser.Elements[tagName] : void 0;
      } else if (m[4]) {  // Text.
        handler.pcdata(m[4]);
      } else if (m[5]) {  // Cruft.
        // A stray markup character is reported in escaped form, consistent
        // with entities, which reach pcdata still encoded.
        switch (m[5]) {
          case '<': handler.pcdata('&lt;'); break;
          case '>': handler.pcdata('&gt;'); break;
          default: handler.pcdata('&amp;'); break;
        }
      }
    }
  }

  // Lets the handler know that we are done parsing the document.
  handler.endDoc();
};
| |
| |
/**
 * Decodes an HTML entity.
 *
 * Named entities are looked up (case-insensitively) in
 * {@code goog.string.html.HtmlParser.Entities}. Numeric references of the
 * form '#123' or '#x7b' are converted to the corresponding character; code
 * points above U+FFFF are returned as a UTF-16 surrogate pair, and values
 * beyond U+10FFFF, which are not valid code points, yield U+FFFD.
 *
 * @param {string} name The content between the '&' and the ';'.
 * @return {string} The decoded character as a string, or the empty string
 *     when the entity is not recognized.
 * @private
 */
goog.string.html.HtmlParser.prototype.lookupEntity_ = function(name) {
  // TODO(goto): use {goog.string.htmlDecode} instead ?
  // TODO(goto): &pi; is different from &Pi;
  name = goog.string.html.toLowerCase(name);
  if (goog.string.html.HtmlParser.Entities.hasOwnProperty(name)) {
    return goog.string.html.HtmlParser.Entities[name];
  }

  var codePoint;
  var m = name.match(goog.string.html.HtmlParser.DECIMAL_ESCAPE_RE_);
  if (m) {
    codePoint = parseInt(m[1], 10);
  } else if (
      !!(m = name.match(goog.string.html.HtmlParser.HEX_ESCAPE_RE_))) {
    codePoint = parseInt(m[1], 16);
  } else {
    return '';
  }

  if (codePoint <= 0xFFFF) {
    return String.fromCharCode(codePoint);
  }
  if (codePoint > 0x10FFFF) {
    // Not a Unicode code point (this also covers digit strings so long that
    // parseInt yields Infinity).
    return '\uFFFD';
  }
  // String.fromCharCode keeps only the low 16 bits of each argument, so a
  // supplementary code point has to be split into its surrogate pair.
  codePoint -= 0x10000;
  return String.fromCharCode(
      0xD800 + (codePoint >> 10), 0xDC00 + (codePoint & 0x3FF));
};
| |
| |
/**
 * Returns a copy of {@code s} with every NUL (U+0000) character deleted.
 * @param {string} s The string to have the null characters removed.
 * @return {string} A string without null characters.
 * @private
 */
goog.string.html.HtmlParser.prototype.stripNULs_ = function(s) {
  var nulPattern = goog.string.html.HtmlParser.NULL_RE_;
  return s.replace(nulPattern, '');
};
| |
| |
/**
 * Returns the plain text of a chunk of HTML CDATA which possibly contains
 * entities, by replacing every entity reference with the result of
 * {@code lookupEntity_}.
 *
 * TODO(goto): use {@code goog.string.unescapeEntities} instead ?
 * @param {string} s A chunk of HTML CDATA. It must not start or end inside
 *     an HTML entity.
 * @return {string} The string with its entities unescaped.
 * @private
 */
goog.string.html.HtmlParser.prototype.unescapeEntities_ = function(s) {
  var self = this;
  return s.replace(
      goog.string.html.HtmlParser.ENTITY_RE_,
      // String.prototype.replace passes the whole match first and the capture
      // groups after it; lookupEntity_ wants group 1, the text between the
      // '&' and the ';', not the full '&name;' match.
      function(entity, name) {
        return self.lookupEntity_(name);
      });
};
| |
| |
/**
 * Escape entities in RCDATA that can be escaped without changing the meaning:
 * an '&' that does not start an entity becomes '&amp;', and every '<' and '>'
 * becomes '&lt;' and '&gt;'. An '&' that already starts something shaped like
 * an entity is left alone.
 * @param {string} rcdata The RCDATA string we want to normalize.
 * @return {string} A normalized version of RCDATA.
 * @private
 */
goog.string.html.HtmlParser.prototype.normalizeRCData_ = function(rcdata) {
  return rcdata.
      // $1 restores whatever the pattern consumed after the loose '&'.
      replace(goog.string.html.HtmlParser.LOOSE_AMP_RE_, '&amp;$1').
      replace(goog.string.html.HtmlParser.LT_RE_, '&lt;').
      replace(goog.string.html.HtmlParser.GT_RE_, '&gt;');
};
| |
| |
/**
 * Lower-cases a string. The native {@code toLowerCase} is used when it maps
 * 'SCRIPT' to 'script'; otherwise only the ASCII letters A-Z are converted,
 * by setting bit 5 of their character code.
 *
 * TODO(goto): why isn't this in the string package ? does this solves any
 * real problem ? move it to the goog.string package if it does.
 *
 * @param {string} str The string to lower case.
 * @return {string} The str in lower case format.
 */
goog.string.html.toLowerCase = function(str) {
  // This may not hold on browsers in the Turkish locale.
  var nativeHandlesAscii = 'SCRIPT'.toLowerCase() === 'script';
  if (!nativeHandlesAscii) {
    return str.replace(/[A-Z]/g, function(upper) {
      return String.fromCharCode(upper.charCodeAt(0) | 32);
    });
  }
  return str.toLowerCase();
};
| |
| |
/**
 * Visitor interface for {@code goog.string.html.HtmlParser}: the parser calls
 * these methods while it walks through the HTML. Every method is abstract and
 * is meant to be overridden by a concrete handler.
 *
 * @constructor
 */
goog.string.html.HtmlSaxHandler = function() {};


/**
 * Called when the end of an opening tag has been reached. The parser reports
 * only tags listed in {@code goog.string.html.HtmlParser.Elements}.
 * @param {string} name The lower-cased name of the tag that is starting.
 * @param {Array.<string>} attributes The attributes of the tag as a flat list
 *     of alternating names and values: [name0, value0, name1, value1, ...].
 *     The parser empties this array after the call returns, so copy it if it
 *     has to be kept.
 */
goog.string.html.HtmlSaxHandler.prototype.startTag = goog.abstractMethod;


/**
 * Called when the end of a closing tag has been reached. The parser reports
 * only tags listed in {@code goog.string.html.HtmlParser.Elements}.
 * @param {string} name The lower-cased name of the tag that is ending.
 */
goog.string.html.HtmlSaxHandler.prototype.endTag = goog.abstractMethod;


/**
 * Called for content found outside of tags: runs of text, entity references
 * (passed as written in the input) and stray markup characters.
 * @param {string} text The PCDATA text found.
 */
goog.string.html.HtmlSaxHandler.prototype.pcdata = goog.abstractMethod;


/**
 * Called with the content of an element flagged as RCDATA, after the parser
 * has normalized it.
 * @param {string} text The RCDATA text found.
 */
goog.string.html.HtmlSaxHandler.prototype.rcdata = goog.abstractMethod;


/**
 * Called with the content of an element flagged as CDATA, taken from the
 * input exactly as written.
 * @param {string} text The CDATA text found.
 */
goog.string.html.HtmlSaxHandler.prototype.cdata = goog.abstractMethod;


/**
 * Called once, before the parser consumes the first token of the document.
 */
goog.string.html.HtmlSaxHandler.prototype.startDoc = goog.abstractMethod;


/**
 * Called once, after the parser has consumed the whole document.
 */
goog.string.html.HtmlSaxHandler.prototype.endDoc = goog.abstractMethod;