// blob: 10155c1fcc84865b909a3b6849500dbd110c108a [file] [log] [blame]
// Copyright 2014 The Closure Library Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS-IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* @fileoverview
* An HTML sanitizer that takes untrusted HTML snippets and produces
* safe HTML by filtering/rewriting tags and attributes that contain
* high-privilege instructions.
*/
goog.provide('goog.labs.html.Sanitizer');
goog.require('goog.asserts');
goog.require('goog.html.SafeUrl');
goog.require('goog.labs.html.attributeRewriterPresubmitWorkaround');
goog.require('goog.labs.html.scrubber');
goog.require('goog.object');
goog.require('goog.string');
/**
 * A sanitizer that converts untrusted, messy HTML into more regular HTML
 * that cannot abuse high-authority constructs like the ability to execute
 * arbitrary JavaScript.
 *
 * A new instance starts with a policy that holds only the wildcard
 * {@code "*"} attribute list; elements and attributes are added through
 * {@code allowElements} and {@code allowAttributes}.
 *
 * @constructor
 */
goog.labs.html.Sanitizer = function() {
  // Build the policy in a local first, then publish it on the instance.
  var elementPolicies = goog.labs.html.Sanitizer.createBlankObject_();
  // The wildcard entry always exists so that attributes can be allowed on
  // "*" without a preceding allowElements call.
  elementPolicies['*'] = goog.labs.html.Sanitizer.createBlankObject_();

  /**
   * Maps the lower-case names of allowed elements to attribute white-lists.
   * An attribute white-list maps lower-case attribute names to rewriter
   * functions that take a value and return either a value or
   * null/undefined to disallow the attribute.
   *
   * The special element name {@code "*"} contains a white-list of attributes
   * allowed on any tag, which is useful for attributes like {@code title} and
   * {@code id} which are widely available with element-agnostic meanings.
   * It should not be used for attributes like {@code type} whose meaning
   * differs based on the element on which it appears:
   * e.g. {@code <input type=text>} vs {@code <style type=text/css>}.
   *
   * @type {!Object<string, !Object<string, goog.labs.html.AttributeRewriter>>}
   * @private
   */
  this.whitelist_ = elementPolicies;

  // To use the sanitizer, we build inputs for the scrubber.  Those inputs
  // are invalidated by changes to the policy, so they are (re)built lazily.

  /**
   * Set of allowed element names (name -> {@code true}) handed to the
   * scrubber so that it does not have to do own-property checks for every
   * tag it filters.
   *
   * {@code null} means "not built yet"; it is built on demand by
   * {@code sanitize} and reset to {@code null} by {@code allowElements}.
   *
   * @type {Object<string, boolean>}
   * @private
   */
  this.allowedElementSet_ = null;
};
// TODO(user): Should the return type be goog.html.SafeHtml?
// If we receive a safe HTML string as input, should we simply rebalance
// tags?
/**
 * Produces a string of safe HTML containing all and only the safe
 * text-nodes and elements of the input.
 *
 * <p>
 * For the purposes of this function, "safe" is defined thus:
 * <ul>
 *   <li>Contains only elements explicitly allowed via {@code this.allow*}.
 *   <li>Contains only attributes explicitly allowed via {@code this.allow*}
 *       and having had all relevant transformations applied.
 *   <li>Contains an end tag for all and only non-void open tags.
 *   <li>Tags nest per XHTML rules.
 *   <li>Tags do not nest beyond a finite but fairly large level.
 * </ul>
 *
 * <p>
 * The actual filtering is delegated to {@code goog.labs.html.scrubber.scrub};
 * this method only prepares its inputs from the current policy.
 *
 * @param {!string} unsafeHtml A string of HTML which need not originate with
 *     a trusted source.  It is coerced to a string before use.
 * @return {!string} A string of HTML that contains only tags and attributes
 *     explicitly allowed by this sanitizer, and with end tags for all and only
 *     non-void elements.
 */
goog.labs.html.Sanitizer.prototype.sanitize = function(unsafeHtml) {
  // Coerce first so that the scrubber always receives a primitive string.
  var html = '' + unsafeHtml;

  /**
   * @type {!Object<string, !Object<string, goog.labs.html.AttributeRewriter>>}
   */
  var policy = this.whitelist_;

  var allowedElements = this.allowedElementSet_;
  if (!allowedElements) {
    // Rebuild the cached set from the policy's keys.  This can put '*' into
    // the set, but the scrubber will not parse "<*" as a tag beginning.
    allowedElements = goog.object.createSet(goog.object.getKeys(policy));
    this.allowedElementSet_ = allowedElements;
  }

  return goog.labs.html.scrubber.scrub(allowedElements, policy, html);
};
/**
 * Adds the given element names to the white-list of elements that may
 * appear in the safe HTML output.
 * <p>
 * Allowing an element does not, by itself, allow any attributes on it;
 * a newly allowed element starts with an empty attribute white-list.
 * Re-allowing an element that is already allowed keeps its existing
 * attribute white-list.
 *
 * @param {...!string} var_args element names that should be allowed in the
 *     safe HTML output.  Names are lower-cased before being stored.
 * @return {!goog.labs.html.Sanitizer} {@code this}.
 */
goog.labs.html.Sanitizer.prototype.allowElements = function(var_args) {
  var hasOwn = Object.prototype.hasOwnProperty;
  var elementPolicies = this.whitelist_;

  // The cached element set is derived from the white-list, so drop it; it
  // is rebuilt lazily by sanitize().
  this.allowedElementSet_ = null;

  for (var i = 0, count = arguments.length; i < count; ++i) {
    var name = arguments[i].toLowerCase();
    goog.asserts.assert(
        goog.labs.html.Sanitizer.isValidHtmlName_(name), name);
    if (hasOwn.call(elementPolicies, name)) {
      // Already allowed: keep whatever attributes were white-listed before.
      continue;
    }
    elementPolicies[name] = goog.labs.html.Sanitizer.createBlankObject_();
  }
  return this;
};
/**
 * Allows in the sanitized output
 * <tt>&lt;<i>element</i> <i>attr</i>="..."&gt;</tt>
 * when <i>element</i> is in {@code elementNames} and
 * <i>attr</i> is in {@code attrNames}.
 *
 * If specified, {@code opt_rewriteValue} is a function that takes the
 * HTML-entity-decoded attribute value, and can choose to disallow the
 * attribute by returning {@code null} or substitute a new value
 * by returning a string with the new value.
 *
 * @param {!Array<string>|string} elementNames names (or name) on which the
 *     attributes are allowed.
 *
 *     Element names must be allowed via {@code allowElements(...)} prior
 *     to white-listing attributes; otherwise this method throws.
 *
 *     The special element name {@code "*"} has the same meaning as in CSS
 *     selectors: it can be used to white-list attributes like {@code title}
 *     and {@code id} which are widely available with element-agnostic
 *     meanings.
 *
 *     It should not be used for attributes like {@code type} whose meaning
 *     differs based on the element on which it appears:
 *     e.g. {@code <input type=text>} vs {@code <style type=text/css>}.
 *
 * @param {!Array<string>|string} attrNames names (or name) of the attribute
 *     that should be allowed.
 *
 * @param {goog.labs.html.AttributeRewriter=} opt_rewriteValue A function
 *     that receives the HTML-entity-decoded attribute value and can return
 *     {@code null} to disallow the attribute entirely or the value for the
 *     attribute as a string.
 *     <p>
 *     The default is the identity function ({@code function(x){return x}}),
 *     and the value rewriter is composed with an attribute specific handler
 *     that runs on the rewriter's output:
 *     <table>
 *       <tr>
 *         <th>href, src</th>
 *         <td>The value must be accepted by {@code goog.html.SafeUrl};
 *             otherwise the attribute is dropped.
 *       </tr>
 *       <tr>
 *         <th>style, on*</th>
 *         <td>Always dropped, regardless of {@code opt_rewriteValue}.
 *       </tr>
 *     </table>
 *     Allowing the same attribute more than once chains the new rewriter in
 *     front of the one already registered, so every rewriter must accept the
 *     value.
 *
 * @return {!goog.labs.html.Sanitizer} {@code this}.
 * @throws {Error} if a name in {@code elementNames} has not been allowed via
 *     {@code allowElements} first.  Attributes registered for earlier names
 *     in the same call are kept.
 */
goog.labs.html.Sanitizer.prototype.allowAttributes =
    function(elementNames, attrNames, opt_rewriteValue) {
  // Accept a single name as shorthand for a one-element list.
  if (!goog.isArray(elementNames)) {
    elementNames = [elementNames];
  }
  if (!goog.isArray(attrNames)) {
    attrNames = [attrNames];
  }
  goog.asserts.assert(
      !opt_rewriteValue || 'function' === typeof opt_rewriteValue,
      'opt_rewriteValue should be a function');
  var whitelist = this.whitelist_;
  for (var ei = 0; ei < elementNames.length; ++ei) {
    var elementName = elementNames[ei].toLowerCase();
    goog.asserts.assert(
        goog.labs.html.Sanitizer.isValidHtmlName_(elementName) ||
        '*' === elementName,
        elementName);
    // If the element has not been white-listed then panic.
    // TODO(user): allow allow{Elements,Attributes} to be called in any
    // order if someone needs it.
    if (!Object.prototype.hasOwnProperty.call(whitelist, elementName)) {
      // Say what went wrong and how to fix it; the bare element name alone
      // is not a usable diagnostic.
      throw new Error(
          'Cannot allow attributes on element "' + elementName +
          '": allow the element via allowElements() first');
    }
    var attrWhitelist = whitelist[elementName];
    for (var ai = 0, an = attrNames.length; ai < an; ++ai) {
      var attrName = attrNames[ai].toLowerCase();
      goog.asserts.assert(
          goog.labs.html.Sanitizer.isValidHtmlName_(attrName), attrName);
      // If the value has already been allowed, then chain the rewriters
      // so that both white-listers' concerns are met.
      // We do not use the default rewriter here since it should have
      // been introduced by the call that created the initial white-list
      // entry.
      attrWhitelist[attrName] = goog.labs.html.Sanitizer.chain_(
          opt_rewriteValue || goog.labs.html.Sanitizer.valueIdentity_,
          Object.prototype.hasOwnProperty.call(attrWhitelist, attrName) ?
              attrWhitelist[attrName] :
              goog.labs.html.Sanitizer.defaultRewriterForAttr_(attrName));
    }
  }
  return this;
};
/**
 * Creates a new object that is as blank as possible.
 *
 * Where {@code Object.create} is available the result has no prototype.
 * That speeds up whitelist access, since there are fewer prototypes to fall
 * back to in the common case where an element is not in the white-list, and
 * it reduces the chance of confusing a member of {@code Object.prototype}
 * with a whitelist entry.  Where it is not available the result is an
 * ordinary empty object.
 *
 * @return {!Object<string, ?>} a reference to a newly allocated object that
 *     does not alias any reference that existed prior.
 * @private
 */
goog.labs.html.Sanitizer.createBlankObject_ = function() {
  // Object(null) yields a plain {} on engines that lack Object.create.
  var factory = Object.create ? Object.create : Object;
  return factory(null);
};
/**
 * Decides whether a name may be used as a white-list key.
 *
 * HTML element and attribute names may be almost arbitrary strings, but the
 * sanitizer is more restrictive as to what can be white-listed.  Since HTML
 * is case-insensitive, only lower-case identifiers are accepted: a lower-case
 * ASCII letter followed by any number of lower-case ASCII letters, digits,
 * hyphens and colons.
 *
 * @param {string} name
 * @return {boolean} true iff name is a valid white-list key.
 * @private
 */
goog.labs.html.Sanitizer.isValidHtmlName_ = function(name) {
  // Reject non-strings up front so they are never coerced by the regex test.
  if ('string' !== typeof name) {
    return false;
  }
  return /^[a-z][a-z0-9\-:]*$/.test(name);
};
/**
 * The default attribute rewriter: passes the value through unchanged.
 *
 * {@code chain_} compares rewriters against this function by identity so
 * that chaining with it adds no wrapper.
 *
 * @param {goog.labs.html.AttributeValue} x
 * @return {goog.labs.html.AttributeValue} {@code x} itself.
 * @private
 */
goog.labs.html.Sanitizer.valueIdentity_ = function(x) {
return x;
};
/**
 * An attribute rewriter that rejects every value.
 *
 * Returned by {@code defaultRewriterForAttr_} for {@code style} and
 * {@code on*} attributes.  {@code chain_} compares rewriters against this
 * function by identity, so any chain that includes it is this function.
 *
 * @param {goog.labs.html.AttributeValue} x ignored.
 * @return {null} always {@code null}, meaning the attribute is disallowed.
 * @private
 */
goog.labs.html.Sanitizer.disallow_ = function(x) {
return null;
};
/**
 * Chains two attribute rewriters so that {@code f} runs first and {@code g}
 * runs on its output.
 *
 * The two well-known rewriters are recognized by identity and never wrapped:
 * chaining with {@code valueIdentity_} yields the other rewriter, and
 * chaining anything else with {@code disallow_} yields {@code disallow_}.
 *
 * @param {goog.labs.html.AttributeRewriter} f
 * @param {goog.labs.html.AttributeRewriter} g
 * @return {goog.labs.html.AttributeRewriter}
 *     a function that returns g(f(x)), or null if f(x) is null or undefined.
 * @private
 */
goog.labs.html.Sanitizer.chain_ = function(f, g) {
  var identity = goog.labs.html.Sanitizer.valueIdentity_;
  var disallow = goog.labs.html.Sanitizer.disallow_;

  // Sometimes white-listing code ends up allowing things multiple times;
  // composing with the identity is just the other function.
  if (f === identity) {
    return g;
  }
  if (g === identity) {
    return f;
  }

  // If someone tries to white-list a really problematic value, the default
  // rewriter is disallow_.  Nothing survives such a chain, so reject quickly
  // instead of calling the other rewriter.
  if (f === disallow || g === disallow) {
    return disallow;
  }

  return (
      /**
       * @param {goog.labs.html.AttributeValue} x
       * @return {goog.labs.html.AttributeValue}
       */
      function(x) {
        var intermediate = f(x);
        if (intermediate == null) {
          // f rejected the value; g never sees it.
          return null;
        }
        return g(intermediate);
      });
};
/**
 * Given an attribute name, returns a value rewriter that enforces some
 * minimal safety properties.
 *
 * <p>
 * For the URL attributes {@code href} and {@code src}, it returns a rewriter
 * that checks the value against {@code goog.html.SafeUrl}.
 * <p>
 * It blanket disallows the CSS attribute {@code style} and every attribute
 * whose name starts with {@code on} (event handlers).
 * <p>
 * Every other attribute gets the identity rewriter.
 * NOTE(review): only {@code href} and {@code src} are URL-checked here; other
 * URL-valued attributes (e.g. {@code action}, {@code formaction},
 * {@code poster}) pass through unchanged unless the caller supplies a
 * rewriter.  Confirm this is intended.
 *
 * @param {string} attrName lower-cased attribute name.
 * @return {goog.labs.html.AttributeRewriter}
 * @private
 */
goog.labs.html.Sanitizer.defaultRewriterForAttr_ = function(attrName) {
  var isUrlAttr = 'href' === attrName || 'src' === attrName;
  if (isUrlAttr) {
    return goog.labs.html.Sanitizer.checkUrl_;
  }

  var isEventHandler = 'on' === attrName.slice(0, 2);
  if ('style' === attrName || isEventHandler) {
    // TODO(user): delegate to a CSS sanitizer if one is available.
    return goog.labs.html.Sanitizer.disallow_;
  }

  return goog.labs.html.Sanitizer.valueIdentity_;
};
/**
 * Applied automatically to URL attributes to check that they are safe as per
 * {@link SafeUrl}.
 *
 * A value that is already a {@code goog.html.SafeUrl} is used as is; any
 * other value is passed through {@code goog.html.SafeUrl.sanitize}, with
 * strings trimmed first.  A result that unwraps to
 * {@code goog.html.SafeUrl.INNOCUOUS_STRING} is treated as rejected.
 *
 * @param {goog.labs.html.AttributeValue} attrValue a decoded attribute value.
 * @return {goog.html.SafeUrl | null} a URL that is equivalent to the
 *     input or {@code null} if the input is not a safe URL.
 * @private
 */
goog.labs.html.Sanitizer.checkUrl_ = function(attrValue) {
  if (attrValue == null) {
    return null;
  }

  /** @type {!goog.html.SafeUrl} */
  var safeUrl;
  if (attrValue instanceof goog.html.SafeUrl) {
    safeUrl = /** @type {!goog.html.SafeUrl} */ (attrValue);
  } else {
    // Whitespace at the ends of URL-valued attributes in HTML is ignored,
    // so strip it from strings before the URL is checked.
    var candidate = (typeof attrValue === 'string') ?
        goog.string.trim(/** @type {string} */ (attrValue)) :
        attrValue;
    safeUrl = goog.html.SafeUrl.sanitize(
        /** @type {!goog.string.TypedString | string} */ (candidate));
  }

  var wasRejected =
      goog.html.SafeUrl.unwrap(safeUrl) == goog.html.SafeUrl.INNOCUOUS_STRING;
  return wasRejected ? null : safeUrl;
};
// NOTE(review): the callee is defined outside this file.  Presumably this call
// exists only so that the goog.require of
// goog.labs.html.attributeRewriterPresubmitWorkaround above counts as used
// (this file otherwise references the required types only in JSDoc) — confirm
// against that module's definition.
goog.labs.html.attributeRewriterPresubmitWorkaround();