| // Copyright 2008 The Closure Library Authors. All Rights Reserved. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS-IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| /** |
| * @fileoverview Provides functions to parse and pretty-print HTML strings. |
| * |
| */ |
| |
| goog.provide('goog.format.HtmlPrettyPrinter'); |
| goog.provide('goog.format.HtmlPrettyPrinter.Buffer'); |
| |
| goog.require('goog.object'); |
| goog.require('goog.string.StringBuffer'); |
| |
| |
| |
/**
 * Formatter that re-flows HTML markup so that it is easier for a person to
 * read.
 * TODO(user): Add hierarchical indentation.
 * @param {number=} opt_timeOutMillis Upper bound, in milliseconds, on the time
 *     a single #format call may spend. When the bound is exceeded the rest of
 *     the input is returned unformatted. Omitting the argument, or passing
 *     zero or a negative number, disables the timeout.
 * @constructor
 * @final
 */
goog.format.HtmlPrettyPrinter = function(opt_timeOutMillis) {
  // Anything that is missing, falsy, or not strictly positive means
  // "no timeout", which is stored as 0.
  var hasTimeOut = Boolean(opt_timeOutMillis) && opt_timeOutMillis > 0;

  /**
   * Time budget for #format in milliseconds; 0 when there is no limit.
   * @type {number}
   * @private
   */
  this.timeOutMillis_ = hasTimeOut ? opt_timeOutMillis : 0;
};
| |
| |
/**
 * Shared pretty printer used by the static #format helper. Stays null until
 * getInstance_ creates it on first use.
 * @type {goog.format.HtmlPrettyPrinter?}
 * @private
 */
goog.format.HtmlPrettyPrinter.instance_ = null;
| |
| |
/**
 * Returns the shared pretty printer, constructing it (with no timeout
 * argument) the first time it is requested.
 * @return {!goog.format.HtmlPrettyPrinter} The shared instance.
 * @private
 */
goog.format.HtmlPrettyPrinter.getInstance_ = function() {
  var printer = goog.format.HtmlPrettyPrinter.instance_;
  if (printer) {
    return printer;
  }

  printer = new goog.format.HtmlPrettyPrinter();
  goog.format.HtmlPrettyPrinter.instance_ = printer;
  return printer;
};
| |
| |
/**
 * Static convenience wrapper: formats the given HTML with the shared
 * pretty printer instance. See the prototype #format for details.
 * @param {string} html The HTML text to pretty print.
 * @return {string} Formatted result.
 */
goog.format.HtmlPrettyPrinter.format = function(html) {
  var printer = goog.format.HtmlPrettyPrinter.getInstance_();
  return printer.format(html);
};
| |
| |
/**
 * Global regular expression that splits HTML into tokens for pretty printing.
 * Its alternatives are tried in this order:
 *   1. a comment (<!-- ... -->),
 *   2. any other <! ... > declaration,
 *   3. a start or end tag,
 *   4. a run of text containing no '<',
 *   5. a lone '<' that did not fit any of the above.
 * For tags, capture group 1 holds the '/' of an end tag (empty for a start
 * tag) and capture group 2 holds the tag name. Because '.' does not match
 * line terminators, a comment or declaration that spans several lines is not
 * matched by alternatives 1 or 2.
 * The expression carries the 'g' flag, so it keeps its scan position in
 * lastIndex between exec calls.
 * @type {RegExp}
 * @private
 */
goog.format.HtmlPrettyPrinter.TOKEN_REGEX_ =
    /(?:<!--.*?-->|<!.*?>|<(\/?)(\w+)[^>]*>|[^<]+|<)/g;
| |
| |
/**
 * Set of tag names whose contents are passed through untouched: #format adds
 * no line breaks between the start tag and its matching end tag.
 * @type {Object}
 * @private
 */
goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_ =
    goog.object.createSet('script', 'style', 'pre', 'xmp');
| |
| |
/**
 * Set of 'block' tag names. #format requests a line break before the start
 * tag and after the end tag of each of these. The list is drawn mostly from
 * the HTML4 definitions of block and other non-inline tags, leaving out the
 * ones in #goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_.
 *
 * @type {Object}
 * @private
 */
goog.format.HtmlPrettyPrinter.BLOCK_TAGS_ = goog.object.createSet(
    'address', 'applet', 'area',
    'base', 'basefont', 'blockquote', 'body',
    'caption', 'center', 'col', 'colgroup',
    'dir', 'div', 'dl',
    'fieldset', 'form', 'frame', 'frameset',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'hr', 'html',
    'iframe', 'isindex',
    'legend', 'link',
    'menu', 'meta',
    'noframes', 'noscript',
    'ol', 'optgroup', 'option',
    'p', 'param',
    'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'title', 'tr',
    'ul');
| |
| |
/**
 * Set of non-block tag names that break flow. #format requests a line break
 * after these (after the end tag, or after the tag itself when it is also in
 * EMPTY_TAGS_), but never before them. Tags drawn from HTML4 definitions.
 *
 * Note: 'noframes' is also listed in BLOCK_TAGS_, which #format consults
 * first, so for that tag the block-tag handling applies.
 * @type {Object}
 * @private
 */
goog.format.HtmlPrettyPrinter.BREAKS_FLOW_TAGS_ = goog.object.createSet(
    'br',
    'dd',
    'dt',
    'li',
    'noframes');
| |
| |
/**
 * Set of tag names for elements that have no content. #format treats a tag
 * from this set as if it were both a start tag and an end tag when deciding
 * where line breaks go.
 * @type {Object}
 * @private
 */
goog.format.HtmlPrettyPrinter.EMPTY_TAGS_ =
    goog.object.createSet('br', 'hr', 'isindex');
| |
| |
/**
 * Breaks up HTML so it's easily readable by the user. The input is tokenized
 * with TOKEN_REGEX_ and each token is pushed to a Buffer together with flags
 * saying whether a line break is wanted before and/or after it. No characters
 * other than line breaks are ever added, and none are removed apart from the
 * leading/trailing whitespace trimmed up front.
 * @param {string} html The HTML text to pretty print.
 * @return {string} Formatted result. Always ends in a line break unless the
 *     trimmed input is empty.
 * @throws {Error} Regex error, data loss, or endless loop detected.
 */
goog.format.HtmlPrettyPrinter.prototype.format = function(html) {
  // Trim leading whitespace, but preserve first indent; in other words, keep
  // any spaces immediately before the first non-whitespace character (that's
  // what $1 is), but remove all other leading whitespace. This adjustment
  // historically had been made in Docs. The motivation is that some
  // browsers prepend several line breaks in designMode.
  html = html.replace(/^\s*?( *\S)/, '$1');

  // Trim trailing whitespace.
  html = html.replace(/\s+$/, '');

  // Keep track of how much time we've used.
  var timeOutMillis = this.timeOutMillis_;
  var startMillis = timeOutMillis ? goog.now() : 0;

  // Handles concatenation of the result and required line breaks.
  var buffer = new goog.format.HtmlPrettyPrinter.Buffer();

  // Declare these for efficiency since we access them in a loop.
  var tokenRegex = goog.format.HtmlPrettyPrinter.TOKEN_REGEX_;
  var nonPpTags = goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_;
  var blockTags = goog.format.HtmlPrettyPrinter.BLOCK_TAGS_;
  var breaksFlowTags = goog.format.HtmlPrettyPrinter.BREAKS_FLOW_TAGS_;
  var emptyTags = goog.format.HtmlPrettyPrinter.EMPTY_TAGS_;

  // The tokenizer is a shared regex with the 'g' flag, so it remembers its
  // scan position. If an earlier call threw part-way through, that position
  // is stale; always start scanning from the beginning of this input.
  tokenRegex.lastIndex = 0;

  // Used to verify we're making progress through our regex tokenization.
  var lastIndex = 0;

  // Use this to track non-pretty-printed tags and children.
  var nonPpTagStack = [];

  // Loop through each matched token.
  var match;
  while (match = tokenRegex.exec(html)) {
    // Get token.
    var token = match[0];

    // Only the tag alternative of the regex fills in the tag-name group. The
    // length of the match array cannot be used to detect tags because exec()
    // always returns one slot per capture group.
    var tagName = match[2] ? match[2].toLowerCase() : '';
    var isEndTag = match[1] == '/';
    var isEmpty;

    if (!tagName) {
      // Non-tags, no line break.
      buffer.pushToken(false, token, false);
    } else if (nonPpTags.hasOwnProperty(tagName)) {
      // Non-pretty-printed tag.
      if (isEndTag) {
        // Do we have a matching start tag?
        var stackSize = nonPpTagStack.length;
        var startTagName = stackSize ? nonPpTagStack[stackSize - 1] : null;
        if (startTagName == tagName) {
          // End of non-pretty-printed block. Line break after, but only once
          // the outermost such block has been closed.
          nonPpTagStack.pop();
          buffer.pushToken(false, token, !nonPpTagStack.length);
        } else {
          // Malformed HTML. No line breaks.
          buffer.pushToken(false, token, false);
        }
      } else {
        // Start of non-pretty-printed block. Line break before, unless we
        // are already inside another such block.
        buffer.pushToken(!nonPpTagStack.length, token, false);
        nonPpTagStack.push(tagName);
      }
    } else if (nonPpTagStack.length) {
      // Inside non-pretty-printed block, no new line breaks.
      buffer.pushToken(false, token, false);
    } else if (blockTags.hasOwnProperty(tagName)) {
      // Put line break before start block and after end block tags. Empty
      // tags count as both.
      isEmpty = emptyTags.hasOwnProperty(tagName);
      buffer.pushToken(isEmpty || !isEndTag, token, isEmpty || isEndTag);
    } else if (breaksFlowTags.hasOwnProperty(tagName)) {
      // Put line break after end flow-breaking tags.
      isEmpty = emptyTags.hasOwnProperty(tagName);
      buffer.pushToken(false, token, isEndTag || isEmpty);
    } else {
      // All other tags, no line break.
      buffer.pushToken(false, token, false);
    }

    // Double check that we're making progress.
    var newLastIndex = tokenRegex.lastIndex;
    if (!token || newLastIndex <= lastIndex) {
      throw Error('Regex failed to make progress through source html.');
    }
    lastIndex = newLastIndex;

    // Out of time?
    if (timeOutMillis) {
      if (goog.now() - startMillis > timeOutMillis) {
        // Push unprocessed data as one big token and reset regex object.
        buffer.pushToken(false, html.substring(tokenRegex.lastIndex), false);
        tokenRegex.lastIndex = 0;
        break;
      }
    }
  }

  // Ensure we end in a line break.
  buffer.lineBreak();

  // Construct result string.
  var result = String(buffer);

  // Length should be original length plus # line breaks added.
  var expectedLength = html.length + buffer.breakCount;
  if (result.length != expectedLength) {
    throw Error('Lost data pretty printing html.');
  }

  return result;
};
| |
| |
| |
/**
 * Output buffer for the pretty printer. Tokens are pushed to it one at a
 * time, and it keeps enough state about line breaks to avoid adding ones that
 * are not needed.
 * @constructor
 * @final
 */
goog.format.HtmlPrettyPrinter.Buffer = function() {
  var out = new goog.string.StringBuffer();

  /**
   * Accumulates the pushed tokens and line breaks; rendered by #toString.
   * @type {goog.string.StringBuffer}
   * @private
   */
  this.out_ = out;
};
| |
| |
/**
 * Number of line breaks this buffer has inserted on its own (line breaks that
 * were already part of pushed tokens are not counted).
 * @type {number}
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.breakCount = 0;


/**
 * Whether the output currently sits at the start of a line. True for a fresh
 * buffer, and afterwards whenever the last pushed token ended in a line
 * break.
 * @type {boolean}
 * @private
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.isBeginningOfNewLine_ = true;


/**
 * Whether the previously pushed token asked for a line break after it that
 * has not been written yet. The break is deferred until the next token is
 * pushed.
 * @type {boolean}
 * @private
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.needsNewLine_ = false;
| |
| |
/**
 * Adds token and necessary line breaks to output buffer.
 * @param {boolean} breakBefore If true, add line break before token if
 *     necessary.
 * @param {string} token Token to push.
 * @param {boolean} breakAfter If true, add line break after token if
 *     necessary. The break is not written immediately; it is remembered and
 *     applied when the next token is pushed.
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.pushToken = function(
    breakBefore, token, breakAfter) {
  // If this token needs a preceding line break, and
  // we haven't already added a line break, and
  // this token does not start with a line break,
  // then add line break.
  // Due to FF3.0 bug with lists, we don't insert a \n
  // right before </ul>. See bug 1520665.
  // The </ul> test is anchored to the start of the token so that only an
  // actual closing ul tag is exempted, not any token that merely contains
  // the characters "/ul" (for example in text or an attribute value).
  if ((this.needsNewLine_ || breakBefore) &&
      !/^\r?\n/.test(token) &&
      !/^<\/ul\b/i.test(token)) {
    this.lineBreak();
  }

  // Token.
  this.out_.append(token);

  // Remember if this string ended with a line break so we know we don't have to
  // insert another one before the next token.
  this.isBeginningOfNewLine_ = /\r?\n$/.test(token);

  // Remember if this token requires a line break after it. We don't insert it
  // here because we might not have to if the next token starts with a line
  // break.
  this.needsNewLine_ = breakAfter && !this.isBeginningOfNewLine_;
};
| |
| |
/**
 * Appends a line break unless the output is already at the start of a line.
 * Every break written here is counted in breakCount. Calling this several
 * times in a row writes at most one break.
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.lineBreak = function() {
  if (!this.isBeginningOfNewLine_) {
    this.out_.append('\n');
    ++this.breakCount;
    // Record that we are now at the start of a line so that an immediately
    // following lineBreak() call does not add a second, redundant break.
    this.isBeginningOfNewLine_ = true;
  }
};
| |
| |
/**
 * Renders everything pushed so far, including inserted line breaks.
 * @return {string} String representation of tokens.
 * @override
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.toString = function() {
  var text = this.out_.toString();
  return text;
};