blob: 3fbb5bdf52dcc971bd47ca321100a33b96043451 [file] [log] [blame]
// Copyright 2008 The Closure Library Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS-IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* @fileoverview Provides functions to parse and pretty-print HTML strings.
*
*/
goog.provide('goog.format.HtmlPrettyPrinter');
goog.provide('goog.format.HtmlPrettyPrinter.Buffer');
goog.require('goog.object');
goog.require('goog.string.StringBuffer');
/**
* This class formats HTML to be more human-readable.
* TODO(user): Add hierarchical indentation.
* @param {number=} opt_timeOutMillis Max # milliseconds to spend on #format. If
* this time is exceeded, return partially formatted. 0 or negative number
* indicates no timeout.
* @constructor
* @final
*/
goog.format.HtmlPrettyPrinter = function(opt_timeOutMillis) {
/**
* Max # milliseconds to spend on #format.
* @type {number}
* @private
*/
this.timeOutMillis_ = opt_timeOutMillis && opt_timeOutMillis > 0 ?
opt_timeOutMillis : 0;
};
/**
* Singleton.
* @type {goog.format.HtmlPrettyPrinter?}
* @private
*/
goog.format.HtmlPrettyPrinter.instance_ = null;
/**
* Singleton lazy initializer.
* @return {!goog.format.HtmlPrettyPrinter} Singleton.
* @private
*/
goog.format.HtmlPrettyPrinter.getInstance_ = function() {
if (!goog.format.HtmlPrettyPrinter.instance_) {
goog.format.HtmlPrettyPrinter.instance_ =
new goog.format.HtmlPrettyPrinter();
}
return goog.format.HtmlPrettyPrinter.instance_;
};
/**
* Static utility function. See prototype #format.
* @param {string} html The HTML text to pretty print.
* @return {string} Formatted result.
*/
goog.format.HtmlPrettyPrinter.format = function(html) {
return goog.format.HtmlPrettyPrinter.getInstance_().format(html);
};
/**
* List of patterns used to tokenize HTML for pretty printing. Cache
* subexpression for tag name.
* comment|meta-tag|tag|text|other-less-than-characters
* @type {RegExp}
* @private
*/
goog.format.HtmlPrettyPrinter.TOKEN_REGEX_ =
/(?:<!--.*?-->|<!.*?>|<(\/?)(\w+)[^>]*>|[^<]+|<)/g;
/**
* Tags whose contents we don't want pretty printed.
* @type {Object}
* @private
*/
goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_ = goog.object.createSet(
'script',
'style',
'pre',
'xmp');
/**
* 'Block' tags. We should add newlines before and after these tags during
* pretty printing. Tags drawn mostly from HTML4 definitions for block and other
* non-online tags, excepting the ones in
* #goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_.
*
* @type {Object}
* @private
*/
goog.format.HtmlPrettyPrinter.BLOCK_TAGS_ = goog.object.createSet(
'address',
'applet',
'area',
'base',
'basefont',
'blockquote',
'body',
'caption',
'center',
'col',
'colgroup',
'dir',
'div',
'dl',
'fieldset',
'form',
'frame',
'frameset',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'head',
'hr',
'html',
'iframe',
'isindex',
'legend',
'link',
'menu',
'meta',
'noframes',
'noscript',
'ol',
'optgroup',
'option',
'p',
'param',
'table',
'tbody',
'td',
'tfoot',
'th',
'thead',
'title',
'tr',
'ul');
/**
* Non-block tags that break flow. We insert a line break after, but not before
* these. Tags drawn from HTML4 definitions.
* @type {Object}
* @private
*/
goog.format.HtmlPrettyPrinter.BREAKS_FLOW_TAGS_ = goog.object.createSet(
'br',
'dd',
'dt',
'br',
'li',
'noframes');
/**
* Empty tags. These are treated as both start and end tags.
* @type {Object}
* @private
*/
goog.format.HtmlPrettyPrinter.EMPTY_TAGS_ = goog.object.createSet(
'br',
'hr',
'isindex');
/**
* Breaks up HTML so it's easily readable by the user.
* @param {string} html The HTML text to pretty print.
* @return {string} Formatted result.
* @throws {Error} Regex error, data loss, or endless loop detected.
*/
goog.format.HtmlPrettyPrinter.prototype.format = function(html) {
// Trim leading whitespace, but preserve first indent; in other words, keep
// any spaces immediately before the first non-whitespace character (that's
// what $1 is), but remove all other leading whitespace. This adjustment
// historically had been made in Docs. The motivation is that some
// browsers prepend several line breaks in designMode.
html = html.replace(/^\s*?( *\S)/, '$1');
// Trim trailing whitespace.
html = html.replace(/\s+$/, '');
// Keep track of how much time we've used.
var timeOutMillis = this.timeOutMillis_;
var startMillis = timeOutMillis ? goog.now() : 0;
// Handles concatenation of the result and required line breaks.
var buffer = new goog.format.HtmlPrettyPrinter.Buffer();
// Declare these for efficiency since we access them in a loop.
var tokenRegex = goog.format.HtmlPrettyPrinter.TOKEN_REGEX_;
var nonPpTags = goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_;
var blockTags = goog.format.HtmlPrettyPrinter.BLOCK_TAGS_;
var breaksFlowTags = goog.format.HtmlPrettyPrinter.BREAKS_FLOW_TAGS_;
var emptyTags = goog.format.HtmlPrettyPrinter.EMPTY_TAGS_;
// Used to verify we're making progress through our regex tokenization.
var lastIndex = 0;
// Use this to track non-pretty-printed tags and childen.
var nonPpTagStack = [];
// Loop through each matched token.
var match;
while (match = tokenRegex.exec(html)) {
// Get token.
var token = match[0];
// Is this token a tag? match.length == 3 for tags, 1 for all others.
if (match.length == 3) {
var tagName = match[2];
if (tagName) {
tagName = tagName.toLowerCase();
}
// Non-pretty-printed tags?
if (nonPpTags.hasOwnProperty(tagName)) {
// End tag?
if (match[1] == '/') {
// Do we have a matching start tag?
var stackSize = nonPpTagStack.length;
var startTagName = stackSize ? nonPpTagStack[stackSize - 1] : null;
if (startTagName == tagName) {
// End of non-pretty-printed block. Line break after.
nonPpTagStack.pop();
buffer.pushToken(false, token, !nonPpTagStack.length);
} else {
// Malformed HTML. No line breaks.
buffer.pushToken(false, token, false);
}
} else {
// Start of non-pretty-printed block. Line break before.
buffer.pushToken(!nonPpTagStack.length, token, false);
nonPpTagStack.push(tagName);
}
} else if (nonPpTagStack.length) {
// Inside non-pretty-printed block, no new line breaks.
buffer.pushToken(false, token, false);
} else if (blockTags.hasOwnProperty(tagName)) {
// Put line break before start block and after end block tags.
var isEmpty = emptyTags.hasOwnProperty(tagName);
var isEndTag = match[1] == '/';
buffer.pushToken(isEmpty || !isEndTag, token, isEmpty || isEndTag);
} else if (breaksFlowTags.hasOwnProperty(tagName)) {
var isEmpty = emptyTags.hasOwnProperty(tagName);
var isEndTag = match[1] == '/';
// Put line break after end flow-breaking tags.
buffer.pushToken(false, token, isEndTag || isEmpty);
} else {
// All other tags, no line break.
buffer.pushToken(false, token, false);
}
} else {
// Non-tags, no line break.
buffer.pushToken(false, token, false);
}
// Double check that we're making progress.
var newLastIndex = tokenRegex.lastIndex;
if (!token || newLastIndex <= lastIndex) {
throw Error('Regex failed to make progress through source html.');
}
lastIndex = newLastIndex;
// Out of time?
if (timeOutMillis) {
if (goog.now() - startMillis > timeOutMillis) {
// Push unprocessed data as one big token and reset regex object.
buffer.pushToken(false, html.substring(tokenRegex.lastIndex), false);
tokenRegex.lastIndex = 0;
break;
}
}
}
// Ensure we end in a line break.
buffer.lineBreak();
// Construct result string.
var result = String(buffer);
// Length should be original length plus # line breaks added.
var expectedLength = html.length + buffer.breakCount;
if (result.length != expectedLength) {
throw Error('Lost data pretty printing html.');
}
return result;
};
/**
* This class is a buffer to which we push our output. It tracks line breaks to
* make sure we don't add unnecessary ones.
* @constructor
* @final
*/
goog.format.HtmlPrettyPrinter.Buffer = function() {
/**
* Tokens to be output in #toString.
* @type {goog.string.StringBuffer}
* @private
*/
this.out_ = new goog.string.StringBuffer();
};
/**
* Tracks number of line breaks added.
* @type {number}
*/
goog.format.HtmlPrettyPrinter.Buffer.prototype.breakCount = 0;
/**
* Tracks if we are at the start of a new line.
* @type {boolean}
* @private
*/
goog.format.HtmlPrettyPrinter.Buffer.prototype.isBeginningOfNewLine_ = true;
/**
* Tracks if we need a new line before the next token.
* @type {boolean}
* @private
*/
goog.format.HtmlPrettyPrinter.Buffer.prototype.needsNewLine_ = false;
/**
* Adds token and necessary line breaks to output buffer.
* @param {boolean} breakBefore If true, add line break before token if
* necessary.
* @param {string} token Token to push.
* @param {boolean} breakAfter If true, add line break after token if
* necessary.
*/
goog.format.HtmlPrettyPrinter.Buffer.prototype.pushToken = function(
breakBefore, token, breakAfter) {
// If this token needs a preceeding line break, and
// we haven't already added a line break, and
// this token does not start with a line break,
// then add line break.
// Due to FF3.0 bug with lists, we don't insert a /n
// right before </ul>. See bug 1520665.
if ((this.needsNewLine_ || breakBefore) &&
!/^\r?\n/.test(token) &&
!/\/ul/i.test(token)) {
this.lineBreak();
}
// Token.
this.out_.append(token);
// Remember if this string ended with a line break so we know we don't have to
// insert another one before the next token.
this.isBeginningOfNewLine_ = /\r?\n$/.test(token);
// Remember if this token requires a line break after it. We don't insert it
// here because we might not have to if the next token starts with a line
// break.
this.needsNewLine_ = breakAfter && !this.isBeginningOfNewLine_;
};
/**
* Append line break if we need one.
*/
goog.format.HtmlPrettyPrinter.Buffer.prototype.lineBreak = function() {
if (!this.isBeginningOfNewLine_) {
this.out_.append('\n');
++this.breakCount;
}
};
/**
* @return {string} String representation of tokens.
* @override
*/
goog.format.HtmlPrettyPrinter.Buffer.prototype.toString = function() {
return this.out_.toString();
};