blob: 6fce2b32b2dfeb6e5d7392d8f0563a330bda1ca6 [file] [log] [blame]
/*******************************************************************************
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
******************************************************************************/
package org.apache.sling.scripting.sightly.impl.html.dom;
import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
/**
* HTML parser. Invokes a <code>DocumentHandler</code> whenever an event occurs.
*/
public final class HtmlParser {
/** Size, in characters, of the chunk array used when draining the reader. */
private static final int BUF_SIZE = 2048;
/**
 * Internal character buffer. Holds markup that could not be classified yet
 * (a tag or comment cut by a chunk boundary) until it is either processed
 * as a tag/comment or flushed to the handler as plain characters.
 */
private final CharArrayWriter buffer = new CharArrayWriter(256);
/** Tag tokenizer used to split a complete tag into name and attributes. */
private final TagTokenizer tokenizer = new TagTokenizer();
/** Accumulates the characters of the tag name currently being scanned. */
private final CharArrayWriter tagNameBuffer = new CharArrayWriter(30);
/** Cached tag name, lazily built from {@link #tagNameBuffer}; {@code null} until requested. */
private String tagName;
/** Registered document handler that receives all parse events. */
private final DocumentHandler documentHandler;
/** Top-level states of the character-driven state machine. */
private enum PARSE_STATE {
// plain text; this is the initial state, left when '<' or '$' is seen
OUTSIDE,
// after '<', while scanning a tag (or tag-like markup) up to its '>'
TAG,
// inside a script element: content is passed through until "</script" + '>'
SCRIPT,
// after "<!", while deciding whether this is a comment and scanning it
COMMENT,
// inside a quoted string; ends at the matching quote character
STRING,
// after '$': a possible "${...}" expression, left on '}' or when no '{' follows
EXPRESSION
}
/** Tag type constant: no tag is pending, unconsumed input is plain character data */
private final static int TT_NONE = 0;
/** Tag type constant: a '<' was seen, the following input may turn out to be a tag */
private final static int TT_MAYBE = 1;
/** Tag type constant: a tag name followed by whitespace or a quote was seen */
private final static int TT_TAG = 2;
/** Expression state constant: not directly behind a '$' */
private final static int EXPR_NONE = 0;
/** Expression state constant: a '$' was just seen, the next character decides */
private final static int EXPR_MAYBE = 1;
/**
 * Lower-case names of elements that are reported to the handler as
 * self-closing start elements even when written without a trailing slash.
 */
final static Set<String> VOID_ELEMENTS = Collections.unmodifiableSet(new HashSet<>(
Arrays.asList("area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr")));
/** Parse state */
private PARSE_STATE parseState = PARSE_STATE.OUTSIDE;
/** Parse substate; its meaning depends on the current parse state */
private int parseSubState;
/** Previous parse state, restored when a quoted string ends */
private PARSE_STATE prevParseState;
/** Current tag type, one of the TT_* constants */
private int tagType;
/** Expression type, one of the EXPR_* constants */
private int exprType;
/** Quote character that opened the string currently being skipped */
private char quoteChar;
/**
 * Parses the markup supplied by {@code reader} and reports every event to
 * {@code documentHandler}. A fresh parser instance is used for each call and
 * the reader is closed once parsing has finished or failed.
 *
 * @param reader          source of the markup
 * @param documentHandler receiver of the parse events
 * @throws IOException if reading fails or the handler reports an error
 */
public static void parse(final Reader reader, final DocumentHandler documentHandler)
        throws IOException {
    new HtmlParser(documentHandler).parse(reader);
}
/**
 * Creates a parser that reports all events to the given handler. The
 * constructor is private; instances are only created by the static
 * {@code parse} entry point.
 *
 * @param documentHandler receiver of the parse events
 */
private HtmlParser(final DocumentHandler documentHandler) {
this.documentHandler = documentHandler;
}
/**
 * Drains the reader chunk by chunk through the state machine, framed by the
 * handler's start and end notifications. The reader is always closed
 * afterwards; a failure while closing is deliberately ignored.
 *
 * @param reader source of the markup
 * @throws IOException if reading fails or the handler reports an error
 */
private void parse(final Reader reader) throws IOException {
    try {
        documentHandler.onStart();
        final char[] chunk = new char[BUF_SIZE];
        for (int count = reader.read(chunk); count > 0; count = reader.read(chunk)) {
            update(chunk, count);
        }
        // hand over whatever is still held back (e.g. an unterminated tag)
        flushBuffer();
        documentHandler.onEnd();
    } finally {
        try {
            reader.close();
        } catch (final IOException ignored) {
            // closing is best effort, nothing sensible can be done here
        }
    }
}
/**
 * Feed characters to the parser.
 * <p>
 * Runs the state machine over one chunk of input. Plain text is handed to
 * the document handler directly from {@code buf}; a tag or comment that is
 * still incomplete at the end of the chunk is copied into the internal
 * buffer so it can be completed by the next call.
 *
 * @param buf character buffer
 * @param len length of affected buffer
 * @throws IOException if the document handler reports an error
 */
private void update(final char[] buf, int len) throws IOException {
    // start marks the first character of buf not yet handed over or buffered
    int start = 0;
    final int end = len;

    for (int curr = start; curr < end; curr++) {
        final char c = buf[curr];

        switch (parseState) {
            case OUTSIDE:
                if (c == '<') {
                    // emit the text collected so far, then start scanning a tag
                    if (curr > start) {
                        documentHandler.onCharacters(buf, start, curr - start);
                    }
                    start = curr;
                    parseState = PARSE_STATE.TAG;
                    parseSubState = 0;
                    tagType = TT_MAYBE;
                    resetTagName();
                } else if (c == '$') {
                    exprType = EXPR_MAYBE;
                    parseState = PARSE_STATE.EXPRESSION;
                }
                break;

            case TAG:
                switch (parseSubState) {
                    case -1:
                        // not a real tag: skip (honouring quotes) up to the closing '>'
                        if (c == '"' || c == '\'') {
                            quoteChar = c;
                            prevParseState = parseState;
                            parseState = PARSE_STATE.STRING;
                            parseSubState = -1;
                        } else if (c == '>') {
                            parseState = PARSE_STATE.OUTSIDE;
                        }
                        break;
                    case 0:
                        // first character behind '<'
                        if (c == '!') {
                            parseState = PARSE_STATE.COMMENT;
                            parseSubState = 0;
                            tagType = TT_NONE;
                            // keep the accumulated buffer
                        } else if (c == '"' || c == '\'') {
                            quoteChar = c;
                            prevParseState = parseState;
                            parseState = PARSE_STATE.STRING;
                            parseSubState = -1;
                            tagType = TT_NONE;
                            flushBuffer();
                        } else if (c == '>') {
                            parseState = PARSE_STATE.OUTSIDE;
                            tagType = TT_NONE;
                            flushBuffer();
                        } else if (!Character.isWhitespace(c)) {
                            tagNameBuffer.write(c);
                            parseSubState = 1;
                        } else {
                            parseSubState = -1;
                            tagType = TT_NONE;
                            flushBuffer();
                        }
                        break;
                    case 1:
                        // collecting the tag name
                        if (c == '"' || c == '\'') {
                            tagType = TT_TAG;
                            parseSubState = 2;
                            quoteChar = c;
                            prevParseState = parseState;
                            parseState = PARSE_STATE.STRING;
                        } else if (c == '>') {
                            parseState = processTag(buf, start, curr - start + 1) ? PARSE_STATE.SCRIPT : PARSE_STATE.OUTSIDE;
                            start = curr + 1;
                            tagType = TT_NONE;
                            parseSubState = 0;
                        } else if (Character.isWhitespace(c)) {
                            tagType = TT_TAG;
                            parseSubState = 2;
                        } else {
                            tagNameBuffer.write(c);
                        }
                        break;
                    case 2:
                        // behind the tag name: skip attributes up to the closing '>'
                        if (c == '"' || c == '\'') {
                            quoteChar = c;
                            prevParseState = parseState;
                            parseState = PARSE_STATE.STRING;
                        } else if (c == '>') {
                            if (tagType == TT_TAG) {
                                parseState = processTag(buf, start, curr - start + 1) ? PARSE_STATE.SCRIPT : PARSE_STATE.OUTSIDE;
                                start = curr + 1;
                            } else {
                                flushBuffer();
                                parseState = "SCRIPT".equalsIgnoreCase(getTagName()) ? PARSE_STATE.SCRIPT : PARSE_STATE.OUTSIDE;
                            }
                            tagType = TT_NONE;
                            parseSubState = 0;
                        }
                        break;
                    default:
                        break;
                }
                break;

            case COMMENT:
                switch (parseSubState) {
                    case 0:
                        // behind "<!": a '-' may open a comment, anything else is other markup
                        if (c == '-') {
                            parseSubState++;
                        } else if (c == '"' || c == '\'') {
                            quoteChar = c;
                            prevParseState = PARSE_STATE.TAG;
                            parseState = PARSE_STATE.STRING;
                            parseSubState = -1;
                            tagType = TT_NONE;
                            flushBuffer();
                        } else if (c == '>') {
                            parseState = PARSE_STATE.OUTSIDE;
                            tagType = TT_NONE;
                            flushBuffer();
                        } else {
                            parseState = PARSE_STATE.TAG;
                            parseSubState = -1;
                            tagType = TT_NONE;
                            flushBuffer();
                        }
                        break;
                    case 1:
                        // behind "<!-": a second '-' completes the comment opener
                        if (c == '-') {
                            parseSubState++;
                        } else if (c == '"' || c == '\'') {
                            quoteChar = c;
                            prevParseState = PARSE_STATE.TAG;
                            parseState = PARSE_STATE.STRING;
                            parseSubState = -1;
                            tagType = TT_NONE;
                            flushBuffer();
                        } else if (c == '>') {
                            parseState = PARSE_STATE.OUTSIDE;
                            tagType = TT_NONE;
                            flushBuffer();
                        } else {
                            parseState = PARSE_STATE.TAG;
                            parseSubState = -1;
                            tagType = TT_NONE;
                            flushBuffer();
                        }
                        break;
                    case 2:
                        // inside the comment body, waiting for the first '-' of "-->"
                        if (c == '-') {
                            parseSubState++;
                        }
                        break;
                    case 3:
                        // one '-' seen
                        if (c == '-') {
                            parseSubState++;
                        } else {
                            parseSubState = 2;
                        }
                        break;
                    case 4:
                        // "--" seen
                        if (c == '>') {
                            parseState = PARSE_STATE.OUTSIDE;
                            processComment(buf, start, curr - start + 1);
                            start = curr + 1;
                        } else if (c != '-') {
                            parseSubState = 2;
                        }
                        // a further '-' still leaves "--" directly in front of the next
                        // character, so stay here; otherwise a comment closed with
                        // "--->" would never terminate
                        break;
                    default:
                        break;
                }
                break;

            case SCRIPT:
                // script content is passed through; only "</script" + '>' ends it
                switch (parseSubState) {
                    case 0:
                        if (c == '<') {
                            if (curr > start) {
                                documentHandler.onCharacters(buf, start, curr - start);
                            }
                            start = curr;
                            tagType = TT_MAYBE;
                            parseSubState++;
                        }
                        break;
                    case 1:
                        if (c == '/') {
                            parseSubState++;
                        } else {
                            tagType = TT_NONE;
                            parseSubState = 0;
                            flushBuffer();
                        }
                        break;
                    case 2:
                        if (c == 'S' || c == 's') {
                            parseSubState++;
                        } else {
                            tagType = TT_NONE;
                            parseSubState = 0;
                            flushBuffer();
                        }
                        break;
                    case 3:
                        if (c == 'C' || c == 'c') {
                            parseSubState++;
                        } else {
                            tagType = TT_NONE;
                            parseSubState = 0;
                            flushBuffer();
                        }
                        break;
                    case 4:
                        if (c == 'R' || c == 'r') {
                            parseSubState++;
                        } else {
                            tagType = TT_NONE;
                            parseSubState = 0;
                            flushBuffer();
                        }
                        break;
                    case 5:
                        if (c == 'I' || c == 'i') {
                            parseSubState++;
                        } else {
                            tagType = TT_NONE;
                            parseSubState = 0;
                            flushBuffer();
                        }
                        break;
                    case 6:
                        if (c == 'P' || c == 'p') {
                            parseSubState++;
                        } else {
                            tagType = TT_NONE;
                            parseSubState = 0;
                            flushBuffer();
                        }
                        break;
                    case 7:
                        if (c == 'T' || c == 't') {
                            parseSubState++;
                        } else {
                            tagType = TT_NONE;
                            parseSubState = 0;
                            flushBuffer();
                        }
                        break;
                    case 8:
                        // "</script" matched: the closing '>' ends the script section
                        if (c == '>') {
                            processTag(buf, start, curr - start + 1);
                            start = curr + 1;
                            tagType = TT_NONE;
                            parseState = PARSE_STATE.OUTSIDE;
                        }
                        break;
                    default:
                        break;
                }
                break;

            case STRING:
                // skip everything up to the matching quote, then resume where we were
                if (c == quoteChar) {
                    parseState = prevParseState;
                }
                break;

            case EXPRESSION:
                if (exprType == EXPR_MAYBE && c != '{') {
                    // not a valid expression
                    if (c == '<') {
                        //reset to process tag correctly
                        curr--;
                    }
                    parseState = PARSE_STATE.OUTSIDE;
                } else if (c == '}') {
                    parseState = PARSE_STATE.OUTSIDE;
                }
                exprType = EXPR_NONE;
                break;

            default:
                break;
        }
    }

    // deal with the unconsumed tail of this chunk
    if (start < end) {
        if (tagType == TT_NONE && parseState != PARSE_STATE.COMMENT) {
            // plain text: hand it over right away
            documentHandler.onCharacters(buf, start, end - start);
        } else {
            // possibly an incomplete tag or comment: keep it for the next chunk
            buffer.write(buf, start, end - start);
        }
    }
}
/**
 * Discards the tag name scanned so far: empties the tag name buffer and
 * drops the cached string derived from it.
 */
private void resetTagName() {
    tagNameBuffer.reset();
    tagName = null;
}
/**
 * Returns the tag name scanned so far. The string is built from the tag
 * name buffer on first use and cached; the buffer itself is left untouched.
 *
 * @return tagname
 */
private String getTagName() {
    if (tagName != null) {
        return tagName;
    }
    tagName = tagNameBuffer.toString();
    return tagName;
}
/**
 * Flush internal buffer. Everything currently held in the internal buffer
 * is reported to the document handler as character data and the buffer is
 * emptied afterwards. Does nothing when the buffer is empty.
 *
 * @throws IOException if the document handler reports an error
 */
private void flushBuffer() throws IOException {
    if (buffer.size() == 0) {
        return;
    }
    final char[] pending = buffer.toCharArray();
    documentHandler.onCharacters(pending, 0, pending.length);
    buffer.reset();
}
/**
 * Completes a comment: appends the given slice to the characters already
 * accumulated, reports the whole text to the document handler as one
 * comment and empties the internal buffer.
 *
 * @param ch character data work buffer
 * @param off start offset for current data
 * @param len length of current data
 * @throws IOException if the document handler reports an error
 */
private void processComment(char[] ch, int off, int len) throws IOException {
    buffer.write(ch, off, len);
    final String comment = buffer.toString();
    documentHandler.onComment(comment);
    buffer.reset();
}
/**
 * Decompose a tag and feed it to the document handler.
 * <p>
 * The given slice is appended to the characters already accumulated, the
 * complete tag is tokenized and reported either as a start or as an end
 * element. Start elements are flagged as self-closing when they end with a
 * slash or when their name is one of the {@link #VOID_ELEMENTS}.
 *
 * @param ch
 *            character data
 * @param off
 *            offset where character data starts
 * @param len
 *            length of character data
 * @return {@code true} if the tag opens a script section, i.e. it is a
 *         {@code script} start tag that is not self-closed
 * @throws IOException if the document handler reports an error
 */
private boolean processTag(char[] ch, int off, int len) throws IOException {
    buffer.write(ch, off, len);
    final char[] snippet = buffer.toCharArray();
    tokenizer.tokenize(snippet, 0, snippet.length);

    final boolean endTag = tokenizer.endTag();
    if (!endTag) {
        // Locale.ROOT keeps the lookup independent of the default locale
        // (e.g. "IMG" must not become "ımg" under a Turkish locale)
        documentHandler.onStartElement(tokenizer.tagName(), tokenizer.attributes(),
                tokenizer.endSlash() || VOID_ELEMENTS.contains(tokenizer.tagName().toLowerCase(Locale.ROOT)));
    } else {
        documentHandler.onEndElement(tokenizer.tagName());
    }
    buffer.reset();

    // only a start tag can open a script section; a stray </script> must not
    return !endTag && "SCRIPT".equalsIgnoreCase(tokenizer.tagName()) && !tokenizer.endSlash();
}
}