src/main/java/org/apache/sling/scripting/sightly/impl/html/dom/HtmlParser.java - sling-org-apache-sling-scripting-sightly-compiler - Git at Google

 /*******************************************************************************
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  ******************************************************************************/
 package org.apache.sling.scripting.sightly.impl.html.dom;

 import java.io.CharArrayWriter;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.HashSet;
 import java.util.Set;

 /**
  * HTML parser. Invokes a <code>DocumentHandler</code> whenever an event occurs.
  * <p>
  * Input is read from a {@link Reader} chunk by chunk and pushed through a hand-written
  * state machine. A tag or comment may span several chunks; the part already seen is kept
  * in an internal buffer until the construct is complete.
  */
 public final class HtmlParser {

     /** Number of characters requested from the reader per read call. */
     private static final int BUF_SIZE = 2048;

     /** Characters that must follow '&lt;' (lower-case form) for a script end tag to be recognised. */
     private static final String SCRIPT_END_LOWER = "/script";

     /** Upper-case form of {@link #SCRIPT_END_LOWER}; both strings have the same length. */
     private static final String SCRIPT_END_UPPER = "/SCRIPT";

     /** Internal character buffer; holds the part of an unfinished tag or comment seen in earlier chunks */
     private final CharArrayWriter buffer = new CharArrayWriter(256);

     /** Tag tokenizer */
     private final TagTokenizer tokenizer = new TagTokenizer();

     /** Tag name buffer */
     private final CharArrayWriter tagNameBuffer = new CharArrayWriter(30);

     /** Tag name, cached from the tag name buffer by {@link #getTagName()} */
     private String tagName;

     /** Registered document handler */
     private final DocumentHandler documentHandler;

     /** Top-level states of the parser. */
     private enum PARSE_STATE {
         /** Plain text between tags. */
         OUTSIDE,
         /** Inside markup introduced by '&lt;'. */
         TAG,
         /** Inside the content that follows a script start tag. */
         SCRIPT,
         /** Inside markup introduced by '&lt;!'. */
         COMMENT,
         /** Inside a quoted string within markup. */
         STRING,
         /** After a '$' in plain text, possibly inside a '${...}' expression. */
         EXPRESSION
     }

     /** Tag type constant: no tag is being collected, pending characters are plain text */
     private static final int TT_NONE = 0;

     /** Tag type constant: a '&lt;' was seen, it is not yet known whether a tag follows */
     private static final int TT_MAYBE = 1;

     /** Tag type constant: a tag name was read completely */
     private static final int TT_TAG = 2;

     /** Expression state constant */
     private static final int EXPR_NONE = 0;

     /** Expression state constant: a '$' was seen, the next character decides */
     private static final int EXPR_MAYBE = 1;

     /**
      * Lower-case element names for which the last argument passed to
      * <code>onStartElement</code> is <code>true</code> even when the tag has no end slash.
      */
     static final Set<String> VOID_ELEMENTS = Collections.unmodifiableSet(new HashSet<>(
             Arrays.asList("area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr")));

     /** Parse state */
     private PARSE_STATE parseState = PARSE_STATE.OUTSIDE;

     /** Parse substate; its meaning depends on the current parse state */
     private int parseSubState;

     /** Parse state to return to when the current quoted string ends */
     private PARSE_STATE prevParseState;

     /** Current tag type */
     private int tagType;

     /** Expression type */
     private int exprType;

     /** Quote character that opened the current string */
     private char quoteChar;

     /**
      * Parses the markup supplied by the reader and reports it to the handler.
      * The reader is closed before this method returns.
      *
      * @param reader source of the markup
      * @param documentHandler receiver of the parse events
      * @throws IOException if reading fails or the handler reports an error
      */
     public static void parse(final Reader reader, final DocumentHandler documentHandler)
     throws IOException {
         final HtmlParser parser = new HtmlParser(documentHandler);
         parser.parse(reader);
     }

     /**
      * Creates a parser that reports to the given handler.
      *
      * @param documentHandler receiver of the parse events
      */
     private HtmlParser(final DocumentHandler documentHandler) {
         this.documentHandler = documentHandler;
     }

     /**
      * Reads the input chunk by chunk, feeds every chunk to the state machine and
      * finally flushes whatever is still buffered as character data.
      *
      * @param reader source of the markup; always closed on return
      * @throws IOException if reading fails or the handler reports an error
      */
     private void parse(final Reader reader)
     throws IOException {
         try {
             this.documentHandler.onStart();
             final char[] readBuffer = new char[BUF_SIZE];
             int readLen = 0;
             while ( (readLen = reader.read(readBuffer)) > 0 ) {
                 this.update(readBuffer, readLen);
             }
             this.flushBuffer();
             this.documentHandler.onEnd();
         } finally {
             try {
                 reader.close();
             } catch ( final IOException ignored) {
                 // best effort: a failure to close must not hide the parse result
             }
         }
     }

     /**
      * Feed characters to the parser.
      *
      * @param buf character buffer
      * @param len length of affected buffer
      * @throws IOException if the handler reports an error
      */
     private void update(final char[] buf, int len) throws IOException {
         // Index of the first character of buf that has not been handed on yet.
         int start = 0;
         final int end = len;

         for (int curr = start; curr < end; curr++) {
             final char c = buf[curr];

             switch (parseState) {
             case OUTSIDE:
                 if (c == '<') {
                     // Emit the text in front of the potential tag, then start collecting it.
                     if (curr > start) {
                         documentHandler.onCharacters(buf, start, curr - start);
                     }
                     start = curr;
                     parseState = PARSE_STATE.TAG;
                     parseSubState = 0;
                     tagType = TT_MAYBE;
                     resetTagName();
                 } else if (c == '$') {
                     exprType = EXPR_MAYBE;
                     parseState = PARSE_STATE.EXPRESSION;
                 }
                 break;
             case TAG:
                 switch (parseSubState) {
                 case -1:
                     // Markup that is passed on as characters: skip to the closing '>'.
                     if (c == '"' || c == '\'') {
                         quoteChar = c;
                         prevParseState = parseState;
                         parseState = PARSE_STATE.STRING;
                         parseSubState = -1;
                     } else if (c == '>') {
                         parseState = PARSE_STATE.OUTSIDE;
                     }
                     break;
                 case 0:
                     // First character after '<'.
                     if (c == '!') {
                         parseState = PARSE_STATE.COMMENT;
                         parseSubState = 0;
                         tagType = TT_NONE;
                         // keep the accumulated buffer
                     } else if (c == '"' || c == '\'') {
                         quoteChar = c;
                         prevParseState = parseState;
                         parseState = PARSE_STATE.STRING;
                         parseSubState = -1;
                         tagType = TT_NONE;
                         flushBuffer();
                     } else if (c == '>') {
                         parseState = PARSE_STATE.OUTSIDE;
                         tagType = TT_NONE;
                         flushBuffer();
                     } else if (!Character.isWhitespace(c)) {
                         tagNameBuffer.write(c);
                         parseSubState = 1;
                     } else {
                         // '<' followed by whitespace is not a tag.
                         parseSubState = -1;
                         tagType = TT_NONE;
                         flushBuffer();
                     }
                     break;
                 case 1:
                     // Reading the tag name.
                     if (c == '"' || c == '\'') {
                         tagType = TT_TAG;
                         parseSubState = 2;
                         quoteChar = c;
                         prevParseState = parseState;
                         parseState = PARSE_STATE.STRING;
                     } else if (c == '>') {
                         parseState = processTag(buf, start, curr - start + 1) ? PARSE_STATE.SCRIPT : PARSE_STATE.OUTSIDE;
                         start = curr + 1;
                         tagType = TT_NONE;
                         parseSubState = 0;
                     } else if (Character.isWhitespace(c)) {
                         tagType = TT_TAG;
                         parseSubState = 2;
                     } else {
                         tagNameBuffer.write(c);
                     }
                     break;
                 case 2:
                     // Behind the tag name: skip to the closing '>', honouring quoted strings.
                     if (c == '"' || c == '\'') {
                         quoteChar = c;
                         prevParseState = parseState;
                         parseState = PARSE_STATE.STRING;
                     } else if (c == '>') {
                         if (tagType == TT_TAG) {
                             parseState = processTag(buf, start, curr - start + 1) ? PARSE_STATE.SCRIPT : PARSE_STATE.OUTSIDE;
                             start = curr + 1;
                         } else {
                             flushBuffer();
                             parseState = "SCRIPT".equalsIgnoreCase(getTagName()) ? PARSE_STATE.SCRIPT : PARSE_STATE.OUTSIDE;
                         }
                         tagType = TT_NONE;
                         parseSubState = 0;
                     }
                     break;
                 default:
                     break;
                 }
                 break;
             case COMMENT:
                 switch (parseSubState) {
                 case 0:
                 case 1:
                     // Sub-states 0 and 1 expect the two dashes that follow "<!".
                     if (c == '-') {
                         parseSubState++;
                     } else if (c == '"' || c == '\'') {
                         quoteChar = c;
                         prevParseState = PARSE_STATE.TAG;
                         parseState = PARSE_STATE.STRING;
                         parseSubState = -1;
                         tagType = TT_NONE;
                         flushBuffer();
                     } else if (c == '>') {
                         parseState = PARSE_STATE.OUTSIDE;
                         tagType = TT_NONE;
                         flushBuffer();
                     } else {
                         // "<!" without "--" (e.g. a doctype): continue as pass-through markup.
                         parseState = PARSE_STATE.TAG;
                         parseSubState = -1;
                         tagType = TT_NONE;
                         flushBuffer();
                     }
                     break;
                 case 2:
                     // Inside the comment text.
                     if (c == '-') {
                         parseSubState++;
                     }
                     break;
                 case 3:
                     // One '-' seen.
                     if (c == '-') {
                         parseSubState++;
                     } else {
                         parseSubState = 2;
                     }
                     break;
                 case 4:
                     // "--" seen.
                     if (c == '>') {
                         parseState = PARSE_STATE.OUTSIDE;
                         processComment(buf, start, curr - start + 1);
                         start = curr + 1;
                     } else if (c != '-') {
                         parseSubState = 2;
                     }
                     // On a further '-' the last two characters are still "--", so the
                     // sub-state is kept; a comment ending in "--->" terminates this way.
                     break;
                 default:
                     break;
                 }
                 break;

             case SCRIPT:
                 switch (parseSubState) {
                 case 0:
                     // Script text: only '<' is of interest, it may start the end tag.
                     if (c == '<') {
                         if (curr > start) {
                             documentHandler.onCharacters(buf, start, curr - start);
                         }
                         start = curr;
                         tagType = TT_MAYBE;
                         parseSubState++;
                     }
                     break;
                 case 1:
                 case 2:
                 case 3:
                 case 4:
                 case 5:
                 case 6:
                 case 7:
                     // Sub-states 1-7 match "/script" one character at a time, in either case.
                     if (c == SCRIPT_END_LOWER.charAt(parseSubState - 1)
                             || c == SCRIPT_END_UPPER.charAt(parseSubState - 1)) {
                         parseSubState++;
                     } else {
                         // Not the script end tag: what was collected is script text.
                         tagType = TT_NONE;
                         parseSubState = 0;
                         flushBuffer();
                     }
                     break;
                 case 8:
                     // "</script" matched: wait for the closing '>'.
                     if (c == '>') {
                         processTag(buf, start, curr - start + 1);
                         start = curr + 1;
                         tagType = TT_NONE;
                         parseState = PARSE_STATE.OUTSIDE;
                     }
                     break;
                 default:
                     break;
                 }
                 break;

             case STRING:
                 // The sub-state chosen when the string was opened applies again afterwards.
                 if (c == quoteChar) {
                     parseState = prevParseState;
                 }
                 break;

             case EXPRESSION:
                 if (exprType == EXPR_MAYBE && c != '{') {
                     // not a valid expression
                     if (c == '<') {
                         //reset to process tag correctly
                         curr--;
                     }
                     parseState = PARSE_STATE.OUTSIDE;
                 } else if (c == '}') {
                     parseState = PARSE_STATE.OUTSIDE;
                 }
                 exprType = EXPR_NONE;
                 break;
             default:
                 break;
             }
         }
         if (start < end) {
             // Rest of the chunk: plain text is emitted right away, the beginning of a
             // tag or comment is kept until the construct is complete.
             if (tagType == TT_NONE && parseState != PARSE_STATE.COMMENT) {
                 documentHandler.onCharacters(buf, start, end - start);
             } else {
                 buffer.write(buf, start, end - start);
             }
         }
     }

     /**
      * Clears the internal tagname buffer and cache
      */
     private void resetTagName() {
         tagName = null;
         tagNameBuffer.reset();
     }

     /**
      * Returns the tagname scanned. The string is created from the tagname buffer on
      * the first call and cached until {@link #resetTagName()} is called; the buffer
      * itself is left untouched.
      *
      * @return tagname
      */
     private String getTagName() {
         if (tagName == null) {
             tagName = tagNameBuffer.toString();
         }
         return tagName;
     }

     /**
      * Flush internal buffer. Characters still held in the internal buffer are passed
      * to the handler as character data and the buffer is emptied. Does nothing when
      * the buffer is empty.
      *
      * @throws IOException if the handler reports an error
      */
     private void flushBuffer() throws IOException {
         if (buffer.size() > 0) {
             final char[] chars = buffer.toCharArray();
             documentHandler.onCharacters(chars, 0, chars.length);
             buffer.reset();
         }
     }

     /**
      * Process a comment from current and accumulated character data
      *
      * @param ch character data work buffer
      * @param off start offset for current data
      * @param len length of current data
      * @throws IOException if the handler reports an error
      */
     private void processComment(char[] ch, int off, int len) throws IOException {
         buffer.write(ch, off, len);
         documentHandler.onComment(buffer.toString());
         buffer.reset();
     }

     /**
      * Decompose a tag and feed it to the document handler.
      *
      * @param ch
      *            character data
      * @param off
      *            offset where character data starts
      * @param len
      *            length of character data
      * @return <code>true</code> if the tag is a start tag named "script" (in any case)
      *         without an end slash, i.e. script content follows
      * @throws IOException if the handler reports an error
      */
     private boolean processTag(char[] ch, int off, int len) throws IOException {
         buffer.write(ch, off, len);

         final char[] snippet = buffer.toCharArray();

         tokenizer.tokenize(snippet, 0, snippet.length);
         final boolean endTag = tokenizer.endTag();
         if (!endTag) {
             // Locale.ROOT keeps the lookup independent of the default locale
             // (a Turkish locale would lower-case 'I' to a dotless i).
             documentHandler.onStartElement(tokenizer.tagName(), tokenizer.attributes(),
                     tokenizer.endSlash()
                             || VOID_ELEMENTS.contains(tokenizer.tagName().toLowerCase(java.util.Locale.ROOT)));
         } else {
             documentHandler.onEndElement(tokenizer.tagName());
         }

         buffer.reset();
         // An end tag never opens script content, whatever its name is.
         return !endTag && "SCRIPT".equalsIgnoreCase(tokenizer.tagName()) && !tokenizer.endSlash();
     }
 }
	/*******************************************************************************
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	******************************************************************************/
	package org.apache.sling.scripting.sightly.impl.html.dom;

	import java.io.CharArrayWriter;
	import java.io.IOException;
	import java.io.Reader;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.HashSet;
	import java.util.Set;

	/**
	* HTML parser. Invokes a <code>DocumentHandler</code> whenever an event occurs.
	*/
	public final class HtmlParser {

	private static int BUF_SIZE = 2048;

	/** Internal character buffer */
	private final CharArrayWriter buffer = new CharArrayWriter(256);

	/** Tag tokenizer */
	private final TagTokenizer tokenizer = new TagTokenizer();

	/** Tag name buffer */
	private final CharArrayWriter tagNameBuffer = new CharArrayWriter(30);

	/** Tag name */
	private String tagName;

	/** Registered document handler */
	private final DocumentHandler documentHandler;

	private enum PARSE_STATE {
	OUTSIDE,
	TAG,
	SCRIPT,
	COMMENT,
	STRING,
	EXPRESSION
	}

	/** Tag type constant */
	private final static int TT_NONE = 0;

	/** Tag type constant */
	private final static int TT_MAYBE = 1;

	/** Tag type constant */
	private final static int TT_TAG = 2;

	/** Expression state constant */
	private final static int EXPR_NONE = 0;

	/** Expression state constant */
	private final static int EXPR_MAYBE = 1;

	final static Set<String> VOID_ELEMENTS = Collections.unmodifiableSet(new HashSet<>(
	Arrays.asList("area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr")));

	/** Parse state */
	private PARSE_STATE parseState = PARSE_STATE.OUTSIDE;

	/** Parse substate */
	private int parseSubState;

	/** Previous parse state */
	private PARSE_STATE prevParseState;

	/** Current tag type */
	private int tagType;

	/** Expression type */
	private int exprType;

	/** Quote character */
	private char quoteChar;

	public static void parse(final Reader reader, final DocumentHandler documentHandler)
	throws IOException {
	final HtmlParser parser = new HtmlParser(documentHandler);
	parser.parse(reader);
	}

	/**
	* Default constructor.
	*/
	private HtmlParser(final DocumentHandler documentHandler) {
	this.documentHandler = documentHandler;
	}

	private void parse(final Reader reader)
	throws IOException {
	try {
	this.documentHandler.onStart();
	final char[] readBuffer = new char[BUF_SIZE];
	int readLen = 0;
	while ( (readLen = reader.read(readBuffer)) > 0 ) {
	this.update(readBuffer, readLen);
	}
	this.flushBuffer();
	this.documentHandler.onEnd();
	} finally {
	try {
	reader.close();
	} catch ( final IOException ignore) {
	// ignore
	}
	}
	}

	/**
	* Feed characters to the parser.
	*
	* @param buf character buffer
	* @param len length of affected buffer
	*/
	private void update(final char[] buf, int len) throws IOException {
	int start = 0;
	final int end = len;

	for (int curr = start; curr < end; curr++) {
	final char c = buf[curr];

	switch (parseState) {
	case OUTSIDE:
	if (c == '<') {
	if (curr > start) {
	documentHandler.onCharacters(buf, start, curr - start);
	}
	start = curr;
	parseState = PARSE_STATE.TAG;
	parseSubState = 0;
	tagType = TT_MAYBE;
	resetTagName();
	} else if (c == '$') {
	exprType = EXPR_MAYBE;
	parseState = PARSE_STATE.EXPRESSION;
	}
	break;
	case TAG:
	switch (parseSubState) {
	case -1:
	if (c == '"' \|\| c == '\'') {
	quoteChar = c;
	prevParseState = parseState;
	parseState = PARSE_STATE.STRING;
	parseSubState = -1;
	} else if (c == '>') {
	parseState = PARSE_STATE.OUTSIDE;
	}
	break;
	case 0:
	if (c == '!') {
	parseState = PARSE_STATE.COMMENT;
	parseSubState = 0;
	tagType = TT_NONE;
	// keep the accumulated buffer
	} else if (c == '"' \|\| c == '\'') {
	quoteChar = c;
	prevParseState = parseState;
	parseState = PARSE_STATE.STRING;
	parseSubState = -1;
	tagType = TT_NONE;
	flushBuffer();
	} else if (c == '>') {
	parseState = PARSE_STATE.OUTSIDE;
	tagType = TT_NONE;
	flushBuffer();
	} else if (!Character.isWhitespace(c)) {
	tagNameBuffer.write(c);
	parseSubState = 1;
	} else {
	parseSubState = -1;
	tagType = TT_NONE;
	flushBuffer();
	}
	break;
	case 1:
	if (c == '"' \|\| c == '\'') {
	tagType = TT_TAG;
	parseSubState = 2;
	quoteChar = c;
	prevParseState = parseState;
	parseState = PARSE_STATE.STRING;
	} else if (c == '>') {
	parseState = processTag(buf, start, curr - start + 1) ? PARSE_STATE.SCRIPT : PARSE_STATE.OUTSIDE;
	start = curr + 1;
	tagType = TT_NONE;
	parseSubState = 0;
	} else if (Character.isWhitespace(c)) {
	tagType = TT_TAG;
	parseSubState = 2;
	} else {
	tagNameBuffer.write(c);
	}
	break;
	case 2:
	if (c == '"' \|\| c == '\'') {
	quoteChar = c;
	prevParseState = parseState;
	parseState = PARSE_STATE.STRING;
	} else if (c == '>') {
	if (tagType == TT_TAG) {
	parseState = processTag(buf, start, curr - start + 1) ? PARSE_STATE.SCRIPT : PARSE_STATE.OUTSIDE;
	start = curr + 1;
	} else {
	flushBuffer();
	parseState = "SCRIPT".equalsIgnoreCase(getTagName()) ? PARSE_STATE.SCRIPT : PARSE_STATE.OUTSIDE;
	}
	tagType = TT_NONE;
	parseSubState = 0;
	}
	break;
	default:
	break;
	}
	break;
	case COMMENT:
	switch (parseSubState) {
	case 0:
	if (c == '-') {
	parseSubState++;
	} else if (c == '"' \|\| c == '\'') {
	quoteChar = c;
	prevParseState = PARSE_STATE.TAG;
	parseState = PARSE_STATE.STRING;
	parseSubState = -1;
	tagType = TT_NONE;
	flushBuffer();
	} else if (c == '>') {
	parseState = PARSE_STATE.OUTSIDE;
	tagType = TT_NONE;
	flushBuffer();
	} else {
	parseState = PARSE_STATE.TAG;
	parseSubState = -1;
	tagType = TT_NONE;
	flushBuffer();
	}
	break;
	case 1:
	if (c == '-') {
	parseSubState++;
	} else if (c == '"' \|\| c == '\'') {
	quoteChar = c;
	prevParseState = PARSE_STATE.TAG;
	parseState = PARSE_STATE.STRING;
	parseSubState = -1;
	tagType = TT_NONE;
	flushBuffer();
	} else if (c == '>') {
	parseState = PARSE_STATE.OUTSIDE;
	tagType = TT_NONE;
	flushBuffer();
	} else {
	parseState = PARSE_STATE.TAG;
	parseSubState = -1;
	tagType = TT_NONE;
	flushBuffer();
	}
	break;
	case 2:
	if (c == '-') {
	parseSubState++;
	}
	break;
	case 3:
	if (c == '-') {
	parseSubState++;
	} else {
	parseSubState = 2;
	}
	break;
	case 4:
	if (c == '>') {
	parseState = PARSE_STATE.OUTSIDE;
	processComment(buf, start, curr - start + 1);
	start = curr + 1;
	} else {
	parseSubState = 2;
	}
	break;
	default:
	break;
	}
	break;

	case SCRIPT:
	switch (parseSubState) {
	case 0:
	if (c == '<') {
	if (curr > start) {
	documentHandler.onCharacters(buf, start, curr - start);
	}
	start = curr;
	tagType = TT_MAYBE;
	parseSubState++;
	}
	break;
	case 1:
	if (c == '/') {
	parseSubState++;
	} else {
	tagType = TT_NONE;
	parseSubState = 0;
	flushBuffer();
	}
	break;
	case 2:
	if (c == 'S' \|\| c == 's') {
	parseSubState++;
	} else {
	tagType = TT_NONE;
	parseSubState = 0;
	flushBuffer();
	}
	break;
	case 3:
	if (c == 'C' \|\| c == 'c') {
	parseSubState++;
	} else {
	tagType = TT_NONE;
	parseSubState = 0;
	flushBuffer();
	}
	break;
	case 4:
	if (c == 'R' \|\| c == 'r') {
	parseSubState++;
	} else {
	tagType = TT_NONE;
	parseSubState = 0;
	flushBuffer();
	}
	break;
	case 5:
	if (c == 'I' \|\| c == 'i') {
	parseSubState++;
	} else {
	tagType = TT_NONE;
	parseSubState = 0;
	flushBuffer();
	}
	break;
	case 6:
	if (c == 'P' \|\| c == 'p') {
	parseSubState++;
	} else {
	tagType = TT_NONE;
	parseSubState = 0;
	flushBuffer();
	}
	break;
	case 7:
	if (c == 'T' \|\| c == 't') {
	parseSubState++;
	} else {
	tagType = TT_NONE;
	parseSubState = 0;
	flushBuffer();
	}
	break;
	case 8:
	if (c == '>') {
	processTag(buf, start, curr - start + 1);
	start = curr + 1;
	tagType = TT_NONE;
	parseState = PARSE_STATE.OUTSIDE;
	}
	break;
	default:
	break;
	}
	break;

	case STRING:
	if (c == quoteChar) {
	parseState = prevParseState;
	}
	break;

	case EXPRESSION:
	if (exprType == EXPR_MAYBE && c != '{') {
	// not a valid expression
	if (c == '<') {
	//reset to process tag correctly
	curr--;
	}
	parseState = PARSE_STATE.OUTSIDE;
	} else if (c == '}') {
	parseState = PARSE_STATE.OUTSIDE;
	}
	exprType = EXPR_NONE;
	break;
	default:
	break;
	}
	}
	if (start < end) {
	if (tagType == TT_NONE && parseState != PARSE_STATE.COMMENT) {
	documentHandler.onCharacters(buf, start, end - start);
	} else {
	buffer.write(buf, start, end - start);
	}
	}
	}

	/**
	* Clears the internal tagname buffer and cache
	*/
	private void resetTagName() {
	tagName = null;
	tagNameBuffer.reset();
	}

	/**
	* Returns the tagname scanned and resets the internal tagname buffer
	*
	* @return tagname
	*/
	private String getTagName() {
	if (tagName == null) {
	tagName = tagNameBuffer.toString();
	}
	return tagName;
	}

	/**
	* Flush internal buffer. This forces the parser to flush the characters
	* still held in its internal buffer, if the parsing state allows.
	*/
	private void flushBuffer() throws IOException {
	if (buffer.size() > 0) {
	final char[] chars = buffer.toCharArray();
	documentHandler.onCharacters(chars, 0, chars.length);
	buffer.reset();
	}
	}

	/**
	* Process a comment from current and accumulated character data
	*
	* @param ch character data work buffer
	* @param off start offset for current data
	* @param len length of current data
	* @throws IOException
	*/
	private void processComment(char[] ch, int off, int len) throws IOException {
	buffer.write(ch, off, len);
	documentHandler.onComment(buffer.toString());
	buffer.reset();
	}

	/**
	* Decompose a tag and feed it to the document handler.
	*
	* @param ch
	* character data
	* @param off
	* offset where character data starts
	* @param len
	* length of character data
	*/
	private boolean processTag(char[] ch, int off, int len) throws IOException {
	buffer.write(ch, off, len);

	final char[] snippet = buffer.toCharArray();

	tokenizer.tokenize(snippet, 0, snippet.length);
	if (!tokenizer.endTag()) {
	documentHandler.onStartElement(tokenizer.tagName(), tokenizer.attributes(),
	tokenizer.endSlash() \|\| VOID_ELEMENTS.contains(tokenizer.tagName().toLowerCase()));
	} else {
	documentHandler.onEndElement(tokenizer.tagName());
	}

	buffer.reset();
	return "SCRIPT".equalsIgnoreCase(tokenizer.tagName()) && !tokenizer.endSlash();
	}
	}