/*******************************************************************************
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 ******************************************************************************/
package org.apache.sling.scripting.sightly.impl.html.dom;

import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;

/**
 * HTML parser. Invokes a <code>DocumentHandler</code> whenever an event occurs.
 * <p>
 * The input is read in chunks and scanned character by character by a small state machine
 * (a <code>PARSE_STATE</code> plus a numeric substate). Text that cannot be reported yet because
 * a tag or comment is still incomplete at the end of a chunk is kept in an internal buffer
 * until the construct is complete.
 * <p>
 * A new parser instance is created for every call of {@link #parse(Reader, DocumentHandler)}.
 */
public final class HtmlParser {

    /** Size (in characters) of the chunks read from the input */
    private static final int BUF_SIZE = 2048;

    /** Lower case characters expected after the '&lt;' of the end tag of a script element */
    private static final String SCRIPT_END_LOWER = "/script";

    /** Upper case characters expected after the '&lt;' of the end tag of a script element */
    private static final String SCRIPT_END_UPPER = "/SCRIPT";

    /** Internal character buffer */
    private final CharArrayWriter buffer = new CharArrayWriter(256);

    /** Tag tokenizer */
    private final TagTokenizer tokenizer = new TagTokenizer();

    /** Tag name buffer */
    private final CharArrayWriter tagNameBuffer = new CharArrayWriter(30);

    /** Tag name */
    private String tagName;

    /** Registered document handler */
    private final DocumentHandler documentHandler;

    /** Main states of the scanner */
    private enum PARSE_STATE {
        /** Plain character data, outside of any markup */
        OUTSIDE,
        /** After a '&lt;', inside a (potential) tag */
        TAG,
        /** After a script start tag; only the script end tag is recognized */
        SCRIPT,
        /** After '&lt;!', inside a (potential) comment */
        COMMENT,
        /** Inside a quoted string that was opened within a tag */
        STRING,
        /** After a '$', inside a (potential) expression */
        EXPRESSION
    }

    /** Tag type constant */
    private static final int TT_NONE = 0;

    /** Tag type constant */
    private static final int TT_MAYBE = 1;

    /** Tag type constant */
    private static final int TT_TAG = 2;

    /** Expression state constant */
    private static final int EXPR_NONE = 0;

    /** Expression state constant */
    private static final int EXPR_MAYBE = 1;

    /** Lower case names of the elements that are always reported as closed by their start tag */
    static final Set<String> VOID_ELEMENTS = Collections.unmodifiableSet(new HashSet<>(
            Arrays.asList("area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr")));

    /** Parse state */
    private PARSE_STATE parseState = PARSE_STATE.OUTSIDE;

    /** Parse substate */
    private int parseSubState;

    /** Previous parse state */
    private PARSE_STATE prevParseState;

    /** Current tag type */
    private int tagType;

    /** Expression type */
    private int exprType;

    /** Quote character */
    private char quoteChar;

    /**
     * Parse the characters provided by the reader and report all events to the handler.
     * The reader is closed when parsing is finished, also if parsing fails.
     *
     * @param reader source of the characters to parse
     * @param documentHandler handler receiving the parse events
     * @throws IOException if reading fails or an invoked method signals an I/O problem
     */
    public static void parse(final Reader reader, final DocumentHandler documentHandler)
    throws IOException {
        final HtmlParser parser = new HtmlParser(documentHandler);
        parser.parse(reader);
    }

    /**
     * Create a parser reporting to the given handler.
     *
     * @param documentHandler handler receiving the parse events
     */
    private HtmlParser(final DocumentHandler documentHandler) {
        this.documentHandler = documentHandler;
    }

    /**
     * Read the input chunk by chunk, feed every chunk to the state machine and
     * finally flush whatever is still held in the internal buffer.
     *
     * @param reader source of the characters to parse, closed on return
     * @throws IOException if reading fails or an invoked method signals an I/O problem
     */
    private void parse(final Reader reader)
    throws IOException {
        try {
            this.documentHandler.onStart();
            final char[] readBuffer = new char[BUF_SIZE];
            int readLen = 0;
            while ( (readLen = reader.read(readBuffer)) > 0 ) {
                this.update(readBuffer, readLen);
            }
            this.flushBuffer();
            this.documentHandler.onEnd();
        } finally {
            try {
                reader.close();
            } catch ( final IOException ignored) {
                // a failure to close the reader is deliberately not reported
            }
        }
    }

    /**
     * Feed characters to the parser.
     * <p>
     * <code>start</code> marks the first character of the chunk that has not been handed
     * over to the handler (or the tokenizer) yet, <code>curr</code> is the character currently
     * inspected.
     *
     * @param buf character buffer
     * @param len length of affected buffer
     * @throws IOException if an invoked method signals an I/O problem
     */
    private void update(final char[] buf, int len) throws IOException {
        int start = 0;
        final int end = len;

        for (int curr = start; curr < end; curr++) {
            final char c = buf[curr];

            switch (parseState) {
            case OUTSIDE:
                if (c == '<') {
                    // report the text in front of the potential tag
                    if (curr > start) {
                        documentHandler.onCharacters(buf, start, curr - start);
                    }
                    start = curr;
                    parseState = PARSE_STATE.TAG;
                    parseSubState = 0;
                    tagType = TT_MAYBE;
                    resetTagName();
                } else if (c == '$') {
                    exprType = EXPR_MAYBE;
                    parseState = PARSE_STATE.EXPRESSION;
                }
                break;
            case TAG:
                switch (parseSubState) {
                case -1:
                    // markup that is not treated as a tag: skip up to the closing '>'
                    if (c == '"' || c == '\'') {
                        quoteChar = c;
                        prevParseState = parseState;
                        parseState = PARSE_STATE.STRING;
                        parseSubState = -1;
                    } else if (c == '>') {
                        parseState = PARSE_STATE.OUTSIDE;
                    }
                    break;
                case 0:
                    // first character after the '<'
                    if (c == '!') {
                        parseState = PARSE_STATE.COMMENT;
                        parseSubState = 0;
                        tagType = TT_NONE;
                        // keep the accumulated buffer
                    } else if (c == '"' || c == '\'') {
                        quoteChar = c;
                        prevParseState = parseState;
                        parseState = PARSE_STATE.STRING;
                        parseSubState = -1;
                        tagType = TT_NONE;
                        flushBuffer();
                    } else if (c == '>') {
                        parseState = PARSE_STATE.OUTSIDE;
                        tagType = TT_NONE;
                        flushBuffer();
                    } else if (!Character.isWhitespace(c)) {
                        tagNameBuffer.write(c);
                        parseSubState = 1;
                    } else {
                        parseSubState = -1;
                        tagType = TT_NONE;
                        flushBuffer();
                    }
                    break;
                case 1:
                    // collecting the tag name
                    if (c == '"' || c == '\'') {
                        tagType = TT_TAG;
                        parseSubState = 2;
                        quoteChar = c;
                        prevParseState = parseState;
                        parseState = PARSE_STATE.STRING;
                    } else if (c == '>') {
                        parseState = processTag(buf, start, curr - start + 1) ? PARSE_STATE.SCRIPT : PARSE_STATE.OUTSIDE;
                        start = curr + 1;
                        tagType = TT_NONE;
                        parseSubState = 0;
                    } else if (Character.isWhitespace(c)) {
                        tagType = TT_TAG;
                        parseSubState = 2;
                    } else {
                        tagNameBuffer.write(c);
                    }
                    break;
                case 2:
                    // behind the tag name, waiting for the closing '>'
                    if (c == '"' || c == '\'') {
                        quoteChar = c;
                        prevParseState = parseState;
                        parseState = PARSE_STATE.STRING;
                    } else if (c == '>') {
                        if (tagType == TT_TAG) {
                            parseState = processTag(buf, start, curr - start + 1) ? PARSE_STATE.SCRIPT : PARSE_STATE.OUTSIDE;
                            start = curr + 1;
                        } else {
                            flushBuffer();
                            parseState = "SCRIPT".equalsIgnoreCase(getTagName()) ? PARSE_STATE.SCRIPT : PARSE_STATE.OUTSIDE;
                        }
                        tagType = TT_NONE;
                        parseSubState = 0;
                    }
                    break;
                default:
                    break;
                }
                break;
            case COMMENT:
                switch (parseSubState) {
                case 0:
                case 1:
                    // expecting the first (substate 0) or second (substate 1) '-' of "<!--";
                    // anything else means that this is not a comment
                    if (c == '-') {
                        parseSubState++;
                    } else if (c == '"' || c == '\'') {
                        quoteChar = c;
                        prevParseState = PARSE_STATE.TAG;
                        parseState = PARSE_STATE.STRING;
                        parseSubState = -1;
                        tagType = TT_NONE;
                        flushBuffer();
                    } else if (c == '>') {
                        parseState = PARSE_STATE.OUTSIDE;
                        tagType = TT_NONE;
                        flushBuffer();
                    } else {
                        parseState = PARSE_STATE.TAG;
                        parseSubState = -1;
                        tagType = TT_NONE;
                        flushBuffer();
                    }
                    break;
                case 2:
                    // inside the comment text, looking for the first '-' of "-->"
                    if (c == '-') {
                        parseSubState++;
                    }
                    break;
                case 3:
                    // one '-' seen
                    if (c == '-') {
                        parseSubState++;
                    } else {
                        parseSubState = 2;
                    }
                    break;
                case 4:
                    // "--" seen
                    if (c == '>') {
                        parseState = PARSE_STATE.OUTSIDE;
                        processComment(buf, start, curr - start + 1);
                        start = curr + 1;
                    } else if (c != '-') {
                        parseSubState = 2;
                    }
                    // on a further '-' the last two characters are still "--": stay in this
                    // substate so that a comment ending with "--->" is closed as well
                    break;
                default:
                    break;
                }
                break;

            case SCRIPT:
                switch (parseSubState) {
                case 0:
                    // script text, looking for the '<' of the script end tag
                    if (c == '<') {
                        if (curr > start) {
                            documentHandler.onCharacters(buf, start, curr - start);
                        }
                        start = curr;
                        tagType = TT_MAYBE;
                        parseSubState++;
                    }
                    break;
                case 1:
                case 2:
                case 3:
                case 4:
                case 5:
                case 6:
                case 7:
                    // matching "/script" (in any case) character by character
                    if (isScriptEndTagChar(parseSubState, c)) {
                        parseSubState++;
                    } else {
                        // not the script end tag, continue with script text
                        tagType = TT_NONE;
                        parseSubState = 0;
                        flushBuffer();
                    }
                    break;
                case 8:
                    // "</script" seen, waiting for the closing '>'
                    if (c == '>') {
                        processTag(buf, start, curr - start + 1);
                        start = curr + 1;
                        tagType = TT_NONE;
                        parseState = PARSE_STATE.OUTSIDE;
                    }
                    break;
                default:
                    break;
                }
                break;

            case STRING:
                // the string ends with the same quote character it was opened with
                if (c == quoteChar) {
                    parseState = prevParseState;
                }
                break;

            case EXPRESSION:
                if (exprType == EXPR_MAYBE && c != '{') {
                    // not a valid expression
                    if (c == '<') {
                        //reset to process tag correctly
                        curr--;
                    }
                    parseState = PARSE_STATE.OUTSIDE;
                } else if (c == '}') {
                    parseState = PARSE_STATE.OUTSIDE;
                }
                exprType = EXPR_NONE;
                break;
            default:
                break;
            }
        }
        // handle the remainder of the chunk: report it directly as characters or, if it
        // belongs to a pending tag or comment, keep it until that construct is complete
        if (start < end) {
            if (tagType == TT_NONE && parseState != PARSE_STATE.COMMENT) {
                documentHandler.onCharacters(buf, start, end - start);
            } else {
                buffer.write(buf, start, end - start);
            }
        }
    }

    /**
     * Check whether a character continues the end tag of a script element.
     *
     * @param subState substate of the <code>SCRIPT</code> state, must be in the range 1 to 7
     * @param c character to check
     * @return <code>true</code> if the character is the one expected in the given substate:
     *         '/' in substate 1, followed by the letters of "script" in upper or lower case
     */
    private static boolean isScriptEndTagChar(final int subState, final char c) {
        final int index = subState - 1;
        return c == SCRIPT_END_LOWER.charAt(index) || c == SCRIPT_END_UPPER.charAt(index);
    }

    /**
     * Clears the internal tagname buffer and cache
     */
    private void resetTagName() {
        tagName = null;
        tagNameBuffer.reset();
    }

    /**
     * Returns the tagname scanned. The value is created from the internal tagname
     * buffer on first access and cached until {@link #resetTagName()} is called.
     *
     * @return tagname
     */
    private String getTagName() {
        if (tagName == null) {
            tagName = tagNameBuffer.toString();
        }
        return tagName;
    }

    /**
     * Flush internal buffer. Characters still held in the internal buffer are
     * reported to the document handler as character data and the buffer is cleared.
     *
     * @throws IOException if the document handler signals an I/O problem
     */
    private void flushBuffer() throws IOException {
        if (buffer.size() > 0) {
            final char[] chars = buffer.toCharArray();
            documentHandler.onCharacters(chars, 0, chars.length);
            buffer.reset();
        }
    }

    /**
     * Process a comment from current and accumulated character data
     *
     * @param ch character data work buffer
     * @param off start offset for current data
     * @param len length of current data
     * @throws IOException if the document handler signals an I/O problem
     */
    private void processComment(char[] ch, int off, int len) throws IOException {
        buffer.write(ch, off, len);
        documentHandler.onComment(buffer.toString());
        buffer.reset();
    }

    /**
     * Decompose a tag and feed it to the document handler. The tag consists of the
     * accumulated character data followed by the given current data.
     *
     * @param ch
     *            character data
     * @param off
     *            offset where character data starts
     * @param len
     *            length of character data
     * @return <code>true</code> if the tag is named "script" (ignoring case) and has
     *         no end slash
     * @throws IOException if the document handler signals an I/O problem
     */
    private boolean processTag(char[] ch, int off, int len) throws IOException {
        buffer.write(ch, off, len);

        final char[] snippet = buffer.toCharArray();

        tokenizer.tokenize(snippet, 0, snippet.length);
        if (!tokenizer.endTag()) {
            // lower case with a fixed locale: with the default locale the lookup of e.g. "INPUT"
            // or "IMG" fails if the default locale has special case mappings (Turkish dotless i)
            documentHandler.onStartElement(tokenizer.tagName(), tokenizer.attributes(),
                    tokenizer.endSlash() || VOID_ELEMENTS.contains(tokenizer.tagName().toLowerCase(Locale.ROOT)));
        } else {
            documentHandler.onEndElement(tokenizer.tagName());
        }

        buffer.reset();
        return "SCRIPT".equalsIgnoreCase(tokenizer.tagName()) && !tokenizer.endSlash();
    }
}
