| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| package org.netbeans.modules.html.editor.lib; |
| |
| import java.util.ArrayList; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.concurrent.atomic.AtomicReference; |
| import java.util.logging.Level; |
| import java.util.logging.Logger; |
| import org.netbeans.api.html.lexer.HTMLTokenId; |
| import org.netbeans.api.lexer.Token; |
| import org.netbeans.api.lexer.TokenSequence; |
| import org.netbeans.modules.html.editor.lib.api.ProblemDescription; |
| import org.netbeans.modules.html.editor.lib.api.elements.Attribute; |
| import org.netbeans.modules.html.editor.lib.api.elements.Element; |
| import org.netbeans.modules.html.editor.lib.plain.*; |
| import org.netbeans.modules.web.common.api.LexerUtils; |
| |
| /** |
| * Html syntax analyzer/plain parser |
| * |
| * @author mfukala@netbeans.org |
| */ |
| public class ElementsParser implements Iterator<Element> { |
| |
| //parser state |
| private int state; |
| //parser state constants |
| private static final int S_INIT = 0; |
| private static final int S_TAG_OPEN_SYMBOL = 1; |
| private static final int S_TAG = 2; |
| private static final int S_TAG_ATTR = 3; |
| private static final int S_TAG_VALUE = 4; |
| private static final int S_COMMENT = 5; |
| private static final int S_DECLARATION = 6; |
| private static final int S_DOCTYPE_DECLARATION = 7; |
| private static final int S_DOCTYPE_AFTER_ROOT_ELEMENT = 8; |
| private static final int S_DOCTYPE_PUBLIC_ID = 9; |
| private static final int S_DOCTYPE_FILE = 10; |
| private static final int S_TEXT = 11; |
| private static final int S_TAG_AFTER_NAME = 12; |
| //eof parser state constants |
| public static final String UNEXPECTED_SYMBOL_IN_OPEN_TAG = "unexpected_symbol_in_open_tag"; //NOI18N |
| private CharSequence sourceCode; |
| private TokenSequence<HTMLTokenId> ts; |
| //inner parsing states |
| private Token<HTMLTokenId> token; |
| private int start; |
| private boolean openTag = true; |
| private String tagName; |
| private TokenInfo attrib; |
| private List<TokenInfo> attr_keys; |
| private List<List<TokenInfo>> attr_values; |
| private Element current; |
| private boolean eof; |
| private AtomicReference<Element> lastFoundElement; |
| private String root_element, doctype_public_id, doctype_file, doctype_name; |
| |
/**
 * Creates a parser in its initial state reading from the given token sequence.
 * The {@link TokenSequence} needs to be properly positioned by the caller.
 *
 * @param sourceCode the source text passed on to the created elements
 * @param tokenSequence the already positioned token sequence to read from
 */
private ElementsParser(CharSequence sourceCode, TokenSequence<HTMLTokenId> tokenSequence) {
    this.ts = tokenSequence;
    this.sourceCode = sourceCode;

    //nothing has been parsed yet
    this.eof = false;
    this.start = -1;
    this.state = S_INIT;

    //attribute names and their values are collected in two parallel lists
    this.attr_values = new ArrayList<>();
    this.attr_keys = new ArrayList<>();
}
| |
| public static ElementsParser forOffset(CharSequence sourceCode, TokenSequence<HTMLTokenId> tokenSequence, int position) { |
| if (position < 0) { |
| throw new IllegalArgumentException(String.format("Position (%s) must be positive", position)); |
| } |
| |
| int diff = tokenSequence.move(position); |
| if (diff != 0) { |
| throw new IllegalArgumentException(String.format("Parser must be started " |
| + "at a token beginning, not in the middle (position=%s, token diff=%s, token=%s)", |
| position, diff, (tokenSequence.moveNext() ? tokenSequence.token() : null))); //NOI18N |
| } |
| return new ElementsParser(sourceCode, tokenSequence); |
| } |
| |
| public static ElementsParser forTokenIndex(CharSequence sourceCode, TokenSequence<HTMLTokenId> tokenSequence, int tokenIndex) { |
| if (tokenIndex < 0) { |
| throw new IllegalArgumentException(String.format("TokenSequence index (%s) must be positive", tokenIndex)); |
| } |
| tokenSequence.moveEnd(); |
| int lastTokenIndex = tokenSequence.index(); |
| if(tokenIndex > lastTokenIndex) { |
| throw new IllegalArgumentException(String.format("token index (%s) is bigger than last index in the sequence (%s)", tokenIndex, lastTokenIndex)); |
| } |
| tokenSequence.moveIndex(tokenIndex); |
| return new ElementsParser(sourceCode, tokenSequence); |
| } |
| |
| @Override |
| public boolean hasNext() { |
| if (lastFoundElement == null) { |
| lastFoundElement = new AtomicReference<>(findNextElement()); |
| } |
| return lastFoundElement.get() != null; |
| } |
| |
| @Override |
| public Element next() { |
| if (!hasNext()) { |
| throw new IllegalStateException("No such element"); |
| } |
| Element element = lastFoundElement.get(); |
| lastFoundElement = null; |
| return element; |
| } |
| |
| @Override |
| public void remove() { |
| //no-op |
| } |
| |
| //---------------------------- private methods ----------------------------- |
| private void error() { |
| current = new ErrorElement(sourceCode, |
| start, |
| (short) (ts.offset() + ts.token().length() - start)); |
| } |
| |
| private void text() { |
| current = new TextElement(start, ts.offset() + ts.token().length()); |
| } |
| |
| private void entityReference() { |
| current = new EntityReferenceElement(sourceCode, |
| start, |
| (short) (ts.offset() + ts.token().length() - start)); |
| |
| } |
| |
| private void comment() { |
| current = new CommentElement(sourceCode, |
| start, |
| ts.offset() + ts.token().length() - start); |
| } |
| |
| private void declaration() { |
| current = new DeclarationElement(sourceCode, |
| start, |
| (short) (ts.offset() + ts.token().length() - start), |
| root_element, |
| doctype_public_id, |
| doctype_file, |
| doctype_name); |
| } |
| |
| private void tag(boolean emptyTag) { |
| tag(emptyTag, null); |
| } |
| |
| private void tag(boolean emptyTag, ProblemDescription problem) { |
| List<Attribute> attributes = new ArrayList<>(1); //use small initial capacity since typically there are one or two attribs (if any) |
| for (int i = 0; i < attr_keys.size(); i++) { |
| TokenInfo key = attr_keys.get(i); |
| List<TokenInfo> values = attr_values.get(i); |
| StringBuilder joinedValue = new StringBuilder(); |
| |
| if (values == null) { |
| //attribute has no value |
| assert key.token.length() < Short.MAX_VALUE; |
| Attribute ta = new AttributeElement( |
| sourceCode, |
| key.offset, |
| (short) key.token.length()); |
| attributes.add(ta); |
| } else { |
| if (values.size() == 1) { |
| //one part value |
| TokenInfo ti = values.get(0); |
| |
| assert key.token.length() < Short.MAX_VALUE; |
| Attribute ta = new AttributeElement( |
| sourceCode, |
| key.offset, |
| ti.offset, |
| (short) key.token.length(), |
| ti.token.length()); |
| |
| attributes.add(ta); |
| |
| } else { |
| //multipart value |
| for (TokenInfo t : values) { |
| joinedValue.append(t.token.text()); |
| } |
| |
| TokenInfo firstValuePart = values.get(0); |
| TokenInfo lastValuePart = values.get(values.size() - 1); |
| |
| Attribute ta = new AttributeElement.AttributeElementWithJoinedValue( |
| sourceCode, |
| key.offset, |
| (short) key.token.length(), |
| firstValuePart.offset, |
| joinedValue.toString().intern()); |
| |
| attributes.add(ta); |
| } |
| } |
| } |
| |
| //Bug 220775 - AssertionError: element length must be positive! debug>>> |
| if (start == -1) { |
| throw new IllegalStateException(getCodeSnippet()); |
| } |
| int len = ts.offset() + ts.token().length() - start; |
| if (len <= 0) { |
| throw new IllegalStateException(getCodeSnippet()); |
| } |
| //<<< |
| |
| if (openTag) { |
| |
| if (attributes.isEmpty()) { |
| //no attributes |
| if (problem == null) { |
| current = new AttributelessOpenTagElement( |
| sourceCode, |
| start, |
| (short) len, |
| (byte) tagName.length(), |
| emptyTag); |
| } else { |
| current = new ProblematicAttributelessOpenTagElement( |
| sourceCode, |
| start, |
| (short) len, |
| (byte) tagName.length(), |
| emptyTag, |
| problem); |
| |
| } |
| } else { |
| //attributes |
| if (problem == null) { |
| //open tag w/o error |
| if (len > Short.MAX_VALUE) { |
| //unusually long element |
| current = new LongOpenTagElement( |
| sourceCode, |
| start, |
| len, |
| (byte) tagName.length(), |
| attributes, |
| emptyTag); |
| } else { |
| current = new OpenTagElement( |
| sourceCode, |
| start, |
| (short) len, |
| (byte) tagName.length(), |
| attributes, |
| emptyTag); |
| } |
| } else { |
| //open tag w/ error |
| //note: the ProblematicOpenTagElement also extends LongOpenTagElement |
| current = new ProblematicOpenTagElement( |
| sourceCode, |
| start, |
| (short) len, |
| (byte) tagName.length(), |
| attributes, |
| emptyTag, |
| problem); |
| |
| } |
| } |
| } else { |
| current = new EndTagElement( |
| sourceCode, |
| start, |
| (short) len, |
| (byte) tagName.length()); |
| } |
| |
| tagName = null; |
| attrib = null; |
| attr_keys = new ArrayList<>(); |
| attr_values = new ArrayList<>(); |
| } |
| private static final int SNIPPET_LEN = 100; |
| |
| private String getCodeSnippet() { |
| int offset = ts.offset(); |
| int from = Math.max(0, offset - (SNIPPET_LEN / 2)); |
| int to = Math.min(sourceCode.length(), offset + (SNIPPET_LEN / 2)); |
| return sourceCode.subSequence(from, to).toString(); |
| } |
| |
| //an error inside a tag, at least the tag name is known |
| private void tag_with_error(ProblemDescription problem) { |
| //lets put back the errorneous symbol first |
| backup(1); |
| //make the tag, we do not know if empty or not |
| tag(false, problem); |
| |
| state = S_INIT; |
| start = -1; |
| } |
| |
| //recover from error |
| private void reset() { |
| backup(1); |
| //create error element excluding the last token caused the error |
| error(); |
| state = S_INIT; |
| start = -1; |
| } |
| |
| private void backup(int tokens) { |
| for (int i = 0; i < tokens; i++) { |
| ts.movePrevious(); |
| token = ts.token(); |
| } |
| } |
| |
| private Element findNextElement() { |
| Element element = null; |
| //parse tokens until a syntaxelement is found |
| while (!eof && (element = processNextToken()) == null) { |
| //no-op |
| } |
| return element; |
| } |
| |
| private Element processNextToken() { |
| current = null; |
| |
| if (!ts.moveNext()) { |
| //eof |
| handleEOF(); //may possibly set current element |
| eof = true; //finish the parsing cycle |
| return current; |
| } |
| |
| int offset = ts.offset(); |
| token = ts.token(); |
| HTMLTokenId id = token.id(); |
| |
| switch (state) { |
| case S_INIT: |
| switch (id) { |
| case CHARACTER: |
| start = ts.offset(); |
| entityReference(); |
| state = S_INIT; |
| start = -1; |
| break; |
| case TAG_OPEN_SYMBOL: |
| start = ts.offset(); |
| state = S_TAG_OPEN_SYMBOL; |
| break; |
| case BLOCK_COMMENT: |
| start = ts.offset(); |
| state = S_COMMENT; |
| break; |
| case DECLARATION: |
| start = ts.offset(); |
| if (LexerUtils.equals("<!doctype", token.text(), true, true)) { //NOI18N |
| root_element = null; |
| doctype_public_id = null; |
| doctype_file = null; |
| state = S_DOCTYPE_DECLARATION; |
| } else { |
| state = S_DECLARATION; |
| } |
| doctype_name = token.text().subSequence(2, token.text().length()).toString(); //strip off the <! chars |
| break; |
| default: |
| //everything else is just a text |
| start = ts.offset(); |
| state = S_TEXT; |
| break; |
| } |
| break; |
| |
| case S_TEXT: |
| switch (id) { |
| case TEXT: |
| break; |
| default: |
| backup(1); |
| text(); |
| state = S_INIT; |
| start = -1; |
| break; |
| } |
| break; |
| |
| case S_TAG_OPEN_SYMBOL: |
| switch (id) { |
| case TAG_OPEN: |
| state = S_TAG_AFTER_NAME; |
| openTag = true; |
| tagName = token.text().toString(); |
| break; |
| case TAG_CLOSE: |
| state = S_TAG_AFTER_NAME; |
| openTag = false; |
| tagName = token.text().toString(); |
| break; |
| default: |
| reset(); //error |
| break; |
| } |
| break; |
| |
| case S_TAG_AFTER_NAME: |
| //just switch to 'in tag state' |
| backup(1); |
| state = S_TAG; |
| break; |
| |
| case S_TAG: |
| switch (id) { |
| case WS: |
| case EOL: |
| case ERROR: |
| break; |
| case ARGUMENT: |
| state = S_TAG_ATTR; |
| attrib = tokenInfo(); |
| break; |
| case TAG_CLOSE_SYMBOL: |
| boolean emptyTag = "/>".equals(token.text().toString()); |
| tag(emptyTag); |
| state = S_INIT; |
| start = -1; |
| break; |
| default: |
| tag_with_error( |
| ProblemDescription.create(UNEXPECTED_SYMBOL_IN_OPEN_TAG, |
| String.format("Unexpected symbol '%s' found in the open tag", token.text()), |
| ProblemDescription.ERROR, |
| offset, |
| offset + token.length())); |
| break; |
| } |
| break; |
| |
| |
| case S_TAG_ATTR: |
| switch (id) { |
| case OPERATOR: |
| case WS: |
| break; |
| case VALUE: |
| case VALUE_JAVASCRIPT: |
| case VALUE_CSS: |
| backup(1); //backup the value |
| state = S_TAG_VALUE; |
| break; |
| case ARGUMENT: |
| case TAG_CLOSE_SYMBOL: |
| //attribute without value |
| attr_keys.add(attrib); |
| attr_values.add(null); |
| state = S_TAG; |
| backup(1); |
| break; |
| default: |
| tag_with_error( |
| ProblemDescription.create(UNEXPECTED_SYMBOL_IN_OPEN_TAG, |
| String.format("Unexpected symbol '%s' found in the open tag", token.text()), |
| ProblemDescription.ERROR, |
| offset, |
| offset + token.length())); |
| break; |
| } |
| break; |
| |
| case S_TAG_VALUE: |
| switch (id) { |
| case VALUE: |
| case VALUE_JAVASCRIPT: |
| case VALUE_CSS: |
| case EL_OPEN_DELIMITER: |
| case EL_CONTENT: |
| case EL_CLOSE_DELIMITER: |
| int index = attr_keys.indexOf(attrib); |
| if (index == -1) { |
| List<TokenInfo> values = new ArrayList<>(); |
| values.add(tokenInfo()); |
| attr_keys.add(attrib); |
| attr_values.add(values); |
| } else { |
| List<TokenInfo> valueParts = attr_values.get(index); |
| //http://statistics.netbeans.org/exceptions/messageslog?id=679650 |
| //NPE might happen as attr_values.get(index) might return null |
| //I cannot see the code path which leads to this so adding a silly NPE check |
| if(valueParts != null) { |
| valueParts.add(tokenInfo()); |
| } |
| } |
| |
| break; |
| case ERROR: |
| tag_with_error( |
| ProblemDescription.create(UNEXPECTED_SYMBOL_IN_OPEN_TAG, |
| String.format("Unexpected symbol '%s' found in the open tag", token.text()), |
| ProblemDescription.ERROR, |
| offset, |
| offset + token.length())); |
| break; |
| default: |
| backup(1); |
| state = S_TAG; |
| break; |
| } |
| break; |
| |
| case S_COMMENT: |
| switch (id) { |
| case BLOCK_COMMENT: |
| case EOL: |
| case WS: |
| break; |
| default: |
| backup(1); |
| comment(); |
| state = S_INIT; |
| start = -1; |
| break; |
| } |
| break; |
| |
| case S_DECLARATION: |
| switch (id) { |
| case DECLARATION: |
| case SGML_COMMENT: |
| case EOL: |
| case WS: |
| break; |
| default: |
| backup(1); |
| declaration(); |
| state = S_INIT; |
| start = -1; |
| break; |
| } |
| break; |
| |
| case S_DOCTYPE_DECLARATION: |
| switch (id) { |
| case DECLARATION: |
| root_element = token.text().toString(); |
| state = S_DOCTYPE_AFTER_ROOT_ELEMENT; |
| break; |
| case SGML_COMMENT: |
| case EOL: |
| case WS: |
| break; |
| default: |
| backup(1); |
| declaration(); |
| state = S_INIT; |
| start = -1; |
| break; |
| } |
| break; |
| |
| case S_DOCTYPE_AFTER_ROOT_ELEMENT: |
| switch (id) { |
| case DECLARATION: |
| if (LexerUtils.equals("public", token.text(), true, true)) { //NOI18N |
| doctype_public_id = new String(); |
| state = S_DOCTYPE_PUBLIC_ID; |
| break; |
| } else if (LexerUtils.equals("system", token.text(), true, true)) { //NOI18N |
| state = S_DOCTYPE_FILE; |
| doctype_file = new String(); |
| break; |
| } else if (token.text().charAt(0) == '>') { |
| declaration(); |
| state = S_INIT; |
| start = -1; |
| } |
| break; |
| case SGML_COMMENT: |
| case EOL: |
| case WS: |
| break; |
| default: |
| backup(1); |
| declaration(); |
| state = S_INIT; |
| start = -1; |
| break; |
| } |
| break; |
| |
| case S_DOCTYPE_PUBLIC_ID: |
| switch (id) { |
| case WS: |
| case DECLARATION: |
| String tokenText = token.text().toString(); |
| if (tokenText.startsWith("\"")) { |
| //first token |
| tokenText = tokenText.substring(1); //cut off the quotation mark |
| } |
| if (tokenText.endsWith("\"")) { |
| //last token |
| tokenText = tokenText.substring(0, tokenText.length() - 1); //cut off the quotation mark |
| doctype_public_id += tokenText; //short and rare strings, no perf problem |
| doctype_public_id = doctype_public_id.trim(); |
| state = S_DOCTYPE_FILE; |
| break; |
| } |
| doctype_public_id += tokenText; //short and rare strings, no perf problem |
| |
| break; |
| case SGML_COMMENT: |
| case EOL: |
| |
| break; |
| default: |
| backup(1); |
| declaration(); |
| state = S_INIT; |
| start = -1; |
| break; |
| } |
| break; |
| |
| case S_DOCTYPE_FILE: |
| switch (id) { |
| case DECLARATION: |
| doctype_file = token.text().toString(); |
| //jump to simple sgml declaration so potentially |
| //other declaration tokens are inluded |
| state = S_DECLARATION; |
| break; |
| case SGML_COMMENT: |
| case EOL: |
| case WS: |
| break; |
| default: |
| backup(1); |
| declaration(); |
| state = S_INIT; |
| start = -1; |
| break; |
| } |
| break; |
| |
| } //switch end |
| |
| return current; |
| |
| } |
| |
| private void handleEOF() { |
| if (state != S_INIT) { |
| //an incomplete syntax element at the end of the file |
| switch (state) { |
| case S_COMMENT: |
| comment(); |
| break; |
| case S_DECLARATION: |
| case S_DOCTYPE_AFTER_ROOT_ELEMENT: |
| case S_DOCTYPE_DECLARATION: |
| case S_DOCTYPE_FILE: |
| case S_DOCTYPE_PUBLIC_ID: |
| declaration(); |
| break; |
| case S_TEXT: |
| text(); |
| break; |
| case S_TAG: |
| case S_TAG_ATTR: |
| case S_TAG_VALUE: |
| tag(false); |
| break; |
| case S_TAG_AFTER_NAME: |
| tag(false); |
| break; |
| default: |
| error(); |
| break; |
| } |
| |
| } |
| } |
| |
| private TokenInfo tokenInfo() { |
| return new TokenInfo(ts.offset(), token); |
| |
| |
| } |
| |
| static final class TokenInfo { |
| |
| public int offset; |
| public Token token; |
| |
| public TokenInfo(int offset, Token token) { |
| this.offset = offset; |
| this.token = token; |
| } |
| |
| @Override |
| public boolean equals(Object obj) { |
| if (obj == null) { |
| return false; |
| } |
| if (getClass() != obj.getClass()) { |
| return false; |
| } |
| final TokenInfo other = (TokenInfo) obj; |
| if (this.offset != other.offset) { |
| return false; |
| } |
| if (this.token != other.token && (this.token == null || !this.token.equals(other.token))) { |
| return false; |
| } |
| return true; |
| } |
| |
| @Override |
| public int hashCode() { |
| int hash = 3; |
| hash = 37 * hash + this.offset; |
| hash = 37 * hash + (this.token != null ? this.token.hashCode() : 0); |
| return hash; |
| } |
| } |
| } |