blob: d44dafdd9563c3d189c7576843d7254898d9855b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.netbeans.lib.html.lexer;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.netbeans.api.html.lexer.HTMLTokenId;
import org.netbeans.api.html.lexer.HtmlLexerPlugin;
import org.netbeans.api.lexer.InputAttributes;
import org.netbeans.api.lexer.LanguagePath;
import org.netbeans.api.lexer.Token;
import org.netbeans.spi.lexer.Lexer;
import org.netbeans.spi.lexer.LexerInput;
import org.netbeans.spi.lexer.LexerRestartInfo;
import org.netbeans.spi.lexer.TokenFactory;
import org.netbeans.spi.lexer.TokenPropertyProvider;
/**
* Lexical analyzer for HTML. Based on original HTML lexer from html/editor module.
*
* @author Petr Nejedly
* @author Miloslav Metelka
* @author Jan Lahoda
* @author Marek Fukala
* @version 1.00
*/
public final class HtmlLexer implements Lexer<HTMLTokenId> {
// Logger named after this class.
private static final Logger LOGGER = Logger.getLogger(HtmlLexer.class.getName());
// Debug flag read once from the "j2ee_lexer_debug" system property.
private static final boolean LOG = Boolean.getBoolean("j2ee_lexer_debug"); //NOI18N
// Local alias for LexerInput.EOF, the value input.read() is compared against to detect end of input.
private static final int EOF = LexerInput.EOF;
// Character source obtained from the LexerRestartInfo passed to the constructor.
private final LexerInput input;
// Token factory obtained from the same LexerRestartInfo.
private final TokenFactory<HTMLTokenId> tokenFactory;
/**
 * Snapshot of the lexer's state fields, handed out by {@code state()} and
 * read back by the constructor when lexing is restarted.
 * <p>
 * Instances are used as hash map keys, so {@code equals} and {@code hashCode}
 * are value based.
 */
private static final class CompoundState {
    private int lexerState;
    private int lexerSubState;
    private int lexerEmbeddingState;
    private byte customELIndex;
    private String attribute;
    private String tag;
    private String scriptType;
    private boolean quoteType;

    public CompoundState(int lexerState, int lexerSubState, int lexerEmbeddingState, String attributeName, String tagName, String scriptType, byte customELIndex, boolean quoteType) {
        this.lexerState = lexerState;
        this.lexerSubState = lexerSubState;
        this.lexerEmbeddingState = lexerEmbeddingState;
        this.customELIndex = customELIndex;
        this.attribute = attributeName;
        this.tag = tagName;
        this.scriptType = scriptType;
        this.quoteType = quoteType;
    }

    /** Null-safe string equality. */
    private static boolean sameText(String left, String right) {
        return left == right || (left != null && left.equals(right));
    }

    /** Hash code of a possibly null string; null contributes 0. */
    private static int hashOf(String text) {
        return text == null ? 0 : text.hashCode();
    }

    /** Two states are equal when they have the same class and all eight fields match. */
    @Override
    public boolean equals(Object obj) {
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        final CompoundState that = (CompoundState) obj;
        return lexerState == that.lexerState
                && lexerSubState == that.lexerSubState
                && lexerEmbeddingState == that.lexerEmbeddingState
                && sameText(attribute, that.attribute)
                && sameText(tag, that.tag)
                && sameText(scriptType, that.scriptType)
                && customELIndex == that.customELIndex
                && quoteType == that.quoteType;
    }

    @Override
    public int hashCode() {
        int result = 3;
        result = 17 * result + lexerState;
        result = 17 * result + lexerSubState;
        result = 17 * result + lexerEmbeddingState;
        result = 17 * result + hashOf(attribute);
        result = 17 * result + hashOf(tag);
        result = 17 * result + hashOf(scriptType);
        //the custom EL index contributes only when one is actually set
        if (customELIndex > 0) {
            result = 17 * result + customELIndex;
        }
        //the quote type contributes only in the quoted-value states
        final boolean inQuotedValue = lexerState == ISI_VAL_QUOT
                || lexerState == ISI_VAL_QUOT_EL
                || lexerState == ISI_VAL_QUOT_ESC;
        if (inQuotedValue) {
            result = 17 * result + (quoteType ? 1 : 0);
        }
        return result;
    }

    /** Debug form: hash code and state always, the other parts only when set. */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder();
        text.append("HLS(hc=").append(hashCode()); //NOI18N
        text.append(",s=").append(lexerState); //NOI18N
        if (lexerSubState > 0) {
            text.append(",ss=").append(lexerSubState); //NOI18N
        }
        if (lexerEmbeddingState > 0) {
            text.append(",es=").append(lexerEmbeddingState); //NOI18N
        }
        if (tag != null) {
            text.append(",tag=").append(tag); //NOI18N
        }
        if (attribute != null) {
            text.append(",attribute=").append(attribute); //NOI18N
        }
        if (scriptType != null) {
            text.append(",scriptType=").append(scriptType); //NOI18N
        }
        return text.append(')').toString(); //NOI18N
    }
}
private final HashMap<CompoundState, CompoundState> STATES_CACHE = new HashMap<>();
/**
 * Returns the current lexer state as a {@code CompoundState}.
 * <p>
 * Equal states are shared through {@code STATES_CACHE} so that lexing large
 * files does not keep one state object per call.
 *
 * @return the cached instance equal to the current state, or the newly
 *         created one when no equal instance was cached yet
 */
@Override
public Object state() {
    final CompoundState snapshot = new CompoundState(lexerState, lexerSubState, lexerEmbeddingState, attribute, tag, scriptType, customELIndex, quoteType);
    final CompoundState canonical = STATES_CACHE.get(snapshot);
    if (canonical != null) {
        return canonical;
    }
    //first occurrence of this state - it becomes the shared instance
    STATES_CACHE.put(snapshot, snapshot);
    return snapshot;
}
//script and style tag names
private static final String SCRIPT = "script"; //NOI18N
private static final String STYLE = "style"; //NOI18N
private static final String[] STYLE_ATTRS = new String[]{"style", "id", "class"}; //NOI18N
/** State the lexer was in before it entered the character reference
* sub-analyzer. It starts as INIT and is overwritten with the originating
* state whenever an '&amp;' switches the lexer to the character reference
* states.
*/
private int lexerSubState = INIT;
private int lexerState = INIT;
private String attribute;
private String tag; //tag name of the current context tag
/**
* Value of the "type" attribute in SCRIPT tag
*/
private String scriptType;
//tag name with namespace prefix to collection of attributes which should have
//css class embedding by default
private Map<String, Collection<String>> cssClassTagAttrMap;
private String CSS_CLASS_MAP_PROPERTY_KEY = "cssClassTagAttrMap"; //NOI18N //semi api
/** indicated whether we are in a script */
private int lexerEmbeddingState = INIT;
private byte customELIndex = INIT;
/**
* Indicates the quote type in the ISI_VAL_QUOT state.
*
* true means double quote, false means single quote.
*/
private boolean quoteType;
public static final String EL_CONTENT_PROVIDER_INDEX = "elci"; //NOI18N
// Values of lexerEmbeddingState: it is set to one of these once the name of
// a script or style open tag has been lexed, so the tag content is lexed as script/style
private static final int ISI_SCRIPT = 1;
private static final int ISI_STYLE = 2;
// Internal states
private static final int INIT = 0;
private static final int ISI_TEXT = 1; // Plain text between tags
private static final int ISI_ERROR = 2; // Syntax error in HTML syntax
private static final int ISA_LT = 3; // After start of tag delimiter - "<"
private static final int ISA_SLASH = 4; // After ETAGO - "</"
private static final int ISI_ENDTAG = 5; // Inside endtag - "</[a..Z]+"
private static final int ISP_ENDTAG_X = 6; // X-switch after ENDTAG's name
private static final int ISP_ENDTAG_WS = 7; // In WS in ENDTAG - "</A_ _>"
private static final int ISI_TAG = 8; // Inside tag - "<[a..Z]+"
private static final int ISP_TAG_X = 9; // X-switch after TAG's name
private static final int ISP_TAG_WS = 10; // In WS in TAG - "<A_ _...>"
private static final int ISI_ARG = 11; // Inside tag's argument - "<A h_r_...>"
private static final int ISP_ARG_X = 12; // X-switch after ARGUMENT's name
private static final int ISP_ARG_WS = 13; // Inside WS after argument awaiting '='
private static final int ISP_EQ = 14; // X-switch after '=' in TAG's ARGUMENT
private static final int ISP_EQ_WS = 15; // In WS after '='
private static final int ISI_VAL = 16; // Non-quoted value
private static final int ISI_VAL_QUOT = 17; // quoted value
private static final int ISI_VAL_QUOT_EL = 18; // in EL in quoted value
private static final int ISA_SGML_ESCAPE = 19; // After "<!"
private static final int ISA_SGML_DASH = 20; // After "<!-"
private static final int ISI_HTML_COMMENT = 21; // Somewhere after "<!--"
private static final int ISA_HTML_COMMENT_DASH = 22; // Dash in comment - maybe end of comment
private static final int ISI_HTML_COMMENT_WS = 23; // After end of comment, awaiting end of comment declaration
private static final int ISI_SGML_DECL = 24;
private static final int ISA_SGML_DECL_DASH = 25;
private static final int ISI_SGML_COMMENT = 26;
private static final int ISA_SGML_COMMENT_DASH = 27;
private static final int ISA_REF = 28; // when comes to character reference, e.g. &amp;, after &
private static final int ISI_REF_NAME = 29; // if the reference is symbolic - by predefined name
private static final int ISA_REF_HASH = 30; // for numeric references - after &#
private static final int ISI_REF_DEC = 31; // decimal character reference, e.g. &#345;
private static final int ISA_REF_X = 32; //
private static final int ISI_REF_HEX = 33; // hexadecimal reference, in &#xa.. of &#X9..
private static final int ISI_TAG_SLASH = 34; //after slash in html tag
private static final int ISI_SCRIPT_CONTENT = 35; //after <script> tags closing symbol '>' - the tag content
private static final int ISI_SCRIPT_CONTENT_AFTER_LT = 36; //after < in script content
private static final int ISI_SCRIPT_CONTENT_ENDTAG = 37; //after </ in script content
private static final int ISI_STYLE_CONTENT = 38; //after <style> tags closing symbol '>' - the tag content
private static final int ISI_STYLE_CONTENT_AFTER_LT = 39; //after < in style content
private static final int ISI_STYLE_CONTENT_ENDTAG = 40; //after </ in style content
private static final int ISI_SGML_DECL_WS = 41; //after whitespace in SGML declaration
private static final int ISI_VAL_QUOT_ESC = 42;
private static final int ISP_TAG_X_ERROR = 45; //error in tag content
private static final int ISI_XML_PI = 47; //inside <? ... ?>
private static final int ISI_XML_PI_QM = 48; //after ? in XML PI
private static final int ISI_EL = 49; //EL custom open delimiter: {{.....}}
/** Lower case names of attributes that are treated as JavaScript event handlers. */
static final Set<String> EVENT_HANDLER_NAMES = new HashSet<>();
static {
    // See http://www.w3.org/TR/html401/interact/scripts.html
    final String[] handlerNames = {
        "onload", // NOI18N
        "onunload", // NOI18N
        "onclick", // NOI18N
        "ondblclick", // NOI18N
        "onmousedown", // NOI18N
        "onmouseup", // NOI18N
        "onmouseover", // NOI18N
        "onmousemove", // NOI18N
        "onmouseout", // NOI18N
        "onfocus", // NOI18N
        "onblur", // NOI18N
        "onkeypress", // NOI18N
        "onkeydown", // NOI18N
        "onkeyup", // NOI18N
        "onsubmit", // NOI18N
        "onreset", // NOI18N
        "onselect", // NOI18N
        "onchange", // NOI18N
        "ondrag", // NOI18N
        "ondrop", // NOI18N
    };
    for (String handlerName : handlerNames) {
        EVENT_HANDLER_NAMES.add(handlerName);
    }
    // IMPORTANT - if you add any name that DOESN'T start with "on" here,
    // make sure you update the optimized leading characters check in
    // isJavascriptEventHandlerName
}
private static final String SUPPORTED_SCRIPT_TYPE = "text/javascript"; //NOI18N
//flyweight token images
private static final String IMG_EQUAL_SIGN = "="; //NOI18N
private static final String IMG_CLOSE_TAG_SYMBOL = ">"; //NOI18N
private static final String IMG_CLOSE_TAG_SYMBOL2 = "/>"; //NOI18N
private static final String IMG_OPEN_TAG_SYMBOL = "<"; //NOI18N
private static final String IMG_OPEN_TAG_SYMBOL2 = "</"; //NOI18N
private final HtmlPlugins customELQuery = HtmlPlugins.getDefault();
/**
* Expression language open delimiter token can be queried for the mime type of
* the content of the expression.
*/
public static final String EL_EXPRESSION_CONTENT_MIMETYPE_TOKEN_PROPERTY_KEY = "contentMimeType"; //NOI18N
/**
* {@link HtmlLexerPlugin#createAttributeEmbedding(java.lang.String, java.lang.String)} can be used to
* inject a custom embedding into an html tag attribute value. When the plugin returns a non-null value,
* the mimetype is set as a property of the token and then used by the {@code createEmbedding()} method
* of {@code HTMLTokenId.language()}.
*/
public static final String ATTRIBUTE_VALUE_EMBEDDING_MIMETYPE_TOKEN_PROPERTY_KEY = "embeddingMimeType"; //NOI18N
/**
 * Creates the lexer, either from scratch or resumed from a state object
 * previously returned by {@link #state()}.
 *
 * @param info restart info supplying the input, the token factory, the
 *             saved state (null for a fresh start) and the optional input
 *             attributes from which the css class tag/attribute map is read
 */
public HtmlLexer(LexerRestartInfo<HTMLTokenId> info) {
    this.input = info.input();
    this.tokenFactory = info.tokenFactory();
    if (info.state() == null) {
        this.lexerSubState = INIT;
        this.lexerState = INIT;
        this.lexerEmbeddingState = INIT;
        this.customELIndex = INIT;
        this.quoteType = false;
    } else {
        CompoundState cs = (CompoundState) info.state();
        lexerState = cs.lexerState;
        lexerSubState = cs.lexerSubState;
        lexerEmbeddingState = cs.lexerEmbeddingState;
        attribute = cs.attribute;
        tag = cs.tag;
        //state() saves the script type as well, so it must be restored too;
        //otherwise a restart inside a <script type=...> tag loses the type
        scriptType = cs.scriptType;
        customELIndex = cs.customELIndex;
        quoteType = cs.quoteType;
    }
    InputAttributes inputAttributes = info.inputAttributes();
    if (inputAttributes != null) {
        cssClassTagAttrMap = (Map<String, Collection<String>>)inputAttributes.getValue(
                LanguagePath.get(HTMLTokenId.language()), CSS_CLASS_MAP_PROPERTY_KEY); //NOI18N
    }
}
/**
 * Tells whether the character is an ASCII letter.
 *
 * @param character code of the character to test
 * @return true for 'a'..'z' and 'A'..'Z', false otherwise
 */
private boolean isAZ( int character ) {
    if ('a' <= character && character <= 'z') {
        return true;
    }
    return 'A' <= character && character <= 'Z';
}
/**
 * Tells whether the character may continue a tag name.
 *
 * @param character code of the character to test
 * @return true for letters, digits and the characters '-', '_', '.' and ':'
 */
private boolean isName( int character ) {
    switch (character) {
        case '-':
        case '_':
        case '.':
        case ':':
            return true;
        default:
            return Character.isLetterOrDigit(character);
    }
}
/**
 * Tells whether the character may be part of an attribute name.
 *
 * @param character code of the character to test
 * @return false for Java whitespace, '/', '&gt;', '=' and the character
 *         with code 0; true for everything else
 */
private boolean isAttributeName( int character ) {
    if (Character.isWhitespace(character)) {
        return false;
    }
    switch (character) {
        case '/':
        case '>':
        case '=':
        case 0:
            return false;
        default:
            return true;
    }
}
/**
 * Tells whether the given character is lexed as whitespace.
 * <p>
 * The check delegates to {@link Character#isWhitespace(int)} and additionally
 * accepts the at sign ({@code '@'}).
 * <p>
 * NOTE(review): this method used to be documented as the HTML 4.0 whitespace
 * set (space, tab, form feed, zero-width space, carriage return, line feed);
 * the code relies only on Character.isWhitespace - confirm both sets agree.
 *
 * @param character code of the character to test
 * @return true for Java whitespace and for {@code '@'}
 */
private boolean isWS( int character ) {
    //Why is '@' accepted here? See the issue #149968.
    //----------------------------------------
    //Generated virtual html code marks the places where some templating
    //language sits in the real document with three at signs (@@@).
    //In principle those marks should be recognized anywhere in the html
    //code and ignored. Doing that properly would need either a text
    //preprocessing step or more states in the already complicated lexer,
    //so the simplest and not very harmful solution was chosen instead.
    //The side effect is that a single at sign is not signalled as an
    //error in the editor but lexed as whitespace, which doesn't sound
    //too bad.
    //
    //note: the language construct where the attribute name itself is
    //generated doesn't work, but that is considered quite unusual:
    //<div <? echo "align"; ?>="center" />
    return character == '@' || Character.isWhitespace(character);
}
/**
 * Tells whether the attribute name is one of the known JavaScript event
 * handler names, ignoring case.
 *
 * @param attributeName attribute name to test, may be null
 * @return true when the name is longer than two characters, starts with
 *         "on" in any case and its lower case form is contained in
 *         {@code EVENT_HANDLER_NAMES}
 */
private boolean isJavascriptEventHandlerName(CharSequence attributeName) {
    if (attributeName == null || attributeName.length() <= 2) {
        return false;
    }
    //cheap prefix test first, the set lookup needs a lower cased copy
    final char first = attributeName.charAt(0);
    final char second = attributeName.charAt(1);
    final boolean startsWithOn = (first == 'o' || first == 'O')
            && (second == 'n' || second == 'N');
    return startsWithOn
            && EVENT_HANDLER_NAMES.contains(attributeName.toString().toLowerCase(Locale.ENGLISH));
}
/**
 * Tells whether the given name equals one of the {@code STYLE_ATTRS}
 * entries, comparing the lower case forms of the characters.
 *
 * @param chs attribute name to test, may be null
 * @return true on a match, false otherwise or when {@code chs} is null
 */
private boolean isStyleAttributeName(CharSequence chs) {
    if (chs == null) {
        return false;
    }
    final int length = chs.length();
    for (String candidate : STYLE_ATTRS) {
        if (candidate.length() != length) {
            continue;
        }
        //advance while the lower cased characters agree
        int pos = 0;
        while (pos < length
                && Character.toLowerCase(chs.charAt(pos)) == Character.toLowerCase(candidate.charAt(pos))) {
            pos++;
        }
        if (pos == length) {
            return true;
        }
    }
    return false;
}
/**
 * Extracts the script type from the text of an attribute value.
 *
 * @param attributeValue text of the value; must not be empty
 * @param quoted whether the text starts with a quote character
 * @return {@code attributeValue} itself when not quoted; otherwise the text
 *         without its first character and without the last one when that is
 *         a single or double quote and the text is longer than one character
 */
private CharSequence getScriptType(CharSequence attributeValue, boolean quoted) {
    final int length = attributeValue.length();
    //read unconditionally, exactly as before: an empty value fails right here
    final char tail = attributeValue.charAt(length - 1);
    if (!quoted) {
        return attributeValue;
    }
    final boolean closed = length > 1 && (tail == '\'' || tail == '"');
    final int end = closed ? length - 1 : length;
    return attributeValue.subSequence(1, end);
}
/**
 * Looks ahead in the input to find out whether the upcoming characters are
 * the given tag name directly followed by '&gt;'.
 * <p>
 * Characters are read while they are letters, digits or one of
 * '_', '-', ':', '.', '/'. Everything read here is put back before the
 * method returns, so the read length of the input is unchanged.
 *
 * @param closeTagName tag name expected in the lookahead
 * @return true when the lookahead text matches {@code closeTagName}
 *         according to the four-argument {@code equals} helper and the
 *         character that ended the name is '&gt;'
 */
private boolean followsCloseTag(CharSequence closeTagName) {
    final int lengthBefore = input.readLength(); //size of the text read before the lookahead
    int consumed = 0;
    int current;
    do {
        current = input.read();
        consumed++;
    } while (current != EOF
            && (Character.isLetter(current)
            || Character.isDigit(current)
            || current == '_'
            || current == '-'
            || current == ':'
            || current == '.'
            || current == '/'));
    //the name is the lookahead text without the character that ended it
    final CharSequence candidate = input.readText().subSequence(lengthBefore, lengthBefore + consumed - 1);
    input.backup(consumed); //put the lookahead text back to the buffer
    return equals(closeTagName, candidate, true, true) && current == '>';
}
@Override
public Token<HTMLTokenId> nextToken() {
int actChar;
main: while (true) {
actChar = input.read();
if (actChar == EOF) {
if(input.readLengthEOF() == 1) {
return null; //just EOL is read
} else {
//there is something else in the buffer except EOL
//we will return last token now
input.backup(1); //backup the EOL, we will return null in next nextToken() call
break;
}
}
//System.out.println("HTMLSyntax: parseToken tokenOffset=" + tokenOffset + ", actChar='" + actChar + "', offset=" + offset + ", state=" + getStateName(state) +
// ", stopOffset=" + stopOffset + ", lastBuffer=" + lastBuffer);
switch( lexerState ) {
case INIT: // DONE
switch( actChar ) {
case '<':
lexerState = ISA_LT;
continue main;
case '&':
lexerState = ISA_REF;
lexerSubState = ISI_TEXT;
continue main;
default:
lexerState = ISI_TEXT;
break;
}
//fall through to ISI_TEXT
case ISI_TEXT: // DONE
switch( actChar ) {
case '<':
case '&':
lexerState = INIT;
input.backup(1);
if(input.readLength() > 0) { //is there any text before & or < ???
return token(HTMLTokenId.TEXT);
}
break;
}
//custom EL support
delimiters: for(byte delimiterIndex = 0; delimiterIndex < customELQuery.getOpenDelimiters().length; delimiterIndex++ ) {
String openDelimiter = customELQuery.getOpenDelimiters()[delimiterIndex];
if(openDelimiter == null) {
continue;
}
int alreadyRead = input.readLength();
char read = (char)actChar; //first char is already read
for(int i = 0; i < openDelimiter.length(); i++) {
char delimChar = openDelimiter.charAt(i);
if(read != delimChar) {
//no match
input.backup(input.readLengthEOF() - alreadyRead); //backup text
continue delimiters; //and try next one
}
if((i+1) < openDelimiter.length()) {
//will be next loop, read char
read = (char)input.read();
}
}
//we've found an open delimiter
//check if the there was already something read before checking the delimiter,
//if so then return it and re-run this step again so then we can return
//clean token for the delimiter
if(input.readLength() > openDelimiter.length()) {
input.backup(openDelimiter.length());
return token(HTMLTokenId.TEXT);
} else {
//return the open symbol token and switch to "in el" state
lexerState = ISI_EL;
customELIndex = (byte)(delimiterIndex + 1); //0 is reserved for "no delimiter", 1 means delimiter with index 0
//save the provider's index in the delimiter token's property so once can recognize what should be
//the delimiters' content if it is empty
//TODO "contentMimetype" INTO API???
return token(HTMLTokenId.EL_OPEN_DELIMITER,
new HtmlTokenPropertyProvider(EL_EXPRESSION_CONTENT_MIMETYPE_TOKEN_PROPERTY_KEY, customELQuery.getMimeTypes()[delimiterIndex]));
}
}
break;
case ISI_EL:
delimiters: for(byte delimiterIndex = 0; delimiterIndex < customELQuery.getOpenDelimiters().length; delimiterIndex++ ) {
String closeDelimiter = customELQuery.getCloseDelimiters()[delimiterIndex];
if(closeDelimiter == null) {
continue;
}
int alreadyRead = input.readLength();
char read = (char)actChar; //first char is already read
for(int i = 0; i < closeDelimiter.length(); i++) {
char delimChar = closeDelimiter.charAt(i);
if(read != delimChar) {
//no match
input.backup(input.readLength() - alreadyRead); //backup text
continue delimiters; //and try next one
}
if((i+1) < closeDelimiter.length()) {
//will be next loop, read char
read = (char)input.read();
}
}
//we've found a close delimiter
//check if the there was already something read before checking the delimiter,
//if so then return it and re-run this step again so then we can return
//clean token for the delimiter
if(input.readLength() > closeDelimiter.length()) {
input.backup(closeDelimiter.length());
//save the provider's index in the token's property so we can set the corresponding embdding in HTMLTokenId.language()
return token(HTMLTokenId.EL_CONTENT, new HtmlTokenPropertyProvider(EL_CONTENT_PROVIDER_INDEX, new Byte((byte)(customELIndex - 1))));
} else {
//return the open symbol token and switch to "in el" state
lexerState = INIT;
customELIndex = INIT;
return token(HTMLTokenId.EL_CLOSE_DELIMITER);
}
}
break;
case ISI_ERROR: // DONE
lexerState = INIT;
tag = null;
return token(HTMLTokenId.ERROR);
case ISA_LT: // PENDING other transitions - e.g '<?'
if( isAZ( actChar ) ) { // <'a..Z'
lexerState = ISI_TAG;
if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
input.backup(1);
return token(HTMLTokenId.TAG_OPEN_SYMBOL);
}
break;
}
switch( actChar ) {
case '/': // ETAGO - </
lexerState = ISA_SLASH;
return token(HTMLTokenId.TAG_OPEN_SYMBOL);
case '>': // Empty start tag <>, RELAXED
lexerState = INIT;
return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
case '!':
lexerState = ISA_SGML_ESCAPE;
break;
case '?':
lexerState = ISI_XML_PI;
break;
default:
input.backup(1);
lexerState = ISI_TEXT;
break;
}
break;
case ISI_XML_PI:
if(actChar == '?') {
lexerState = ISI_XML_PI_QM;
break;
}
//else stay in XML PI
break;
case ISI_XML_PI_QM:
if(actChar == '>') {
//XML PI token
lexerState = INIT;
return token(HTMLTokenId.XML_PI);
} else {
lexerState = ISI_XML_PI;
break;
}
case ISA_SLASH: // DONE
if( isAZ( actChar ) ) { // </'a..Z'
lexerState = ISI_ENDTAG;
break;
}
switch( actChar ) {
case '>': // Empty end tag </>, RELAXED
lexerState = INIT;
return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
default: // Part of text, e.g. </3, </'\n', RELAXED
lexerState = ISI_TEXT;
input.backup(1);
break;
}
break;
case ISI_ENDTAG: // DONE
if( isName( actChar ) ) break; // Still in endtag identifier, eat next char
lexerState = ISP_ENDTAG_X;
if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
input.backup(1);
return token(HTMLTokenId.TAG_CLOSE);
}
break;
case ISP_ENDTAG_X: // DONE
if( isWS( actChar ) ) {
lexerState = ISP_ENDTAG_WS;
break;
}
tag = null;
switch( actChar ) {
case '>': // Closing of endtag, e.g. </H6 _>_
lexerState = INIT;
return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
case '<': // next tag, e.g. </H6 _<_, RELAXED
lexerState = INIT;
input.backup(1);
break;
default:
lexerState = ISI_ERROR;
input.backup(1);
break;
}
break;
case ISP_ENDTAG_WS: // DONE
if( isWS( actChar ) ) break; // eat all WS
lexerState = ISP_ENDTAG_X;
if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
input.backup(1);
return token(HTMLTokenId.WS);
}
break;
case ISI_TAG: // DONE
if( isName( actChar ) ) break; // Still in tag identifier, eat next char
lexerState = ISP_TAG_X;
if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
input.backup(1);
//test if the tagname is SCRIPT
tag = input.readText().toString();
if(equals(SCRIPT, tag, true, true)) {
lexerEmbeddingState = ISI_SCRIPT;
}
if(equals(STYLE, tag, true, true)) {
lexerEmbeddingState = ISI_STYLE;
}
return token(HTMLTokenId.TAG_OPEN);
}
break;
case ISP_TAG_X: // DONE
if( isWS( actChar ) ) {
lexerState = ISP_TAG_WS;
break;
}
if( isAttributeName(actChar) ) {
lexerState = ISI_ARG;
break;
}
switch( actChar ) {
case '/':
lexerState = ISI_TAG_SLASH;
break;
case '>':
switch (lexerEmbeddingState) {
case INIT:
lexerState = INIT;
break;
case ISI_SCRIPT:
//script w/ "text/html" content type workaround
//do lex the script content as normal html code
if(scriptType != null && "text/html".equalsIgnoreCase(scriptType)) { //NOI18N
lexerEmbeddingState = INIT;
scriptType = null;
lexerState = INIT;
} else {
lexerState = ISI_SCRIPT_CONTENT;
}
break;
case ISI_STYLE:
lexerState = ISI_STYLE_CONTENT;
break;
}
tag = null;
return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
case '<':
tag = null;
lexerState = INIT;
input.backup(1);
break;
default:
lexerState = ISP_TAG_X_ERROR;
break;
}
break;
case ISP_TAG_X_ERROR:
if(isWS(actChar)) {
lexerState = ISP_TAG_X;
input.backup(1); //backup the WS
return token(HTMLTokenId.ERROR);
}
switch(actChar) {
case '/':
case '>':
lexerState = ISP_TAG_X;
input.backup(1); //lets reread the token again
return token(HTMLTokenId.ERROR);
}
//stay in error
break;
case ISP_TAG_WS: // DONE
if( isWS( actChar ) ) break; // eat all WS
lexerState = ISP_TAG_X;
if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
input.backup(1);
return token(HTMLTokenId.WS);
}
case ISI_TAG_SLASH:
tag = null;
switch( actChar ) {
case '>':
lexerEmbeddingState = INIT; //possibly cancel 'in script' if empty tag found
lexerState = INIT;
return token(HTMLTokenId.TAG_CLOSE_SYMBOL);
default:
lexerState = ISP_TAG_X;
input.backup(1);
return token(HTMLTokenId.ERROR);
}
case ISI_SCRIPT_CONTENT:
switch( actChar ) {
case '<' :
lexerState = ISI_SCRIPT_CONTENT_AFTER_LT;
break;
default:
break;
}
break;
case ISI_SCRIPT_CONTENT_AFTER_LT:
if (actChar == '/') {
if (followsCloseTag(SCRIPT)) {
//end of script section found
lexerEmbeddingState = INIT;
lexerState = INIT;
tag = null;
String type = scriptType;
scriptType = null;
input.backup(input.readLength() > 2 ? 2 : input.readLength()); //backup the '</', we will read it again
if (input.readLength() > 0) {
//the script has a body
return token(HTMLTokenId.SCRIPT, new HtmlTokenPropertyProvider(HTMLTokenId.SCRIPT_TYPE_TOKEN_PROPERTY, type)); //NOI18N
} else {
break;
}
}
}
lexerState = ISI_SCRIPT_CONTENT;
break;
case ISI_STYLE_CONTENT:
switch( actChar ) {
case '<' :
lexerState = ISI_STYLE_CONTENT_AFTER_LT;
break;
default:
break;
}
break;
case ISI_STYLE_CONTENT_AFTER_LT:
if (actChar == '/') {
if (followsCloseTag(STYLE)) {
//end of script section found
lexerEmbeddingState = INIT;
lexerState = INIT;
tag = null;
input.backup(input.readLength() > 2 ? 2 : input.readLength()); //backup the '</', we will read it again
if (input.readLength() > 0) {
//the script has a body
return token(HTMLTokenId.STYLE);
} else {
break;
}
}
}
lexerState = ISI_STYLE_CONTENT;
break;
case ISI_ARG: // DONE
if( isAttributeName(actChar) ) break; // eat next char
lexerState = ISP_ARG_X;
if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
input.backup(1);
attribute =input.readText().toString();
return token(HTMLTokenId.ARGUMENT);
}
break;
case ISP_ARG_X:
if( isWS( actChar ) ) {
lexerState = ISP_ARG_WS;
break;
}
if( isAttributeName(actChar) ) {
lexerState = ISI_ARG;
break;
}
switch( actChar ) {
case '/':
case '>':
input.backup(1);
lexerState = ISP_TAG_X;
break;
case '<':
lexerState = INIT;
input.backup(1);
break;
case '=':
lexerState = ISP_EQ;
return token(HTMLTokenId.OPERATOR);
default:
lexerState = ISI_ERROR;
input.backup(1);
break;
}
break;
case ISP_ARG_WS:
if( isWS( actChar ) ) break; // Eat all WhiteSpace
lexerState = ISP_ARG_X;
if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
input.backup(1);
return token(HTMLTokenId.WS);
}
break;
case ISP_EQ:
if( isWS( actChar ) ) {
lexerState = ISP_EQ_WS;
break;
}
switch( actChar ) {
case '\'':
quoteType = false;
lexerState = ISI_VAL_QUOT;
break;
case '"':
quoteType = true;
lexerState = ISI_VAL_QUOT;
break;
case '/':
case '>':
case '<':
input.backup(1);
lexerState = ISP_TAG_X;
break;
default:
lexerState = ISI_VAL; //everything else if attribute value
break;
}
break;
case ISP_EQ_WS:
if( isWS( actChar ) ) break; // Consume all WS
lexerState = ISP_EQ;
if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
input.backup(1);
return token(HTMLTokenId.WS);
}
break;
case ISI_VAL:
if(actChar == '/') {
//slash in unquoted value -- may be there but not followed by >.
//In such case IMO the value should be closed
char next = (char)input.read();
input.backup(1); //backup the next char
if(next != '>') {
//continue lexing the value
break;
}
} else if(!isWS(actChar) && actChar != '>' && actChar != '<') {
break; //continue lexing the attribute value
}
//finish lexing the value
lexerState = ISP_TAG_X;
if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
input.backup(1);
Token<HTMLTokenId> resolveValueToken = resolveValueToken();
attribute = null;
return resolveValueToken;
}
break;
case ISI_VAL_QUOT:
//custom EL support
delimiters: for(byte delimiterIndex = 0; delimiterIndex < customELQuery.getOpenDelimiters().length; delimiterIndex++ ) {
String openDelimiter = customELQuery.getOpenDelimiters()[delimiterIndex];
if(openDelimiter == null) {
continue;
}
int alreadyRead = input.readLength();
char read = (char)actChar; //first char is already read
for(int i = 0; i < openDelimiter.length(); i++) {
char delimChar = openDelimiter.charAt(i);
if(read != delimChar) {
//no match
input.backup(input.readLength() - alreadyRead); //backup text
continue delimiters; //and try next one
}
if((i+1) < openDelimiter.length()) {
//will be next loop, read char
read = (char)input.read();
}
}
//we've found an open delimiter
//check if the there was already something read before checking the delimiter,
//if so then return it and re-run this step again so then we can return
//clean token for the delimiter
if(input.readLength() > openDelimiter.length()) {
input.backup(openDelimiter.length());
return resolveValueToken();
} else {
//return the open symbol token and switch to "in el" state
lexerState = ISI_VAL_QUOT_EL;
customELIndex = (byte)(delimiterIndex + 1); //0 is reserved for "no delimiter", 1 means delimiter with index 0
//save the provider's index in the delimiter token's property so once can recognize what should be
//the delimiters' content if it is empty
//TODO "contentMimetype" INTO API???
return token(HTMLTokenId.EL_OPEN_DELIMITER,
new HtmlTokenPropertyProvider(EL_EXPRESSION_CONTENT_MIMETYPE_TOKEN_PROPERTY_KEY, customELQuery.getMimeTypes()[delimiterIndex]));
}
}
switch (actChar) {
case '\\':
//may be escaped quote
lexerState = ISI_VAL_QUOT_ESC;
break;
case '\'':
case '"':
if(actChar == '\'' && !quoteType || actChar == '"' && quoteType) {
//reset the 'script embedding will follow state' if the value represents a
//type attribute value of a script tag
if(equals(SCRIPT, tag, true, true) && equals("type", attribute, true, true)) { //NOI18N
//inside script tag
scriptType = getScriptType(input.readText(), true).toString();
}
lexerState = ISP_TAG_X;
Token<HTMLTokenId> resolveValueToken = resolveValueToken();
attribute = null;
return resolveValueToken;
}
}
break; // else simply consume next char of VALUE
case ISI_VAL_QUOT_EL:
delimiters: for(byte delimiterIndex = 0; delimiterIndex < customELQuery.getOpenDelimiters().length; delimiterIndex++ ) {
String closeDelimiter = customELQuery.getCloseDelimiters()[delimiterIndex];
if(closeDelimiter == null) {
continue;
}
int alreadyRead = input.readLength();
char read = (char)actChar; //first char is already read
for(int i = 0; i < closeDelimiter.length(); i++) {
char delimChar = closeDelimiter.charAt(i);
if(read != delimChar) {
//no match
input.backup(input.readLength() - alreadyRead); //backup text
continue delimiters; //and try next one
}
if((i+1) < closeDelimiter.length()) {
//will be next loop, read char
read = (char)input.read();
}
}
//we've found a close delimiter
//check whether something was already read before the delimiter;
//if so, return it first and re-run this step so that a clean
//token can then be returned for the delimiter itself
if(input.readLength() > closeDelimiter.length()) {
input.backup(closeDelimiter.length());
//save the provider's index in the token's property so we can set the corresponding embedding in HTMLTokenId.language()
return token(HTMLTokenId.EL_CONTENT, new HtmlTokenPropertyProvider(EL_CONTENT_PROVIDER_INDEX, new Byte((byte)(customELIndex - 1))));
} else {
//return the close symbol token and switch to "in value" state
lexerState = ISI_VAL_QUOT;
customELIndex = INIT;
return token(HTMLTokenId.EL_CLOSE_DELIMITER);
}
}
break;
case ISI_VAL_QUOT_ESC:
//Just consume the escaped char.
//The state prevents the quoted value
//to be finished by an escaped quote.
lexerState = ISI_VAL_QUOT;
break;
case ISA_SGML_ESCAPE: // DONE
if( isAZ(actChar) ) {
lexerState = ISI_SGML_DECL;
break;
}
switch( actChar ) {
case '-':
lexerState = ISA_SGML_DASH;
break;
default:
lexerState = ISI_TEXT;
input.backup(1);
continue;
}
break;
case ISA_SGML_DASH: // DONE
switch( actChar ) {
case '-':
lexerState = ISI_HTML_COMMENT;
break;
default:
lexerState = ISI_TEXT;
input.backup(1);
continue;
}
break;
case ISI_HTML_COMMENT: // DONE
switch( actChar ) {
case '-':
lexerState = ISA_HTML_COMMENT_DASH;
break;
//create an HTML comment token for each line of the comment - a performance fix for #43532
case '\n':
//stay in the same state - we are still in an HTML comment,
//we just need to create a token for each line.
return token(HTMLTokenId.BLOCK_COMMENT);
}
break;
case ISA_HTML_COMMENT_DASH:
switch( actChar ) {
case '-':
lexerState = ISI_HTML_COMMENT_WS;
break;
default:
lexerState = ISI_HTML_COMMENT;
continue;
}
break;
case ISI_HTML_COMMENT_WS: // DONE
switch( actChar ) {
case '>':
lexerState = INIT;
return token(HTMLTokenId.BLOCK_COMMENT);
default:
lexerState = ISI_HTML_COMMENT;
input.backup(2); //backup everything except the first comma
break;
}
break;
case ISI_SGML_DECL:
if(Character.isWhitespace(actChar)) {
lexerState = ISI_SGML_DECL_WS;
if(input.readLength() > 1) {
input.backup(1); //backup the whitespace
return token(HTMLTokenId.DECLARATION);
}
break;
}
switch( actChar ) {
case '>':
if(input.readLength() > 1) {
input.backup(1); //backup the '<' char
return token(HTMLTokenId.DECLARATION);
} else {
//just the symbol read - return it as a part of declaration
lexerState = INIT;
return token(HTMLTokenId.DECLARATION);
}
}
break;
case ISI_SGML_DECL_WS:
if(actChar == '-') {
if( input.readLength() == 1 ) {
lexerState = ISA_SGML_DECL_DASH;
break;
} else {
if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
input.backup(1);
return token(HTMLTokenId.DECLARATION);
}
}
} else if(!Character.isWhitespace(actChar)) {
lexerState = ISI_SGML_DECL;
input.backup(1);
return token(HTMLTokenId.WS);
}
break;
case ISA_SGML_DECL_DASH:
if( actChar == '-' ) {
lexerState = ISI_SGML_COMMENT;
break;
} else {
lexerState = ISI_SGML_DECL;
input.backup(1);
continue;
}
case ISI_SGML_COMMENT:
switch( actChar ) {
case '-':
lexerState = ISA_SGML_COMMENT_DASH;
break;
}
break;
case ISA_SGML_COMMENT_DASH:
if( actChar == '-' ) {
lexerState = ISI_SGML_DECL;
return token(HTMLTokenId.SGML_COMMENT);
} else {
lexerState = ISI_SGML_COMMENT;
input.backup(1);
continue;
}
case ISA_REF:
if( isAZ( actChar ) ) {
lexerState = ISI_REF_NAME;
break;
}
if( actChar == '#' ) {
lexerState = ISA_REF_HASH;
break;
}
lexerState = lexerSubState;
input.backup(1);
continue;
case ISI_REF_NAME:
if( isName( actChar ) ) break;
lexerState = lexerSubState;
if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
if( actChar != ';' ) {
input.backup(1);
return token(HTMLTokenId.TEXT);
}
return token(HTMLTokenId.CHARACTER);
}
break;
case ISA_REF_HASH:
if( actChar >= '0' && actChar <= '9' ) {
lexerState = ISI_REF_DEC;
break;
}
if( actChar == 'x' || actChar == 'X' ) {
lexerState = ISA_REF_X;
break;
}
if( isAZ( actChar ) ) {
lexerState = lexerSubState;
return token(HTMLTokenId.ERROR);
}
lexerState = lexerSubState;
input.backup(1);
continue;
case ISI_REF_DEC:
if( actChar >= '0' && actChar <= '9' ) break;
lexerState = lexerSubState;
if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
if( actChar != ';' )
input.backup(1);
return token(HTMLTokenId.CHARACTER);
}
break;
case ISA_REF_X:
if( (actChar >= '0' && actChar <= '9') ||
(actChar >= 'a' && actChar <= 'f') ||
(actChar >= 'A' && actChar <= 'F')
) {
lexerState = ISI_REF_HEX;
break;
}
lexerState = lexerSubState;
if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
input.backup(1);
return token(HTMLTokenId.ERROR); // error on previous "&#x" sequence
}
break;
case ISI_REF_HEX:
if( (actChar >= '0' && actChar <= '9') ||
(actChar >= 'a' && actChar <= 'f') ||
(actChar >= 'A' && actChar <= 'F')
) break;
lexerState = lexerSubState;
if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF
if( actChar != ';' )
input.backup(1);
return token(HTMLTokenId.CHARACTER);
}
break;
}
} // end of while(offset...)
/** At this stage there's no more text in the scanned buffer.
* Scanner first checks whether this is completely the last
* available buffer.
*/
switch( lexerState ) {
case INIT:
if (input.readLength() == 0) {
return null;
}
break;
case ISI_TEXT:
case ISA_LT:
case ISA_SLASH:
case ISA_SGML_ESCAPE:
case ISA_SGML_DASH:
case ISI_TAG_SLASH:
return token(HTMLTokenId.TEXT);
case ISI_XML_PI:
case ISI_XML_PI_QM:
return token(HTMLTokenId.XML_PI);
case ISA_REF:
case ISA_REF_HASH:
if( lexerSubState == ISI_TEXT ) return token(HTMLTokenId.TEXT);
else return token(HTMLTokenId.VALUE);
case ISI_HTML_COMMENT:
case ISA_HTML_COMMENT_DASH:
case ISI_HTML_COMMENT_WS:
return token(HTMLTokenId.BLOCK_COMMENT);
case ISI_TAG:
lexerState = ISP_TAG_X;
//test if the tagname is SCRIPT
if(equals(SCRIPT, input.readText(), true, true)) {
lexerEmbeddingState = ISI_SCRIPT;
}
if(equals(STYLE, input.readText(), true, true)) {
lexerEmbeddingState = ISI_STYLE;
}
return token(HTMLTokenId.TAG_OPEN);
case ISI_ENDTAG:
return token(HTMLTokenId.TAG_CLOSE);
case ISI_ARG:
return token(HTMLTokenId.ARGUMENT);
case ISI_ERROR:
case ISP_TAG_X_ERROR:
return token(HTMLTokenId.ERROR);
case ISP_ARG_WS:
case ISP_TAG_WS:
case ISP_ENDTAG_WS:
case ISP_EQ_WS:
return token(HTMLTokenId.WS);
case ISP_ARG_X:
case ISP_TAG_X:
case ISP_ENDTAG_X:
case ISP_EQ:
return token(HTMLTokenId.WS);
case ISI_VAL:
case ISI_VAL_QUOT:
case ISI_VAL_QUOT_ESC:
return resolveValueToken();
case ISI_SGML_DECL:
case ISA_SGML_DECL_DASH:
case ISI_SGML_DECL_WS:
return token(HTMLTokenId.DECLARATION);
case ISI_SGML_COMMENT:
case ISA_SGML_COMMENT_DASH:
return token(HTMLTokenId.SGML_COMMENT);
case ISI_REF_NAME:
case ISI_REF_DEC:
case ISA_REF_X:
case ISI_REF_HEX:
return token(HTMLTokenId.TEXT);
case ISI_SCRIPT_CONTENT:
case ISI_SCRIPT_CONTENT_ENDTAG:
case ISI_SCRIPT_CONTENT_AFTER_LT:
return token(HTMLTokenId.SCRIPT);
case ISI_STYLE_CONTENT:
case ISI_STYLE_CONTENT_ENDTAG:
case ISI_STYLE_CONTENT_AFTER_LT:
return token(HTMLTokenId.STYLE);
case ISI_EL:
case ISI_VAL_QUOT_EL:
return token(HTMLTokenId.EL_CONTENT, new HtmlTokenPropertyProvider(EL_CONTENT_PROVIDER_INDEX, new Byte((byte)(customELIndex - 1))));
}
assert input.readLength() == 0 : "Returning null even if some chars still needs to be tokenized! " +
"lexer state=" + lexerState + "; " +
"lexer substate=" + lexerSubState + "; " +
"lexer embedding state=" + lexerEmbeddingState + "; " +
"readtext='" + input.readText() + "'";
return null;
}
/** Attribute name compared (case-insensitively) against the current attribute in createCssValueToken() to pick the "class" token property provider. */
private static final String CLASS_ATTR_NAME = "class"; //NOI18N
/** Attribute name compared (case-insensitively) against the current attribute in createCssValueToken() to pick the "id" token property provider. */
private static final String ID_ATTR_NAME = "id"; //NOI18N
/**
 * Creates the token for the attribute value text read so far, choosing the
 * token id and properties according to the current {@code tag} and
 * {@code attribute}.
 * <p>
 * The checks are tried in this order and the first match wins:
 * <ol>
 * <li>javascript event handler attribute - {@code VALUE_JAVASCRIPT} token,</li>
 * <li>style attribute (see isStyleAttributeName()) - css value token,</li>
 * <li>tag/attribute pair registered in {@code cssClassTagAttrMap} -
 *     {@code VALUE_CSS} token with the "class" property provider,</li>
 * <li>embedding mimetype supplied by the lexer plugins - {@code VALUE} token
 *     carrying the mimetype as a token property,</li>
 * <li>otherwise a plain {@code VALUE} token.</li>
 * </ol>
 * Must only be called while an attribute is being lexed ({@code attribute} is non-null).
 *
 * @return the token created for the attribute value
 */
private Token<HTMLTokenId> resolveValueToken() {
    assert attribute != null;

    if (isJavascriptEventHandlerName(attribute)) {
        //onclick and similar event handlers: javascript embedding
        return token(HTMLTokenId.VALUE_JAVASCRIPT);
    } else if (isStyleAttributeName(attribute)) {
        //style, id or class attribute value: css embedding
        return createCssValueToken();
    }

    //generic css "class" embedding, looked up only if both the map and the tag are available
    final Collection<?> cssClassAttributes =
            (cssClassTagAttrMap == null || tag == null) ? null : cssClassTagAttrMap.get(tag);
    if (cssClassAttributes != null && cssClassAttributes.contains(attribute)) {
        //the attribute's value should have the css "class" selector embedding
        return token(HTMLTokenId.VALUE_CSS, CLASS_TOKEN_PP);
    }

    //finally ask the lexer plugins whether they want to embed something into the value
    final String pluginMimeType = HtmlPlugins.getDefault().createAttributeEmbedding(tag, attribute);
    if (pluginMimeType == null) {
        return token(HTMLTokenId.VALUE);
    }
    LOGGER.log(Level.FINE, "creating html attribute value token {0} in tag {1} with embedding {2}",
            new Object[]{attribute, tag, pluginMimeType});
    return token(HTMLTokenId.VALUE,
            new HtmlTokenPropertyProvider(ATTRIBUTE_VALUE_EMBEDDING_MIMETYPE_TOKEN_PROPERTY_KEY, pluginMimeType));
}
/**
 * Creates a {@code VALUE_CSS} token for the text read so far.
 * <p>
 * If the current attribute is named "class" or "id" (compared case-insensitively)
 * the token gets the matching token property provider; for any other
 * attribute no provider is attached.
 *
 * @return the created css value token
 */
private Token<HTMLTokenId> createCssValueToken() {
    final boolean classAttribute = equals(CLASS_ATTR_NAME, attribute, true, true);
    //the "id" check is evaluated only when the attribute is not "class"
    final TokenPropertyProvider cssTypeProvider = classAttribute
            ? CLASS_TOKEN_PP
            : (equals(ID_ATTR_NAME, attribute, true, true) ? ID_TOKEN_PP : null);
    return token(HTMLTokenId.VALUE_CSS, cssTypeProvider);
}
/**
 * Creates a token of the given id without any token properties.
 * Delegates to the two-argument overload with a {@code null} property provider.
 *
 * @param tokenId id of the token to create
 * @return the created token
 */
private Token<HTMLTokenId> token(HTMLTokenId tokenId) {
    final TokenPropertyProvider noProperties = null;
    return token(tokenId, noProperties);
}
/**
 * Creates a token of the given id for the text read from the input so far.
 * <p>
 * When a property provider is given, a property token of the read length is
 * created. Otherwise a flyweight token is preferred where a shared image is
 * available: the equal sign, the tag open/close symbols and tag/attribute
 * names for which {@code HtmlElements} returns a cached name. Everything
 * else becomes a regular token.
 * <p>
 * Each token id is handled on its own: a tag symbol whose image does not
 * match any of the shared images results in a regular token and is never
 * passed to the tag name cache.
 *
 * @param tokenId id of the token to create
 * @param tokenPropertyProvider provider of the token properties or {@code null} if the token has none
 * @return the created token
 */
private Token<HTMLTokenId> token(HTMLTokenId tokenId, TokenPropertyProvider tokenPropertyProvider) {
    if (LOG) {
        if (input.readLength() == 0) {
            LOGGER.log(Level.INFO, "Found zero length token: "); //NOI18N
        }
        LOGGER.log(Level.INFO, "[{0}] token (''{1}''; id={2}; state={3})\n", new Object[]{this.getClass().getSimpleName(), input.readText().toString(), tokenId, state()}); //NOI18N
    }
    if (tokenPropertyProvider != null) {
        return tokenFactory.createPropertyToken(tokenId, input.readLength(), tokenPropertyProvider);
    }

    CharSequence image = input.readText();
    switch (tokenId) {
        case OPERATOR:
            return tokenFactory.getFlyweightToken(tokenId, IMG_EQUAL_SIGN);
        case TAG_CLOSE_SYMBOL:
            switch (image.charAt(0)) {
                case '/':
                    if (input.readLength() > 1 && image.charAt(1) == '>') {
                        return tokenFactory.getFlyweightToken(tokenId, IMG_CLOSE_TAG_SYMBOL2);
                    }
                    break;
                case '>':
                    return tokenFactory.getFlyweightToken(tokenId, IMG_CLOSE_TAG_SYMBOL);
                default:
                    break;
            }
            //no shared image matched: create a regular token instead of
            //falling through to the handling of the other token ids
            break;
        case TAG_OPEN_SYMBOL:
            if (image.charAt(0) == '<') {
                if (input.readLength() <= 1) {
                    return tokenFactory.getFlyweightToken(tokenId, IMG_OPEN_TAG_SYMBOL);
                }
                if (image.charAt(1) == '/') {
                    return tokenFactory.getFlyweightToken(tokenId, IMG_OPEN_TAG_SYMBOL2);
                }
            }
            //no shared image matched: a symbol image must not be looked up
            //in the tag name cache, so do not fall through
            break;
        case TAG_OPEN:
        case TAG_CLOSE:
            String cachedTagName = HtmlElements.getCachedTagName(image);
            if (cachedTagName != null) {
                assert (cachedTagName.length() <= input.readLength()) : "readlength == " + input.readLength() + "; text=" + cachedTagName + "; image=" + image;
                return tokenFactory.getFlyweightToken(tokenId, cachedTagName);
            }
            break;
        case ARGUMENT:
            String cachedAttrName = HtmlElements.getCachedAttrName(image);
            if (cachedAttrName != null) {
                assert (cachedAttrName.length() <= input.readLength()) : "readlength == " + input.readLength() + "; text=" + cachedAttrName + "; image=" + image;
                return tokenFactory.getFlyweightToken(tokenId, cachedAttrName);
            }
            break;
        default:
            break;
    }
    return tokenFactory.createToken(tokenId);
}
/**
 * Intentionally empty: this implementation performs no work when the lexer is released.
 */
@Override
public void release() {
}
/**
 * Compares two character sequences for content equality.
 * <p>
 * Identical references (including two {@code null}s) are equal; a single
 * {@code null} or a length mismatch makes the sequences unequal.
 *
 * @param text1 the first sequence, may be {@code null}
 * @param text2 the second sequence, may be {@code null}
 * @param ignoreCase if {@code true} the characters are compared lowercased
 * @param optimized if {@code true} the first sequence is taken as-is (the caller
 *        guarantees it is already lowercase), so only the characters of the
 *        second sequence are passed to {@link Character#toLowerCase(char)}
 * @return {@code true} if both sequences have the same content
 */
private static boolean equals(CharSequence text1, CharSequence text2, boolean ignoreCase, boolean optimized) {
    if (text1 == text2) {
        return true;
    }
    if (text1 == null || text2 == null) {
        return false;
    }
    final int length = text1.length();
    if (length != text2.length()) {
        return false;
    }
    //the first sequence needs lowercasing only if the caller did not promise it is lowercase already
    final boolean lowerFirst = ignoreCase && !optimized;
    int index = 0;
    while (index < length) {
        char first = text1.charAt(index);
        char second = text2.charAt(index);
        if (lowerFirst) {
            first = Character.toLowerCase(first);
        }
        if (ignoreCase) {
            second = Character.toLowerCase(second);
        }
        if (first != second) {
            return false;
        }
        index++;
    }
    return true;
}
/**
 * Token property provider holding exactly one key/value pair.
 * The stored value is returned for the stored key, {@code null} for any other key.
 */
private static class HtmlTokenPropertyProvider implements TokenPropertyProvider {
    private final String propertyKey;
    private final Object propertyValue;

    /**
     * @param key the only property key this provider answers to
     * @param value the value returned for that key
     */
    HtmlTokenPropertyProvider(String key, Object value) {
        propertyKey = key;
        propertyValue = value;
    }

    /**
     * @param token the token whose property is requested; not used by this provider
     * @param key the requested property key
     * @return the stored value if {@code key} equals the stored key, {@code null} otherwise
     */
    @Override
    public Object getValue(Token token, Object key) {
        return propertyKey.equals(key) ? propertyValue : null;
    }
}
/** Shared provider answering HTMLTokenId.VALUE_CSS_TOKEN_TYPE_PROPERTY with the "class" token type; attached to VALUE_CSS tokens of class-like attribute values. */
private static final TokenPropertyProvider CLASS_TOKEN_PP = new HtmlTokenPropertyProvider(HTMLTokenId.VALUE_CSS_TOKEN_TYPE_PROPERTY, HTMLTokenId.VALUE_CSS_TOKEN_TYPE_CLASS);
/** Shared provider answering HTMLTokenId.VALUE_CSS_TOKEN_TYPE_PROPERTY with the "id" token type; attached to VALUE_CSS tokens of "id" attribute values. */
private static final TokenPropertyProvider ID_TOKEN_PP = new HtmlTokenPropertyProvider(HTMLTokenId.VALUE_CSS_TOKEN_TYPE_PROPERTY, HTMLTokenId.VALUE_CSS_TOKEN_TYPE_ID);
}