| /* |
| * The Apache Software License, Version 1.1 |
| * |
| * |
| * Copyright (c) 2003 The Apache Software Foundation. All rights |
| * reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in |
| * the documentation and/or other materials provided with the |
| * distribution. |
| * |
| * 3. The end-user documentation included with the redistribution, |
| * if any, must include the following acknowledgment: |
| * "This product includes software developed by the |
| * Apache Software Foundation (http://www.apache.org/)." |
| * Alternately, this acknowledgment may appear in the software itself, |
| * if and wherever such third-party acknowledgments normally appear. |
| * |
| * 4. The names "Apache" and "Apache Software Foundation" must |
| * not be used to endorse or promote products derived from this |
| * software without prior written permission. For written |
| * permission, please contact apache@apache.org. |
| * |
| * 5. Products derived from this software may not be called "Apache |
| * XMLBeans", nor may "Apache" appear in their name, without prior |
| * written permission of the Apache Software Foundation. |
| * |
| * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
| * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
| * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
| * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| * SUCH DAMAGE. |
| * ==================================================================== |
| * |
| * This software consists of voluntary contributions made by many |
| * individuals on behalf of the Apache Software Foundation and was |
| * originally based on software copyright (c) 2000-2003 BEA Systems |
| * Inc., <http://www.bea.com/>. For more information on the Apache Software |
| * Foundation, please see <http://www.apache.org/>. |
| */ |
| |
| package org.apache.xmlbeans.impl.regex; |
| |
| import java.util.Locale; |
| import java.util.MissingResourceException; |
| import java.util.ResourceBundle; |
| import java.util.Vector; |
| |
| /** |
| * A Regular Expression Parser. |
| */ |
| class RegexParser { |
| static final int T_CHAR = 0; |
| static final int T_EOF = 1; |
| static final int T_OR = 2; // '|' |
| static final int T_STAR = 3; // '*' |
| static final int T_PLUS = 4; // '+' |
| static final int T_QUESTION = 5; // '?' |
| static final int T_LPAREN = 6; // '(' |
| static final int T_RPAREN = 7; // ')' |
| static final int T_DOT = 8; // '.' |
| static final int T_LBRACKET = 9; // '[' |
| static final int T_BACKSOLIDUS = 10; // '\' |
| static final int T_CARET = 11; // '^' |
| static final int T_DOLLAR = 12; // '$' |
| static final int T_LPAREN2 = 13; // '(?:' |
| static final int T_LOOKAHEAD = 14; // '(?=' |
| static final int T_NEGATIVELOOKAHEAD = 15; // '(?!' |
| static final int T_LOOKBEHIND = 16; // '(?<=' |
| static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!' |
| static final int T_INDEPENDENT = 18; // '(?>' |
| static final int T_SET_OPERATIONS = 19; // '(?[' |
| static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class |
| static final int T_COMMENT = 21; // '(?#' |
| static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z] |
| static final int T_CONDITION = 23; // '(?(' |
| static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class |
| |
| static class ReferencePosition { |
| int refNumber; |
| int position; |
| ReferencePosition(int n, int pos) { |
| this.refNumber = n; |
| this.position = pos; |
| } |
| } |
| |
| int offset; |
| String regex; |
| int regexlen; |
| int options; |
| ResourceBundle resources; |
| int chardata; |
| int nexttoken; |
| static protected final int S_NORMAL = 0; |
| static protected final int S_INBRACKETS = 1; |
| static protected final int S_INXBRACKETS = 2; |
| int context = S_NORMAL; |
| int parennumber = 1; |
| boolean hasBackReferences; |
| Vector references = null; |
| |
| public RegexParser() { |
| this.setLocale(Locale.getDefault()); |
| } |
| public RegexParser(Locale locale) { |
| this.setLocale(locale); |
| } |
| |
| public void setLocale(Locale locale) { |
| try { |
| this.resources = ResourceBundle.getBundle("org.apache.xmlbeans.impl.regex.message", locale); |
| } catch (MissingResourceException mre) { |
| throw new RuntimeException("Installation Problem??? Couldn't load messages: " |
| +mre.getMessage()); |
| } |
| } |
| |
| final ParseException ex(String key, int loc) { |
| return new ParseException(this.resources.getString(key), loc); |
| } |
| |
| private final boolean isSet(int flag) { |
| return (this.options & flag) == flag; |
| } |
| |
| synchronized Token parse(String regex, int options) throws ParseException { |
| this.options = options; |
| this.offset = 0; |
| this.setContext(S_NORMAL); |
| this.parennumber = 1; |
| this.hasBackReferences = false; |
| this.regex = regex; |
| if (this.isSet(RegularExpression.EXTENDED_COMMENT)) |
| this.regex = REUtil.stripExtendedComment(this.regex); |
| this.regexlen = this.regex.length(); |
| |
| |
| this.next(); |
| Token ret = this.parseRegex(); |
| if (this.offset != this.regexlen) |
| throw ex("parser.parse.1", this.offset); |
| if (this.references != null) { |
| for (int i = 0; i < this.references.size(); i ++) { |
| ReferencePosition position = (ReferencePosition)this.references.elementAt(i); |
| if (this.parennumber <= position.refNumber) |
| throw ex("parser.parse.2", position.position); |
| } |
| this.references.removeAllElements(); |
| } |
| return ret; |
| } |
| |
| /* |
| public RegularExpression createRegex(String regex, int options) throws ParseException { |
| Token tok = this.parse(regex, options); |
| return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options); |
| } |
| */ |
| |
| protected final void setContext(int con) { |
| this.context = con; |
| } |
| |
| final int read() { |
| return this.nexttoken; |
| } |
| |
| final void next() { |
| if (this.offset >= this.regexlen) { |
| this.chardata = -1; |
| this.nexttoken = T_EOF; |
| return; |
| } |
| |
| int ret; |
| int ch = this.regex.charAt(this.offset++); |
| this.chardata = ch; |
| |
| if (this.context == S_INBRACKETS) { |
| // In a character class, this.chardata has one character, that is to say, |
| // a pair of surrogates is composed and stored to this.chardata. |
| switch (ch) { |
| case '\\': |
| ret = T_BACKSOLIDUS; |
| if (this.offset >= this.regexlen) |
| throw ex("parser.next.1", this.offset-1); |
| this.chardata = this.regex.charAt(this.offset++); |
| break; |
| |
| case '-': |
| if (this.isSet(RegularExpression.XMLSCHEMA_MODE) |
| && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') { |
| this.offset++; |
| ret = T_XMLSCHEMA_CC_SUBTRACTION; |
| } else |
| ret = T_CHAR; |
| break; |
| |
| case '[': |
| if (!this.isSet(RegularExpression.XMLSCHEMA_MODE) |
| && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') { |
| this.offset++; |
| ret = T_POSIX_CHARCLASS_START; |
| break; |
| } // Through down |
| default: |
| if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) { |
| int low = this.regex.charAt(this.offset); |
| if (REUtil.isLowSurrogate(low)) { |
| this.chardata = REUtil.composeFromSurrogates(ch, low); |
| this.offset ++; |
| } |
| } |
| ret = T_CHAR; |
| } |
| this.nexttoken = ret; |
| return; |
| } |
| |
| switch (ch) { |
| case '|': ret = T_OR; break; |
| case '*': ret = T_STAR; break; |
| case '+': ret = T_PLUS; break; |
| case '?': ret = T_QUESTION; break; |
| case ')': ret = T_RPAREN; break; |
| case '.': ret = T_DOT; break; |
| case '[': ret = T_LBRACKET; break; |
| case '^': ret = T_CARET; break; |
| case '$': ret = T_DOLLAR; break; |
| case '(': |
| ret = T_LPAREN; |
| if (this.offset >= this.regexlen) |
| break; |
| if (this.regex.charAt(this.offset) != '?') |
| break; |
| if (++this.offset >= this.regexlen) |
| throw ex("parser.next.2", this.offset-1); |
| ch = this.regex.charAt(this.offset++); |
| switch (ch) { |
| case ':': ret = T_LPAREN2; break; |
| case '=': ret = T_LOOKAHEAD; break; |
| case '!': ret = T_NEGATIVELOOKAHEAD; break; |
| case '[': ret = T_SET_OPERATIONS; break; |
| case '>': ret = T_INDEPENDENT; break; |
| case '<': |
| if (this.offset >= this.regexlen) |
| throw ex("parser.next.2", this.offset-3); |
| ch = this.regex.charAt(this.offset++); |
| if (ch == '=') { |
| ret = T_LOOKBEHIND; |
| } else if (ch == '!') { |
| ret = T_NEGATIVELOOKBEHIND; |
| } else |
| throw ex("parser.next.3", this.offset-3); |
| break; |
| case '#': |
| while (this.offset < this.regexlen) { |
| ch = this.regex.charAt(this.offset++); |
| if (ch == ')') break; |
| } |
| if (ch != ')') |
| throw ex("parser.next.4", this.offset-1); |
| ret = T_COMMENT; |
| break; |
| default: |
| if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options |
| this.offset --; |
| ret = T_MODIFIERS; |
| break; |
| } else if (ch == '(') { // conditional |
| ret = T_CONDITION; // this.offsets points the next of '('. |
| break; |
| } |
| throw ex("parser.next.2", this.offset-2); |
| } |
| break; |
| |
| case '\\': |
| ret = T_BACKSOLIDUS; |
| if (this.offset >= this.regexlen) |
| throw ex("parser.next.1", this.offset-1); |
| this.chardata = this.regex.charAt(this.offset++); |
| break; |
| |
| default: |
| ret = T_CHAR; |
| } |
| this.nexttoken = ret; |
| } |
| |
| /** |
| * regex ::= term (`|` term)* |
| * term ::= factor+ |
| * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' |
| * | atom (('*' | '+' | '?' | minmax ) '?'? )?) |
| * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')' |
| * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9] |
| * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block |
| */ |
| Token parseRegex() throws ParseException { |
| Token tok = this.parseTerm(); |
| Token parent = null; |
| while (this.read() == T_OR) { |
| this.next(); // '|' |
| if (parent == null) { |
| parent = Token.createUnion(); |
| parent.addChild(tok); |
| tok = parent; |
| } |
| tok.addChild(this.parseTerm()); |
| } |
| return tok; |
| } |
| |
| /** |
| * term ::= factor+ |
| */ |
| Token parseTerm() throws ParseException { |
| int ch = this.read(); |
| if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) { |
| return Token.createEmpty(); |
| } else { |
| Token tok = this.parseFactor(); |
| Token concat = null; |
| while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) { |
| if (concat == null) { |
| concat = Token.createConcat(); |
| concat.addChild(tok); |
| tok = concat; |
| } |
| concat.addChild(this.parseFactor()); |
| //tok = Token.createConcat(tok, this.parseFactor()); |
| } |
| return tok; |
| } |
| } |
| |
| // ---------------------------------------------------------------- |
| |
| Token processCaret() throws ParseException { |
| this.next(); |
| return Token.token_linebeginning; |
| } |
| Token processDollar() throws ParseException { |
| this.next(); |
| return Token.token_lineend; |
| } |
| Token processLookahead() throws ParseException { |
| this.next(); |
| Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex()); |
| if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| this.next(); // ')' |
| return tok; |
| } |
| Token processNegativelookahead() throws ParseException { |
| this.next(); |
| Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex()); |
| if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| this.next(); // ')' |
| return tok; |
| } |
| Token processLookbehind() throws ParseException { |
| this.next(); |
| Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex()); |
| if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| this.next(); // ')' |
| return tok; |
| } |
| Token processNegativelookbehind() throws ParseException { |
| this.next(); |
| Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex()); |
| if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| this.next(); // ')' |
| return tok; |
| } |
| Token processBacksolidus_A() throws ParseException { |
| this.next(); |
| return Token.token_stringbeginning; |
| } |
| Token processBacksolidus_Z() throws ParseException { |
| this.next(); |
| return Token.token_stringend2; |
| } |
| Token processBacksolidus_z() throws ParseException { |
| this.next(); |
| return Token.token_stringend; |
| } |
| Token processBacksolidus_b() throws ParseException { |
| this.next(); |
| return Token.token_wordedge; |
| } |
| Token processBacksolidus_B() throws ParseException { |
| this.next(); |
| return Token.token_not_wordedge; |
| } |
| Token processBacksolidus_lt() throws ParseException { |
| this.next(); |
| return Token.token_wordbeginning; |
| } |
| Token processBacksolidus_gt() throws ParseException { |
| this.next(); |
| return Token.token_wordend; |
| } |
| Token processStar(Token tok) throws ParseException { |
| this.next(); |
| if (this.read() == T_QUESTION) { |
| this.next(); |
| return Token.createNGClosure(tok); |
| } else |
| return Token.createClosure(tok); |
| } |
| Token processPlus(Token tok) throws ParseException { |
| // X+ -> XX* |
| this.next(); |
| if (this.read() == T_QUESTION) { |
| this.next(); |
| return Token.createConcat(tok, Token.createNGClosure(tok)); |
| } else |
| return Token.createConcat(tok, Token.createClosure(tok)); |
| } |
| Token processQuestion(Token tok) throws ParseException { |
| // X? -> X| |
| this.next(); |
| Token par = Token.createUnion(); |
| if (this.read() == T_QUESTION) { |
| this.next(); |
| par.addChild(Token.createEmpty()); |
| par.addChild(tok); |
| } else { |
| par.addChild(tok); |
| par.addChild(Token.createEmpty()); |
| } |
| return par; |
| } |
| boolean checkQuestion(int off) { |
| return off < this.regexlen && this.regex.charAt(off) == '?'; |
| } |
| Token processParen() throws ParseException { |
| this.next(); |
| int p = this.parennumber++; |
| Token tok = Token.createParen(this.parseRegex(), p); |
| if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| this.next(); // Skips ')' |
| return tok; |
| } |
| Token processParen2() throws ParseException { |
| this.next(); |
| Token tok = Token.createParen(this.parseRegex(), 0); |
| if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| this.next(); // Skips ')' |
| return tok; |
| } |
| Token processCondition() throws ParseException { |
| // this.offset points the next of '(' |
| if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset); |
| // Parses a condition. |
| int refno = -1; |
| Token condition = null; |
| int ch = this.regex.charAt(this.offset); |
| if ('1' <= ch && ch <= '9') { |
| refno = ch-'0'; |
| this.hasBackReferences = true; |
| if (this.references == null) this.references = new Vector(); |
| this.references.addElement(new ReferencePosition(refno, this.offset)); |
| this.offset ++; |
| if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset); |
| this.offset ++; |
| } else { |
| if (ch == '?') this.offset --; // Points '('. |
| this.next(); |
| condition = this.parseFactor(); |
| switch (condition.type) { |
| case Token.LOOKAHEAD: |
| case Token.NEGATIVELOOKAHEAD: |
| case Token.LOOKBEHIND: |
| case Token.NEGATIVELOOKBEHIND: |
| break; |
| case Token.ANCHOR: |
| if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| break; |
| default: |
| throw ex("parser.factor.5", this.offset); |
| } |
| } |
| // Parses yes/no-patterns. |
| this.next(); |
| Token yesPattern = this.parseRegex(); |
| Token noPattern = null; |
| if (yesPattern.type == Token.UNION) { |
| if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset); |
| noPattern = yesPattern.getChild(1); |
| yesPattern = yesPattern.getChild(0); |
| } |
| if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| this.next(); |
| return Token.createCondition(refno, condition, yesPattern, noPattern); |
| } |
| Token processModifiers() throws ParseException { |
| // this.offset points the next of '?'. |
| // modifiers ::= [imsw]* ('-' [imsw]*)? ':' |
| int add = 0, mask = 0, ch = -1; |
| while (this.offset < this.regexlen) { |
| ch = this.regex.charAt(this.offset); |
| int v = REUtil.getOptionValue(ch); |
| if (v == 0) break; // '-' or ':'? |
| add |= v; |
| this.offset ++; |
| } |
| if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); |
| if (ch == '-') { |
| this.offset ++; |
| while (this.offset < this.regexlen) { |
| ch = this.regex.charAt(this.offset); |
| int v = REUtil.getOptionValue(ch); |
| if (v == 0) break; // ':'? |
| mask |= v; |
| this.offset ++; |
| } |
| if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1); |
| } |
| Token tok; |
| if (ch == ':') { |
| this.offset ++; |
| this.next(); |
| tok = Token.createModifierGroup(this.parseRegex(), add, mask); |
| if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| this.next(); |
| } else if (ch == ')') { // such as (?-i) |
| this.offset ++; |
| this.next(); |
| tok = Token.createModifierGroup(this.parseRegex(), add, mask); |
| } else |
| throw ex("parser.factor.3", this.offset); |
| |
| return tok; |
| } |
| Token processIndependent() throws ParseException { |
| this.next(); |
| Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex()); |
| if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1); |
| this.next(); // Skips ')' |
| return tok; |
| } |
| Token processBacksolidus_c() throws ParseException { |
| int ch2; // Must be in 0x0040-0x005f |
| if (this.offset >= this.regexlen |
| || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040) |
| throw ex("parser.atom.1", this.offset-1); |
| this.next(); |
| return Token.createChar(ch2-0x40); |
| } |
| Token processBacksolidus_C() throws ParseException { |
| throw ex("parser.process.1", this.offset); |
| } |
| Token processBacksolidus_i() throws ParseException { |
| Token tok = Token.createChar('i'); |
| this.next(); |
| return tok; |
| } |
| Token processBacksolidus_I() throws ParseException { |
| throw ex("parser.process.1", this.offset); |
| } |
| Token processBacksolidus_g() throws ParseException { |
| this.next(); |
| return Token.getGraphemePattern(); |
| } |
| Token processBacksolidus_X() throws ParseException { |
| this.next(); |
| return Token.getCombiningCharacterSequence(); |
| } |
| Token processBackreference() throws ParseException { |
| int refnum = this.chardata-'0'; |
| Token tok = Token.createBackReference(refnum); |
| this.hasBackReferences = true; |
| if (this.references == null) this.references = new Vector(); |
| this.references.addElement(new ReferencePosition(refnum, this.offset-2)); |
| this.next(); |
| return tok; |
| } |
| |
| // ---------------------------------------------------------------- |
| |
| /** |
| * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>' |
| * | atom (('*' | '+' | '?' | minmax ) '?'? )?) |
| * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')' |
| * | '(?#' [^)]* ')' |
| * minmax ::= '{' min (',' max?)? '}' |
| * min ::= [0-9]+ |
| * max ::= [0-9]+ |
| */ |
| Token parseFactor() throws ParseException { |
| int ch = this.read(); |
| Token tok; |
| switch (ch) { |
| case T_CARET: return this.processCaret(); |
| case T_DOLLAR: return this.processDollar(); |
| case T_LOOKAHEAD: return this.processLookahead(); |
| case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead(); |
| case T_LOOKBEHIND: return this.processLookbehind(); |
| case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind(); |
| |
| case T_COMMENT: |
| this.next(); |
| return Token.createEmpty(); |
| |
| case T_BACKSOLIDUS: |
| switch (this.chardata) { |
| case 'A': return this.processBacksolidus_A(); |
| case 'Z': return this.processBacksolidus_Z(); |
| case 'z': return this.processBacksolidus_z(); |
| case 'b': return this.processBacksolidus_b(); |
| case 'B': return this.processBacksolidus_B(); |
| case '<': return this.processBacksolidus_lt(); |
| case '>': return this.processBacksolidus_gt(); |
| } |
| // through down |
| } |
| tok = this.parseAtom(); |
| ch = this.read(); |
| switch (ch) { |
| case T_STAR: return this.processStar(tok); |
| case T_PLUS: return this.processPlus(tok); |
| case T_QUESTION: return this.processQuestion(tok); |
| case T_CHAR: |
| if (this.chardata == '{') { |
| // this.offset -> next of '{' |
| int off = this.offset; |
| int min = 0, max = -1; |
| if (off >= this.regexlen) break; |
| ch = this.regex.charAt(off++); |
| if (ch != ',' && (ch < '0' || ch > '9')) break; |
| if (ch != ',') { // 0-9 |
| min = ch-'0'; |
| while (off < this.regexlen |
| && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') { |
| min = min*10 +ch-'0'; |
| ch = -1; |
| } |
| if (ch < 0) break; |
| } |
| //if (off >= this.regexlen) break; |
| max = min; |
| if (ch == ',') { |
| if (off >= this.regexlen |
| || ((ch = this.regex.charAt(off++)) < '0' || ch > '9') |
| && ch != '}') |
| break; |
| if (ch == '}') { |
| max = -1; // {min,} |
| } else { |
| max = ch-'0'; // {min,max} |
| while (off < this.regexlen |
| && (ch = this.regex.charAt(off++)) >= '0' |
| && ch <= '9') { |
| max = max*10 +ch-'0'; |
| ch = -1; |
| } |
| if (ch < 0) break; |
| //if (min > max) |
| // throw new ParseException("parseFactor(): min > max: "+min+", "+max); |
| } |
| } |
| if (ch != '}') break; |
| // off -> next of '}' |
| if (this.checkQuestion(off)) { |
| tok = Token.createNGClosure(tok); |
| this.offset = off+1; |
| } else { |
| tok = Token.createClosure(tok); |
| this.offset = off; |
| } |
| tok.setMin(min); |
| tok.setMax(max); |
| //System.err.println("CLOSURE: "+min+", "+max); |
| this.next(); |
| } |
| } |
| return tok; |
| } |
| |
| /** |
| * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9] |
| * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block |
| * | '(?>' regex ')' |
| * char ::= '\\' | '\' [efnrt] | bmp-code | character-1 |
| */ |
| Token parseAtom() throws ParseException { |
| int ch = this.read(); |
| Token tok = null; |
| switch (ch) { |
| case T_LPAREN: return this.processParen(); |
| case T_LPAREN2: return this.processParen2(); // '(?:' |
| case T_CONDITION: return this.processCondition(); // '(?(' |
| case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... ) |
| case T_INDEPENDENT: return this.processIndependent(); |
| case T_DOT: |
| this.next(); // Skips '.' |
| tok = Token.token_dot; |
| break; |
| |
| /** |
| * char-class ::= '[' ( '^'? range ','?)+ ']' |
| * range ::= '\d' | '\w' | '\s' | category-block | range-char |
| * | range-char '-' range-char |
| * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2 |
| * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] |
| */ |
| case T_LBRACKET: return this.parseCharacterClass(true); |
| case T_SET_OPERATIONS: return this.parseSetOperations(); |
| |
| case T_BACKSOLIDUS: |
| switch (this.chardata) { |
| case 'd': case 'D': |
| case 'w': case 'W': |
| case 's': case 'S': |
| tok = this.getTokenForShorthand(this.chardata); |
| this.next(); |
| return tok; |
| |
| case 'e': case 'f': case 'n': case 'r': |
| case 't': case 'u': case 'v': case 'x': |
| { |
| int ch2 = this.decodeEscaped(); |
| if (ch2 < 0x10000) { |
| tok = Token.createChar(ch2); |
| } else { |
| tok = Token.createString(REUtil.decomposeToSurrogates(ch2)); |
| } |
| } |
| break; |
| |
| case 'c': return this.processBacksolidus_c(); |
| case 'C': return this.processBacksolidus_C(); |
| case 'i': return this.processBacksolidus_i(); |
| case 'I': return this.processBacksolidus_I(); |
| case 'g': return this.processBacksolidus_g(); |
| case 'X': return this.processBacksolidus_X(); |
| case '1': case '2': case '3': case '4': |
| case '5': case '6': case '7': case '8': case '9': |
| return this.processBackreference(); |
| |
| case 'P': |
| case 'p': |
| int pstart = this.offset; |
| tok = processBacksolidus_pP(this.chardata); |
| if (tok == null) throw this.ex("parser.atom.5", pstart); |
| break; |
| |
| default: |
| tok = Token.createChar(this.chardata); |
| } |
| this.next(); |
| break; |
| |
| case T_CHAR: |
| tok = Token.createChar(this.chardata); |
| int high = this.chardata; |
| this.next(); |
| if (REUtil.isHighSurrogate(high) |
| && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) { |
| char[] sur = new char[2]; |
| sur[0] = (char)high; |
| sur[1] = (char)this.chardata; |
| tok = Token.createParen(Token.createString(new String(sur)), 0); |
| this.next(); |
| } |
| break; |
| |
| default: |
| throw this.ex("parser.atom.4", this.offset-1); |
| } |
| return tok; |
| } |
| |
| protected RangeToken processBacksolidus_pP(int c) throws ParseException { |
| boolean positive = c == 'p'; |
| this.next(); |
| if (this.read() != T_CHAR) throw this.ex("parser.atom.2", this.offset-1); |
| RangeToken tok; |
| switch (this.chardata) { |
| case 'L': // Letter |
| tok = Token.getRange("L", positive); break; |
| case 'M': // Mark |
| tok = Token.getRange("M", positive); break; |
| case 'N': // Number |
| tok = Token.getRange("N", positive); break; |
| case 'Z': // Separator |
| tok = Token.getRange("Z", positive); break; |
| case 'C': // Other |
| tok = Token.getRange("C", positive); break; |
| case 'P': // Punctuation |
| tok = Token.getRange("P", positive); break; |
| case 'S': // Symbol |
| tok = Token.getRange("S", positive); break; |
| case '{': |
| // this.offset points the next of '{'. |
| //pstart = this.offset; |
| int namestart = this.offset; |
| int nameend = this.regex.indexOf('}', namestart); |
| if (nameend < 0) throw this.ex("parser.atom.3", this.offset); |
| String pname = this.regex.substring(namestart, nameend); |
| this.offset = nameend+1; |
| tok = Token.getRange(pname, positive, |
| this.isSet(RegularExpression.XMLSCHEMA_MODE)); |
| /* |
| if (this.isSet(RegularExpression.IGNORE_CASE)) |
| tok = RangeToken.createCaseInsensitiveToken(tok); |
| */ |
| break; |
| |
| default: |
| throw this.ex("parser.atom.2", this.offset-1); |
| } |
| return tok; |
| } |
| |
| int processCIinCharacterClass(RangeToken tok, int c) { |
| return this.decodeEscaped(); |
| } |
| |
| /** |
| * char-class ::= '[' ( '^'? range ','?)+ ']' |
| * range ::= '\d' | '\w' | '\s' | category-block | range-char |
| * | range-char '-' range-char |
| * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2 |
| * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] |
| */ |
| protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException { |
| this.setContext(S_INBRACKETS); |
| this.next(); // '[' |
| boolean nrange = false; |
| RangeToken base = null; |
| RangeToken tok; |
| if (this.read() == T_CHAR && this.chardata == '^') { |
| nrange = true; |
| this.next(); // '^' |
| if (useNrange) { |
| tok = Token.createNRange(); |
| } else { |
| base = Token.createRange(); |
| base.addRange(0, Token.UTF16_MAX); |
| tok = Token.createRange(); |
| } |
| } else { |
| tok = Token.createRange(); |
| } |
| int type; |
| boolean firstloop = true; |
| while ((type = this.read()) != T_EOF) { |
| if (type == T_CHAR && this.chardata == ']' && !firstloop) |
| break; |
| firstloop = false; |
| int c = this.chardata; |
| boolean end = false; |
| if (type == T_BACKSOLIDUS) { |
| switch (c) { |
| case 'd': case 'D': |
| case 'w': case 'W': |
| case 's': case 'S': |
| tok.mergeRanges(this.getTokenForShorthand(c)); |
| end = true; |
| break; |
| |
| case 'i': case 'I': |
| case 'c': case 'C': |
| c = this.processCIinCharacterClass(tok, c); |
| if (c < 0) end = true; |
| break; |
| |
| case 'p': |
| case 'P': |
| int pstart = this.offset; |
| RangeToken tok2 = this.processBacksolidus_pP(c); |
| if (tok2 == null) throw this.ex("parser.atom.5", pstart); |
| tok.mergeRanges(tok2); |
| end = true; |
| break; |
| |
| default: |
| c = this.decodeEscaped(); |
| } // \ + c |
| } // backsolidus |
| // POSIX Character class such as [:alnum:] |
| else if (type == T_POSIX_CHARCLASS_START) { |
| int nameend = this.regex.indexOf(':', this.offset); |
| if (nameend < 0) throw this.ex("parser.cc.1", this.offset); |
| boolean positive = true; |
| if (this.regex.charAt(this.offset) == '^') { |
| this.offset ++; |
| positive = false; |
| } |
| String name = this.regex.substring(this.offset, nameend); |
| RangeToken range = Token.getRange(name, positive, |
| this.isSet(RegularExpression.XMLSCHEMA_MODE)); |
| if (range == null) throw this.ex("parser.cc.3", this.offset); |
| tok.mergeRanges(range); |
| end = true; |
| if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']') |
| throw this.ex("parser.cc.1", nameend); |
| this.offset = nameend+2; |
| } |
| this.next(); |
| if (!end) { // if not shorthands... |
| if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'. |
| tok.addRange(c, c); |
| } else { |
| this.next(); // Skips '-' |
| if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset); |
| if (type == T_CHAR && this.chardata == ']') { |
| tok.addRange(c, c); |
| tok.addRange('-', '-'); |
| } else { |
| int rangeend = this.chardata; |
| if (type == T_BACKSOLIDUS) |
| rangeend = this.decodeEscaped(); |
| this.next(); |
| tok.addRange(c, rangeend); |
| } |
| } |
| } |
| if (this.isSet(RegularExpression.SPECIAL_COMMA) |
| && this.read() == T_CHAR && this.chardata == ',') |
| this.next(); |
| } |
| if (this.read() == T_EOF) |
| throw this.ex("parser.cc.2", this.offset); |
| if (!useNrange && nrange) { |
| base.subtractRanges(tok); |
| tok = base; |
| } |
| tok.sortRanges(); |
| tok.compactRanges(); |
| //tok.dumpRanges(); |
| /* |
| if (this.isSet(RegularExpression.IGNORE_CASE)) |
| tok = RangeToken.createCaseInsensitiveToken(tok); |
| */ |
| this.setContext(S_NORMAL); |
| this.next(); // Skips ']' |
| |
| return tok; |
| } |
| |
| /** |
| * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')' |
| */ |
| protected RangeToken parseSetOperations() throws ParseException { |
| RangeToken tok = this.parseCharacterClass(false); |
| int type; |
| while ((type = this.read()) != T_RPAREN) { |
| int ch = this.chardata; |
| if (type == T_CHAR && (ch == '-' || ch == '&') |
| || type == T_PLUS) { |
| this.next(); |
| if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1); |
| RangeToken t2 = this.parseCharacterClass(false); |
| if (type == T_PLUS) |
| tok.mergeRanges(t2); |
| else if (ch == '-') |
| tok.subtractRanges(t2); |
| else if (ch == '&') |
| tok.intersectRanges(t2); |
| else |
| throw new RuntimeException("ASSERT"); |
| } else { |
| throw ex("parser.ope.2", this.offset-1); |
| } |
| } |
| this.next(); |
| return tok; |
| } |
| |
| Token getTokenForShorthand(int ch) { |
| Token tok; |
| switch (ch) { |
| case 'd': |
| tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) |
| ? Token.getRange("Nd", true) : Token.token_0to9; |
| break; |
| case 'D': |
| tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) |
| ? Token.getRange("Nd", false) : Token.token_not_0to9; |
| break; |
| case 'w': |
| tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) |
| ? Token.getRange("IsWord", true) : Token.token_wordchars; |
| break; |
| case 'W': |
| tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) |
| ? Token.getRange("IsWord", false) : Token.token_not_wordchars; |
| break; |
| case 's': |
| tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) |
| ? Token.getRange("IsSpace", true) : Token.token_spaces; |
| break; |
| case 'S': |
| tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY) |
| ? Token.getRange("IsSpace", false) : Token.token_not_spaces; |
| break; |
| |
| default: |
| throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16)); |
| } |
| return tok; |
| } |
| |
| /** |
| */ |
| int decodeEscaped() throws ParseException { |
| if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1); |
| int c = this.chardata; |
| switch (c) { |
| case 'e': c = 0x1b; break; // ESCAPE U+001B |
| case 'f': c = '\f'; break; // FORM FEED U+000C |
| case 'n': c = '\n'; break; // LINE FEED U+000A |
| case 'r': c = '\r'; break; // CRRIAGE RETURN U+000D |
| case 't': c = '\t'; break; // HORIZONTAL TABULATION U+0009 |
| //case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B |
| case 'x': |
| this.next(); |
| if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); |
| if (this.chardata == '{') { |
| int v1 = 0; |
| int uv = 0; |
| do { |
| this.next(); |
| if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1); |
| if ((v1 = hexChar(this.chardata)) < 0) |
| break; |
| if (uv > uv*16) throw ex("parser.descape.2", this.offset-1); |
| uv = uv*16+v1; |
| } while (true); |
| if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1); |
| if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1); |
| c = uv; |
| } else { |
| int v1 = 0; |
| if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| throw ex("parser.descape.1", this.offset-1); |
| int uv = v1; |
| this.next(); |
| if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| throw ex("parser.descape.1", this.offset-1); |
| uv = uv*16+v1; |
| c = uv; |
| } |
| break; |
| |
| case 'u': |
| int v1 = 0; |
| this.next(); |
| if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| throw ex("parser.descape.1", this.offset-1); |
| int uv = v1; |
| this.next(); |
| if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| throw ex("parser.descape.1", this.offset-1); |
| uv = uv*16+v1; |
| this.next(); |
| if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| throw ex("parser.descape.1", this.offset-1); |
| uv = uv*16+v1; |
| this.next(); |
| if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| throw ex("parser.descape.1", this.offset-1); |
| uv = uv*16+v1; |
| c = uv; |
| break; |
| |
| case 'v': |
| this.next(); |
| if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| throw ex("parser.descape.1", this.offset-1); |
| uv = v1; |
| this.next(); |
| if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| throw ex("parser.descape.1", this.offset-1); |
| uv = uv*16+v1; |
| this.next(); |
| if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| throw ex("parser.descape.1", this.offset-1); |
| uv = uv*16+v1; |
| this.next(); |
| if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| throw ex("parser.descape.1", this.offset-1); |
| uv = uv*16+v1; |
| this.next(); |
| if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| throw ex("parser.descape.1", this.offset-1); |
| uv = uv*16+v1; |
| this.next(); |
| if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0) |
| throw ex("parser.descape.1", this.offset-1); |
| uv = uv*16+v1; |
| if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1); |
| c = uv; |
| break; |
| case 'A': |
| case 'Z': |
| case 'z': |
| throw ex("parser.descape.5", this.offset-2); |
| default: |
| } |
| return c; |
| } |
| |
| static private final int hexChar(int ch) { |
| if (ch < '0') return -1; |
| if (ch > 'f') return -1; |
| if (ch <= '9') return ch-'0'; |
| if (ch < 'A') return -1; |
| if (ch <= 'F') return ch-'A'+10; |
| if (ch < 'a') return -1; |
| return ch-'a'+10; |
| } |
| } |