| |
| From 0ce2400e4eb5d08586f7253cbc1324acf39ee407 Mon Sep 17 00:00:00 2001 |
| From: Vincent Arnaud <vincent.arnaud90@laposte.net> |
| Date: Fri, 23 Feb 2018 12:35:26 +0100 |
| Subject: [PATCH] Add case-insensitive matching to RegExp |
| |
| Add a CASE_INSENSITIVE syntax flag and support for the inline (?i) marker. |
| --- |
| .../org/apache/lucene/util/automaton/Automata.java | 14 ++ |
| .../org/apache/lucene/util/automaton/RegExp.java | 204 ++++++++++++++------- |
| .../apache/lucene/util/automaton/TestRegExp.java | 33 +++- |
| 3 files changed, 182 insertions(+), 69 deletions(-) |
| |
| diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java |
| index 294700b849..5639626dfc 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java |
| +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java |
| @@ -108,6 +108,20 @@ final public class Automata { |
| return makeCharRange(c, c); |
| } |
| |
| + /** |
| + * Returns a new automaton accepting codepoint <code>c</code> and its lower- and upper-case mappings. |
| + */ |
| + public static Automaton makeCharCaseInsensitive(int c) { |
| + final int lower = Character.toLowerCase(c); |
| + final int upper = Character.toUpperCase(c); |
| + if (lower == upper) { |
| + return Automata.makeChar(c); |
| + } |
| + final Automaton bothCases = Operations.union(Automata.makeChar(lower), Automata.makeChar(upper)); |
| + // A title-case codepoint such as U+01C5 equals neither mapping, so it must be added explicitly. |
| + return (c == lower || c == upper) ? bothCases : Operations.union(Automata.makeChar(c), bothCases); |
| + } |
| + |
| /** Appends the specified character to the specified state, returning a new state. */ |
| public static int appendChar(Automaton a, int state, int c) { |
| int newState = a.createState(); |
| diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java |
| index a643ddb5b5..98c41b697b 100644 |
| --- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java |
| +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java |
| @@ -351,6 +351,11 @@ public class RegExp { |
| public static final int INTERVAL = 0x0020; |
| |
| /** |
| + * Syntax flag, enables case insensitive (<tt>(?i)</tt>). |
| + */ |
| + public static final int CASE_INSENSITIVE = 0x0040; |
| + |
| + /** |
| * Syntax flag, enables all optional regexp syntax. |
| */ |
| public static final int ALL = 0xffff; |
| @@ -371,8 +376,11 @@ public class RegExp { |
| int flags; |
| int pos; |
| |
| - RegExp() { |
| + boolean isCaseInsensitive; |
| + |
| + RegExp(final boolean aIsInsensitive) { |
| this.originalString = null; |
| + this.isCaseInsensitive = aIsInsensitive; |
| } |
| |
| /** |
| @@ -400,15 +408,16 @@ public class RegExp { |
| originalString = s; |
| flags = syntax_flags; |
| RegExp e; |
| - if (s.length() == 0) e = makeString(""); |
| + if (s.length() == 0) e = makeString("", false); |
| else { |
| - e = parseUnionExp(); |
| + e = parseUnionExp(false); |
| if (pos < originalString.length()) throw new IllegalArgumentException( |
| "end-of-string expected at position " + pos); |
| } |
| kind = e.kind; |
| exp1 = e.exp1; |
| exp2 = e.exp2; |
| + isCaseInsensitive = e.isCaseInsensitive; |
| this.s = e.s; |
| c = e.c; |
| min = e.min; |
| @@ -566,10 +575,29 @@ public class RegExp { |
| a = MinimizationOperations.minimize(a, maxDeterminizedStates); |
| break; |
| case REGEXP_CHAR: |
| - a = Automata.makeChar(c); |
| + if (this.isCaseInsensitive && check(CASE_INSENSITIVE)) { |
| + a = Automata.makeCharCaseInsensitive(c); |
| + } else { |
| + a = Automata.makeChar(c); |
| + } |
| break; |
| case REGEXP_CHAR_RANGE: |
| - a = Automata.makeCharRange(from, to); |
| + final List<Automaton> automatons = new ArrayList<>(); |
| + automatons.add(Automata.makeCharRange(from, to)); |
| + if (this.isCaseInsensitive) { |
| + for (int i = from; i <= to; i++) { |
| + automatons.add(Automata.makeChar(i)); |
| + if (Character.toLowerCase(i) != Character.toUpperCase(i)) { |
| + automatons.add(Automata.makeChar((i == Character.toLowerCase(i)) ? Character.toUpperCase(i) : Character.toLowerCase(i))); |
| + } |
| + } |
| + } |
| + if (automatons.size() == 1){ |
| + a = automatons.get(0); |
| + } else { |
| + a = Operations.union(automatons); |
| + a = MinimizationOperations.minimize(a, maxDeterminizedStates); |
| + } |
| break; |
| case REGEXP_ANYCHAR: |
| a = Automata.makeAnyChar(); |
| @@ -578,7 +606,14 @@ public class RegExp { |
| a = Automata.makeEmpty(); |
| break; |
| case REGEXP_STRING: |
| - a = Automata.makeString(s); |
| + if (this.isCaseInsensitive && !s.isEmpty()) { // "" has no first character to seed the concatenation |
| + a = makeCaracterAutomaton(s.codePointAt(0)); |
| + for (int i = Character.charCount(s.codePointAt(0)); i < s.length(); i += Character.charCount(s.codePointAt(i))) { |
| + a = Operations.concatenate(a, makeCaracterAutomaton(s.codePointAt(i))); // step by codepoint, not UTF-16 unit |
| + } |
| + } else { |
| + a = Automata.makeString(s); |
| + } |
| break; |
| case REGEXP_ANYSTRING: |
| a = Automata.makeAnyString(); |
| @@ -606,7 +641,18 @@ public class RegExp { |
| } |
| return a; |
| } |
| - |
| + |
| + |
| + /** Builds the automaton for one character of a string node, honoring this node's isCaseInsensitive flag. */ |
| + private Automaton makeCaracterAutomaton(final int c) { |
| + if (this.isCaseInsensitive) { |
| + // Reuse the shared helper instead of duplicating the lower/upper union here. |
| + return Automata.makeCharCaseInsensitive(c); |
| + } |
| + // Case-sensitive node: accept exactly c. |
| + return Automata.makeChar(c); |
| + } |
| + |
| private void findLeaves(RegExp exp, Kind kind, List<Automaton> list, |
| Map<String,Automaton> automata, AutomatonProvider automaton_provider, |
| int maxDeterminizedStates) { |
| @@ -854,7 +900,7 @@ public class RegExp { |
| } |
| |
| static RegExp makeUnion(RegExp exp1, RegExp exp2) { |
| - RegExp r = new RegExp(); |
| + RegExp r = new RegExp(false); |
| r.kind = Kind.REGEXP_UNION; |
| r.exp1 = exp1; |
| r.exp2 = exp2; |
| @@ -863,9 +909,18 @@ public class RegExp { |
| |
| static RegExp makeConcatenation(RegExp exp1, RegExp exp2) { |
| if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING) |
| - && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString( |
| - exp1, exp2); |
| - RegExp r = new RegExp(); |
| + && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) { |
| + if (exp1.isCaseInsensitive != exp2.isCaseInsensitive) { |
| + final RegExp r = new RegExp(false); |
| + r.kind = Kind.REGEXP_CONCATENATION; |
| + r.exp1 = exp1; |
| + r.exp2 = exp2; |
| + return r; |
| + } else { |
| + return makeString(exp1, exp2); |
| + } |
| + } |
| + RegExp r = new RegExp(false); |
| r.kind = Kind.REGEXP_CONCATENATION; |
| if (exp1.kind == Kind.REGEXP_CONCATENATION |
| && (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING) |
| @@ -885,16 +940,19 @@ public class RegExp { |
| } |
| |
| static private RegExp makeString(RegExp exp1, RegExp exp2) { |
| + if (exp1.isCaseInsensitive != exp2.isCaseInsensitive) { |
| + throw new IllegalArgumentException("RegExp" + exp1 + " and " + exp2 + " should have the same case sensitivity to be concatenated."); |
| + } |
| StringBuilder b = new StringBuilder(); |
| if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s); |
| else b.appendCodePoint(exp1.c); |
| if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s); |
| else b.appendCodePoint(exp2.c); |
| - return makeString(b.toString()); |
| + return makeString(b.toString(), exp1.isCaseInsensitive); |
| } |
| |
| static RegExp makeIntersection(RegExp exp1, RegExp exp2) { |
| - RegExp r = new RegExp(); |
| + RegExp r = new RegExp(false); |
| r.kind = Kind.REGEXP_INTERSECTION; |
| r.exp1 = exp1; |
| r.exp2 = exp2; |
| @@ -902,21 +960,21 @@ public class RegExp { |
| } |
| |
| static RegExp makeOptional(RegExp exp) { |
| - RegExp r = new RegExp(); |
| + RegExp r = new RegExp(exp.isCaseInsensitive); |
| r.kind = Kind.REGEXP_OPTIONAL; |
| r.exp1 = exp; |
| return r; |
| } |
| |
| static RegExp makeRepeat(RegExp exp) { |
| - RegExp r = new RegExp(); |
| + RegExp r = new RegExp(exp.isCaseInsensitive); |
| r.kind = Kind.REGEXP_REPEAT; |
| r.exp1 = exp; |
| return r; |
| } |
| |
| static RegExp makeRepeat(RegExp exp, int min) { |
| - RegExp r = new RegExp(); |
| + RegExp r = new RegExp(exp.isCaseInsensitive); |
| r.kind = Kind.REGEXP_REPEAT_MIN; |
| r.exp1 = exp; |
| r.min = min; |
| @@ -924,7 +982,7 @@ public class RegExp { |
| } |
| |
| static RegExp makeRepeat(RegExp exp, int min, int max) { |
| - RegExp r = new RegExp(); |
| + RegExp r = new RegExp(exp.isCaseInsensitive); |
| r.kind = Kind.REGEXP_REPEAT_MINMAX; |
| r.exp1 = exp; |
| r.min = min; |
| @@ -933,63 +991,63 @@ public class RegExp { |
| } |
| |
| static RegExp makeComplement(RegExp exp) { |
| - RegExp r = new RegExp(); |
| + RegExp r = new RegExp(exp.isCaseInsensitive); |
| r.kind = Kind.REGEXP_COMPLEMENT; |
| r.exp1 = exp; |
| return r; |
| } |
| |
| - static RegExp makeChar(int c) { |
| - RegExp r = new RegExp(); |
| + static RegExp makeChar(int c, boolean isInsensitive) { |
| + RegExp r = new RegExp(isInsensitive); |
| r.kind = Kind.REGEXP_CHAR; |
| r.c = c; |
| return r; |
| } |
| |
| - static RegExp makeCharRange(int from, int to) { |
| + static RegExp makeCharRange(int from, int to, boolean isInsensitive) { |
| if (from > to) |
| throw new IllegalArgumentException("invalid range: from (" + from + ") cannot be > to (" + to + ")"); |
| - RegExp r = new RegExp(); |
| + RegExp r = new RegExp(isInsensitive); |
| r.kind = Kind.REGEXP_CHAR_RANGE; |
| r.from = from; |
| r.to = to; |
| return r; |
| } |
| |
| - static RegExp makeAnyChar() { |
| - RegExp r = new RegExp(); |
| + static RegExp makeAnyChar(boolean isInsensitive) { |
| + RegExp r = new RegExp(isInsensitive); |
| r.kind = Kind.REGEXP_ANYCHAR; |
| return r; |
| } |
| |
| - static RegExp makeEmpty() { |
| - RegExp r = new RegExp(); |
| + static RegExp makeEmpty(boolean isInsensitive) { |
| + RegExp r = new RegExp(isInsensitive); |
| r.kind = Kind.REGEXP_EMPTY; |
| return r; |
| } |
| |
| - static RegExp makeString(String s) { |
| - RegExp r = new RegExp(); |
| + static RegExp makeString(String s, boolean isInsensitive) { |
| + RegExp r = new RegExp(isInsensitive); |
| r.kind = Kind.REGEXP_STRING; |
| r.s = s; |
| return r; |
| } |
| |
| - static RegExp makeAnyString() { |
| - RegExp r = new RegExp(); |
| + static RegExp makeAnyString(boolean isInsensitive) { |
| + RegExp r = new RegExp(isInsensitive); |
| r.kind = Kind.REGEXP_ANYSTRING; |
| return r; |
| } |
| |
| - static RegExp makeAutomaton(String s) { |
| - RegExp r = new RegExp(); |
| + static RegExp makeAutomaton(String s, boolean isInsensitive) { |
| + RegExp r = new RegExp(isInsensitive); |
| r.kind = Kind.REGEXP_AUTOMATON; |
| r.s = s; |
| return r; |
| } |
| |
| - static RegExp makeInterval(int min, int max, int digits) { |
| - RegExp r = new RegExp(); |
| + static RegExp makeInterval(int min, int max, int digits, boolean isInsensitive) { |
| + RegExp r = new RegExp(isInsensitive); |
| r.kind = Kind.REGEXP_INTERVAL; |
| r.min = min; |
| r.max = max; |
| @@ -1025,28 +1083,28 @@ public class RegExp { |
| return (flags & flag) != 0; |
| } |
| |
| - final RegExp parseUnionExp() throws IllegalArgumentException { |
| - RegExp e = parseInterExp(); |
| - if (match('|')) e = makeUnion(e, parseUnionExp()); |
| + final RegExp parseUnionExp(boolean isInsensitive) throws IllegalArgumentException { |
| + RegExp e = parseInterExp(isInsensitive); |
| + if (match('|')) e = makeUnion(e, parseUnionExp(isInsensitive)); |
| return e; |
| } |
| |
| - final RegExp parseInterExp() throws IllegalArgumentException { |
| - RegExp e = parseConcatExp(); |
| + final RegExp parseInterExp(boolean isInsensitive) throws IllegalArgumentException { |
| + RegExp e = parseConcatExp(isInsensitive); |
| if (check(INTERSECTION) && match('&')) e = makeIntersection(e, |
| - parseInterExp()); |
| + parseInterExp(isInsensitive)); |
| return e; |
| } |
| |
| - final RegExp parseConcatExp() throws IllegalArgumentException { |
| - RegExp e = parseRepeatExp(); |
| + final RegExp parseConcatExp(boolean isInsensitive) throws IllegalArgumentException { |
| + RegExp e = parseRepeatExp(isInsensitive); |
| if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation( |
| - e, parseConcatExp()); |
| + e, parseConcatExp(isInsensitive)); |
| return e; |
| } |
| |
| - final RegExp parseRepeatExp() throws IllegalArgumentException { |
| - RegExp e = parseComplExp(); |
| + final RegExp parseRepeatExp(boolean isInsensitive) throws IllegalArgumentException { |
| + RegExp e = parseComplExp(isInsensitive); |
| while (peek("?*+{")) { |
| if (match('?')) e = makeOptional(e); |
| else if (match('*')) e = makeRepeat(e); |
| @@ -1075,50 +1133,60 @@ public class RegExp { |
| return e; |
| } |
| |
| - final RegExp parseComplExp() throws IllegalArgumentException { |
| - if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp()); |
| - else return parseCharClassExp(); |
| + final RegExp parseComplExp(boolean isInsensitive) throws IllegalArgumentException { |
| + if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp(isInsensitive)); |
| + else return parseCharClassExp(isInsensitive); |
| } |
| |
| - final RegExp parseCharClassExp() throws IllegalArgumentException { |
| + final RegExp parseCharClassExp(boolean isInsensitive) throws IllegalArgumentException { |
| if (match('[')) { |
| boolean negate = false; |
| if (match('^')) negate = true; |
| - RegExp e = parseCharClasses(); |
| - if (negate) e = makeIntersection(makeAnyChar(), makeComplement(e)); |
| + RegExp e = parseCharClasses(isInsensitive); |
| + if (negate) e = makeIntersection(makeAnyChar(isInsensitive), makeComplement(e)); |
| if (!match(']')) throw new IllegalArgumentException( |
| "expected ']' at position " + pos); |
| return e; |
| - } else return parseSimpleExp(); |
| + } else return parseSimpleExp(isInsensitive); |
| } |
| |
| - final RegExp parseCharClasses() throws IllegalArgumentException { |
| - RegExp e = parseCharClass(); |
| + final RegExp parseCharClasses(boolean isInsensitive) throws IllegalArgumentException { |
| + RegExp e = parseCharClass(isInsensitive); |
| while (more() && !peek("]")) |
| - e = makeUnion(e, parseCharClass()); |
| + e = makeUnion(e, parseCharClass(isInsensitive)); |
| return e; |
| } |
| |
| - final RegExp parseCharClass() throws IllegalArgumentException { |
| + final RegExp parseCharClass(boolean isInsensitive) throws IllegalArgumentException { |
| int c = parseCharExp(); |
| - if (match('-')) return makeCharRange(c, parseCharExp()); |
| - else return makeChar(c); |
| + if (match('-')) return makeCharRange(c, parseCharExp(), isInsensitive); |
| + else return makeChar(c, isInsensitive); |
| } |
| |
| - final RegExp parseSimpleExp() throws IllegalArgumentException { |
| - if (match('.')) return makeAnyChar(); |
| - else if (check(EMPTY) && match('#')) return makeEmpty(); |
| - else if (check(ANYSTRING) && match('@')) return makeAnyString(); |
| + final RegExp parseSimpleExp(boolean isInsensitive) throws IllegalArgumentException { |
| + if (match('.')) return makeAnyChar(isInsensitive); |
| + else if (check(EMPTY) && match('#')) return makeEmpty(isInsensitive); |
| + else if (check(ANYSTRING) && match('@')) return makeAnyString(isInsensitive); |
| else if (match('"')) { |
| int start = pos; |
| while (more() && !peek("\"")) |
| next(); |
| if (!match('"')) throw new IllegalArgumentException( |
| "expected '\"' at position " + pos); |
| - return makeString(originalString.substring(start, pos - 1)); |
| + return makeString(originalString.substring(start, pos - 1), false); |
| } else if (match('(')) { |
| - if (match(')')) return makeString(""); |
| - RegExp e = parseUnionExp(); |
| + if (match(')')) return makeString("", isInsensitive); |
| + if (match('?')) { |
| + if (match('i')) { |
| + if (match(')')) { |
| + isCaseInsensitive = true; |
| + return parseInterExp(true); |
| + } |
| + pos -= Character.charCount('i'); |
| + } |
| + pos -= Character.charCount('?'); |
| + } |
| + RegExp e = parseUnionExp(isInsensitive); |
| if (!match(')')) throw new IllegalArgumentException( |
| "expected ')' at position " + pos); |
| return e; |
| @@ -1133,7 +1201,7 @@ public class RegExp { |
| if (i == -1) { |
| if (!check(AUTOMATON)) throw new IllegalArgumentException( |
| "interval syntax error at position " + (pos - 1)); |
| - return makeAutomaton(s); |
| + return makeAutomaton(s, isInsensitive); |
| } else { |
| if (!check(INTERVAL)) throw new IllegalArgumentException( |
| "illegal identifier at position " + (pos - 1)); |
| @@ -1151,13 +1219,13 @@ public class RegExp { |
| imin = imax; |
| imax = t; |
| } |
| - return makeInterval(imin, imax, digits); |
| + return makeInterval(imin, imax, digits, isInsensitive); |
| } catch (NumberFormatException e) { |
| throw new IllegalArgumentException( |
| "interval syntax error at position " + (pos - 1)); |
| } |
| } |
| - } else return makeChar(parseCharExp()); |
| + } else return makeChar(parseCharExp(), isInsensitive); |
| } |
| |
| final int parseCharExp() throws IllegalArgumentException { |
| diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java |
| index 7d24939c34..a6f6d6fbce 100644 |
| --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java |
| +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java |
| @@ -16,7 +16,6 @@ |
| */ |
| package org.apache.lucene.util.automaton; |
| |
| - |
| import org.apache.lucene.util.LuceneTestCase; |
| |
| public class TestRegExp extends LuceneTestCase { |
| @@ -83,4 +82,36 @@ public class TestRegExp extends LuceneTestCase { |
| a = new RegExp("#?").toAutomaton(1000); |
| assertTrue(a.toString().length() > 0); |
| } |
| + |
| + public void testWithCaseInsensitive() throws Exception { // each group moves (?i) later; only the part after it ignores case |
| + CharacterRunAutomaton run = new CharacterRunAutomaton(new RegExp("(?i)a-pa(zU|io)[O-R]apl").toAutomaton()); |
| + assertTrue(run.run("a-pazUQapl")); |
| + assertTrue(run.run("a-pazUQaPl")); |
| + assertTrue(run.run("a-pazUpaPl")); |
| + assertTrue(run.run("a-paIOpaPl")); |
| + assertTrue(run.run("A-paIOpaPl")); |
| + assertFalse(run.run("o")); |
| + run = new CharacterRunAutomaton(new RegExp("a-pa(?i)(zU|io)[O-R]apl").toAutomaton()); |
| + assertTrue(run.run("a-pazUQapl")); |
| + assertTrue(run.run("a-pazUQaPl")); |
| + assertTrue(run.run("a-pazUpaPl")); |
| + assertTrue(run.run("a-paIOpaPl")); |
| + assertFalse(run.run("A-paIOpaPl")); |
| + assertFalse(run.run("o")); |
| + run = new CharacterRunAutomaton(new RegExp("a-pa(zU|io)(?i)[O-R]apl").toAutomaton()); |
| + assertTrue(run.run("a-pazUQapl")); // moved below the reassignment so it checks this pattern, not the previous one |
| + assertTrue(run.run("a-pazUQaPl")); |
| + assertTrue(run.run("a-pazUpaPl")); |
| + assertFalse(run.run("a-paIOpaPl")); |
| + assertFalse(run.run("A-paIOpaPl")); |
| + assertFalse(run.run("o")); |
| + run = new CharacterRunAutomaton(new RegExp("a-pa(zU|io)[O-R](?i)apl").toAutomaton()); |
| + assertTrue(run.run("a-pazUQapl")); |
| + assertTrue(run.run("a-pazUQaPl")); |
| + assertFalse(run.run("a-pazUpaPl")); |
| + assertFalse(run.run("a-paIOpaPl")); |
| + assertFalse(run.run("A-paIOpaPl")); |
| + assertFalse(run.run("o")); |
| + } |
| + |
| } |
| -- |
| 2.11.0 |
| |