blob: 80c3a4d10bdf1f2550b2aa22681a2b48a84ddeec [file] [log] [blame]
From 0ce2400e4eb5d08586f7253cbc1324acf39ee407 Mon Sep 17 00:00:00 2001
From: Vincent Arnaud <vincent.arnaud90@laposte.net>
Date: Fri, 23 Feb 2018 12:35:26 +0100
Subject: [PATCH] Add case insensitive
add ignore case
---
.../org/apache/lucene/util/automaton/Automata.java | 14 ++
.../org/apache/lucene/util/automaton/RegExp.java | 204 ++++++++++++++-------
.../apache/lucene/util/automaton/TestRegExp.java | 33 +++-
3 files changed, 182 insertions(+), 69 deletions(-)
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
index 294700b849..5639626dfc 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Automata.java
@@ -108,6 +108,20 @@ final public class Automata {
return makeCharRange(c, c);
}
+ /**
+ * Returns a new automaton that accepts exactly one codepoint: the given
+ * value, its lower-case mapping, or its upper-case mapping.
+ *
+ * @param c the codepoint to match regardless of case
+ * @return an automaton accepting {@code c} and its case variants
+ */
+ public static Automaton makeCharCaseInsensitive(int c) {
+ final int lower = Character.toLowerCase(c);
+ final int upper = Character.toUpperCase(c);
+ Automaton a = Automata.makeChar(c);
+ // Always keep c itself: a title-case codepoint such as U+01C5 equals
+ // neither of its mappings and would otherwise be rejected.
+ if (lower != c) {
+ a = Operations.union(a, Automata.makeChar(lower));
+ }
+ if (upper != c && upper != lower) {
+ a = Operations.union(a, Automata.makeChar(upper));
+ }
+ return a;
+ }
+
/** Appends the specified character to the specified state, returning a new state. */
public static int appendChar(Automaton a, int state, int c) {
int newState = a.createState();
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
index a643ddb5b5..98c41b697b 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java
@@ -351,6 +351,11 @@ public class RegExp {
public static final int INTERVAL = 0x0020;
/**
+ * Syntax flag, enables case-insensitive matching (<tt>(?i)</tt>).
+ */
+ public static final int CASE_INSENSITIVE = 0x0040;
+
+ /**
* Syntax flag, enables all optional regexp syntax.
*/
public static final int ALL = 0xffff;
@@ -371,8 +376,11 @@ public class RegExp {
int flags;
int pos;
- RegExp() {
+ boolean isCaseInsensitive;
+
+ RegExp(final boolean aIsInsensitive) {
this.originalString = null;
+ this.isCaseInsensitive = aIsInsensitive;
}
/**
@@ -400,15 +408,16 @@ public class RegExp {
originalString = s;
flags = syntax_flags;
RegExp e;
- if (s.length() == 0) e = makeString("");
+ if (s.length() == 0) e = makeString("", false);
else {
- e = parseUnionExp();
+ e = parseUnionExp(false);
if (pos < originalString.length()) throw new IllegalArgumentException(
"end-of-string expected at position " + pos);
}
kind = e.kind;
exp1 = e.exp1;
exp2 = e.exp2;
+ isCaseInsensitive = e.isCaseInsensitive;
this.s = e.s;
c = e.c;
min = e.min;
@@ -566,10 +575,29 @@ public class RegExp {
a = MinimizationOperations.minimize(a, maxDeterminizedStates);
break;
case REGEXP_CHAR:
- a = Automata.makeChar(c);
+ // Rely on the node's own marker only: the RegExp(boolean) constructor used
+ // by the static factories never assigns the syntax flags, so
+ // check(CASE_INSENSITIVE) is false on every node built that way.
+ if (this.isCaseInsensitive) {
+ a = Automata.makeCharCaseInsensitive(c);
+ } else {
+ a = Automata.makeChar(c);
+ }
break;
case REGEXP_CHAR_RANGE:
- a = Automata.makeCharRange(from, to);
+ final List<Automaton> automatons = new ArrayList<>();
+ automatons.add(Automata.makeCharRange(from, to));
+ if (this.isCaseInsensitive) {
+ for (int i = from; i <= to; i++) {
+ // The range already accepts i; add only case mappings that fall outside it.
+ // Both mappings are tried so title-case codepoints contribute both variants.
+ final int lower = Character.toLowerCase(i);
+ final int upper = Character.toUpperCase(i);
+ if (lower < from || lower > to) {
+ automatons.add(Automata.makeChar(lower));
+ }
+ if (upper != lower && (upper < from || upper > to)) {
+ automatons.add(Automata.makeChar(upper));
+ }
+ }
+ }
+ if (automatons.size() == 1){
+ a = automatons.get(0);
+ } else {
+ a = Operations.union(automatons);
+ a = MinimizationOperations.minimize(a, maxDeterminizedStates);
+ }
break;
case REGEXP_ANYCHAR:
a = Automata.makeAnyChar();
@@ -578,7 +606,14 @@ public class RegExp {
a = Automata.makeEmpty();
break;
case REGEXP_STRING:
- a = Automata.makeString(s);
+ // An empty string has no first codepoint, so it takes the plain makeString path.
+ if (this.isCaseInsensitive && !s.isEmpty()) {
+ // Step by codepoint, not by char, so a surrogate pair is handed on as one codepoint.
+ int cp = s.codePointAt(0);
+ a = makeCaracterAutomaton(cp);
+ for (int i = Character.charCount(cp); i < s.length(); i += Character.charCount(cp)) {
+ cp = s.codePointAt(i);
+ a = Operations.concatenate(a, makeCaracterAutomaton(cp));
+ }
+ } else {
+ a = Automata.makeString(s);
+ }
break;
case REGEXP_ANYSTRING:
a = Automata.makeAnyString();
@@ -606,7 +641,18 @@ public class RegExp {
}
return a;
}
-
+
+
+ /**
+ * Builds the automaton for one codepoint of a string node: the codepoint
+ * itself, plus its lower- and upper-case mappings when this node is case
+ * insensitive.
+ *
+ * @param c the codepoint to match
+ * @return an automaton accepting {@code c} and, if applicable, its case variants
+ */
+ private Automaton makeCaracterAutomaton(final int c) {
+ Automaton a = Automata.makeChar(c);
+ if (this.isCaseInsensitive) {
+ final int lower = Character.toLowerCase(c);
+ final int upper = Character.toUpperCase(c);
+ // c stays accepted even when it equals neither mapping (title-case codepoints).
+ if (lower != c) {
+ a = Operations.union(a, Automata.makeChar(lower));
+ }
+ if (upper != c && upper != lower) {
+ a = Operations.union(a, Automata.makeChar(upper));
+ }
+ }
+ return a;
+ }
+
private void findLeaves(RegExp exp, Kind kind, List<Automaton> list,
Map<String,Automaton> automata, AutomatonProvider automaton_provider,
int maxDeterminizedStates) {
@@ -854,7 +900,7 @@ public class RegExp {
}
static RegExp makeUnion(RegExp exp1, RegExp exp2) {
- RegExp r = new RegExp();
+ RegExp r = new RegExp(false);
r.kind = Kind.REGEXP_UNION;
r.exp1 = exp1;
r.exp2 = exp2;
@@ -863,9 +909,18 @@ public class RegExp {
static RegExp makeConcatenation(RegExp exp1, RegExp exp2) {
if ((exp1.kind == Kind.REGEXP_CHAR || exp1.kind == Kind.REGEXP_STRING)
- && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) return makeString(
- exp1, exp2);
- RegExp r = new RegExp();
+ && (exp2.kind == Kind.REGEXP_CHAR || exp2.kind == Kind.REGEXP_STRING)) {
+ if (exp1.isCaseInsensitive != exp2.isCaseInsensitive) {
+ final RegExp r = new RegExp(false);
+ r.kind = Kind.REGEXP_CONCATENATION;
+ r.exp1 = exp1;
+ r.exp2 = exp2;
+ return r;
+ } else {
+ return makeString(exp1, exp2);
+ }
+ }
+ RegExp r = new RegExp(false);
r.kind = Kind.REGEXP_CONCATENATION;
if (exp1.kind == Kind.REGEXP_CONCATENATION
&& (exp1.exp2.kind == Kind.REGEXP_CHAR || exp1.exp2.kind == Kind.REGEXP_STRING)
@@ -885,16 +940,19 @@ public class RegExp {
}
static private RegExp makeString(RegExp exp1, RegExp exp2) {
+ if (exp1.isCaseInsensitive != exp2.isCaseInsensitive) {
+ throw new IllegalArgumentException("RegExp" + exp1 + " and " + exp2 + " should have the same case sensitivity to be concatenated.");
+ }
StringBuilder b = new StringBuilder();
if (exp1.kind == Kind.REGEXP_STRING) b.append(exp1.s);
else b.appendCodePoint(exp1.c);
if (exp2.kind == Kind.REGEXP_STRING) b.append(exp2.s);
else b.appendCodePoint(exp2.c);
- return makeString(b.toString());
+ return makeString(b.toString(), exp1.isCaseInsensitive);
}
static RegExp makeIntersection(RegExp exp1, RegExp exp2) {
- RegExp r = new RegExp();
+ RegExp r = new RegExp(false);
r.kind = Kind.REGEXP_INTERSECTION;
r.exp1 = exp1;
r.exp2 = exp2;
@@ -902,21 +960,21 @@ public class RegExp {
}
static RegExp makeOptional(RegExp exp) {
- RegExp r = new RegExp();
+ RegExp r = new RegExp(exp.isCaseInsensitive);
r.kind = Kind.REGEXP_OPTIONAL;
r.exp1 = exp;
return r;
}
static RegExp makeRepeat(RegExp exp) {
- RegExp r = new RegExp();
+ RegExp r = new RegExp(exp.isCaseInsensitive);
r.kind = Kind.REGEXP_REPEAT;
r.exp1 = exp;
return r;
}
static RegExp makeRepeat(RegExp exp, int min) {
- RegExp r = new RegExp();
+ RegExp r = new RegExp(exp.isCaseInsensitive);
r.kind = Kind.REGEXP_REPEAT_MIN;
r.exp1 = exp;
r.min = min;
@@ -924,7 +982,7 @@ public class RegExp {
}
static RegExp makeRepeat(RegExp exp, int min, int max) {
- RegExp r = new RegExp();
+ RegExp r = new RegExp(exp.isCaseInsensitive);
r.kind = Kind.REGEXP_REPEAT_MINMAX;
r.exp1 = exp;
r.min = min;
@@ -933,63 +991,63 @@ public class RegExp {
}
static RegExp makeComplement(RegExp exp) {
- RegExp r = new RegExp();
+ RegExp r = new RegExp(exp.isCaseInsensitive);
r.kind = Kind.REGEXP_COMPLEMENT;
r.exp1 = exp;
return r;
}
- static RegExp makeChar(int c) {
- RegExp r = new RegExp();
+ static RegExp makeChar(int c, boolean isInsensitive) {
+ RegExp r = new RegExp(isInsensitive);
r.kind = Kind.REGEXP_CHAR;
r.c = c;
return r;
}
- static RegExp makeCharRange(int from, int to) {
+ static RegExp makeCharRange(int from, int to, boolean isInsensitive) {
if (from > to)
throw new IllegalArgumentException("invalid range: from (" + from + ") cannot be > to (" + to + ")");
- RegExp r = new RegExp();
+ RegExp r = new RegExp(isInsensitive);
r.kind = Kind.REGEXP_CHAR_RANGE;
r.from = from;
r.to = to;
return r;
}
- static RegExp makeAnyChar() {
- RegExp r = new RegExp();
+ static RegExp makeAnyChar(boolean isInsensitive) {
+ RegExp r = new RegExp(isInsensitive);
r.kind = Kind.REGEXP_ANYCHAR;
return r;
}
- static RegExp makeEmpty() {
- RegExp r = new RegExp();
+ static RegExp makeEmpty(boolean isInsensitive) {
+ RegExp r = new RegExp(isInsensitive);
r.kind = Kind.REGEXP_EMPTY;
return r;
}
- static RegExp makeString(String s) {
- RegExp r = new RegExp();
+ static RegExp makeString(String s, boolean isInsensitive) {
+ RegExp r = new RegExp(isInsensitive);
r.kind = Kind.REGEXP_STRING;
r.s = s;
return r;
}
- static RegExp makeAnyString() {
- RegExp r = new RegExp();
+ static RegExp makeAnyString(boolean isInsensitive) {
+ RegExp r = new RegExp(isInsensitive);
r.kind = Kind.REGEXP_ANYSTRING;
return r;
}
- static RegExp makeAutomaton(String s) {
- RegExp r = new RegExp();
+ static RegExp makeAutomaton(String s, boolean isInsensitive) {
+ RegExp r = new RegExp(isInsensitive);
r.kind = Kind.REGEXP_AUTOMATON;
r.s = s;
return r;
}
- static RegExp makeInterval(int min, int max, int digits) {
- RegExp r = new RegExp();
+ static RegExp makeInterval(int min, int max, int digits, boolean isInsensitive) {
+ RegExp r = new RegExp(isInsensitive);
r.kind = Kind.REGEXP_INTERVAL;
r.min = min;
r.max = max;
@@ -1025,28 +1083,28 @@ public class RegExp {
return (flags & flag) != 0;
}
- final RegExp parseUnionExp() throws IllegalArgumentException {
- RegExp e = parseInterExp();
- if (match('|')) e = makeUnion(e, parseUnionExp());
+ final RegExp parseUnionExp(boolean isInsensitive) throws IllegalArgumentException {
+ RegExp e = parseInterExp(isInsensitive);
+ if (match('|')) e = makeUnion(e, parseUnionExp(isInsensitive));
return e;
}
- final RegExp parseInterExp() throws IllegalArgumentException {
- RegExp e = parseConcatExp();
+ final RegExp parseInterExp(boolean isInsensitive) throws IllegalArgumentException {
+ RegExp e = parseConcatExp(isInsensitive);
if (check(INTERSECTION) && match('&')) e = makeIntersection(e,
- parseInterExp());
+ parseInterExp(isInsensitive));
return e;
}
- final RegExp parseConcatExp() throws IllegalArgumentException {
- RegExp e = parseRepeatExp();
+ final RegExp parseConcatExp(boolean isInsensitive) throws IllegalArgumentException {
+ RegExp e = parseRepeatExp(isInsensitive);
if (more() && !peek(")|") && (!check(INTERSECTION) || !peek("&"))) e = makeConcatenation(
- e, parseConcatExp());
+ e, parseConcatExp(isInsensitive));
return e;
}
- final RegExp parseRepeatExp() throws IllegalArgumentException {
- RegExp e = parseComplExp();
+ final RegExp parseRepeatExp(boolean isInsensitive) throws IllegalArgumentException {
+ RegExp e = parseComplExp(isInsensitive);
while (peek("?*+{")) {
if (match('?')) e = makeOptional(e);
else if (match('*')) e = makeRepeat(e);
@@ -1075,50 +1133,60 @@ public class RegExp {
return e;
}
- final RegExp parseComplExp() throws IllegalArgumentException {
- if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp());
- else return parseCharClassExp();
+ final RegExp parseComplExp(boolean isInsensitive) throws IllegalArgumentException {
+ if (check(COMPLEMENT) && match('~')) return makeComplement(parseComplExp(isInsensitive));
+ else return parseCharClassExp(isInsensitive);
}
- final RegExp parseCharClassExp() throws IllegalArgumentException {
+ final RegExp parseCharClassExp(boolean isInsensitive) throws IllegalArgumentException {
if (match('[')) {
boolean negate = false;
if (match('^')) negate = true;
- RegExp e = parseCharClasses();
- if (negate) e = makeIntersection(makeAnyChar(), makeComplement(e));
+ RegExp e = parseCharClasses(isInsensitive);
+ if (negate) e = makeIntersection(makeAnyChar(isInsensitive), makeComplement(e));
if (!match(']')) throw new IllegalArgumentException(
"expected ']' at position " + pos);
return e;
- } else return parseSimpleExp();
+ } else return parseSimpleExp(isInsensitive);
}
- final RegExp parseCharClasses() throws IllegalArgumentException {
- RegExp e = parseCharClass();
+ final RegExp parseCharClasses(boolean isInsensitive) throws IllegalArgumentException {
+ RegExp e = parseCharClass(isInsensitive);
while (more() && !peek("]"))
- e = makeUnion(e, parseCharClass());
+ e = makeUnion(e, parseCharClass(isInsensitive));
return e;
}
- final RegExp parseCharClass() throws IllegalArgumentException {
+ final RegExp parseCharClass(boolean isInsensitive) throws IllegalArgumentException {
int c = parseCharExp();
- if (match('-')) return makeCharRange(c, parseCharExp());
- else return makeChar(c);
+ if (match('-')) return makeCharRange(c, parseCharExp(), isInsensitive);
+ else return makeChar(c, isInsensitive);
}
- final RegExp parseSimpleExp() throws IllegalArgumentException {
- if (match('.')) return makeAnyChar();
- else if (check(EMPTY) && match('#')) return makeEmpty();
- else if (check(ANYSTRING) && match('@')) return makeAnyString();
+ final RegExp parseSimpleExp(boolean isInsensitive) throws IllegalArgumentException {
+ if (match('.')) return makeAnyChar(isInsensitive);
+ else if (check(EMPTY) && match('#')) return makeEmpty(isInsensitive);
+ else if (check(ANYSTRING) && match('@')) return makeAnyString(isInsensitive);
else if (match('"')) {
int start = pos;
while (more() && !peek("\""))
next();
if (!match('"')) throw new IllegalArgumentException(
"expected '\"' at position " + pos);
- return makeString(originalString.substring(start, pos - 1));
+ return makeString(originalString.substring(start, pos - 1), false);
} else if (match('(')) {
- if (match(')')) return makeString("");
- RegExp e = parseUnionExp();
+ if (match(')')) return makeString("", isInsensitive);
+ // "(?i)" is only special when the CASE_INSENSITIVE syntax flag is set;
+ // otherwise the group is parsed like any other parenthesised expression.
+ if (check(CASE_INSENSITIVE) && match('?')) {
+ if (match('i')) {
+ if (match(')')) {
+ isCaseInsensitive = true;
+ return parseInterExp(true);
+ }
+ pos -= Character.charCount('i'); // not "(?i)": rewind pos past the 'i'
+ }
+ pos -= Character.charCount('?'); // rewind pos past the '?'
+ }
+ RegExp e = parseUnionExp(isInsensitive);
if (!match(')')) throw new IllegalArgumentException(
"expected ')' at position " + pos);
return e;
@@ -1133,7 +1201,7 @@ public class RegExp {
if (i == -1) {
if (!check(AUTOMATON)) throw new IllegalArgumentException(
"interval syntax error at position " + (pos - 1));
- return makeAutomaton(s);
+ return makeAutomaton(s, isInsensitive);
} else {
if (!check(INTERVAL)) throw new IllegalArgumentException(
"illegal identifier at position " + (pos - 1));
@@ -1151,13 +1219,13 @@ public class RegExp {
imin = imax;
imax = t;
}
- return makeInterval(imin, imax, digits);
+ return makeInterval(imin, imax, digits, isInsensitive);
} catch (NumberFormatException e) {
throw new IllegalArgumentException(
"interval syntax error at position " + (pos - 1));
}
}
- } else return makeChar(parseCharExp());
+ } else return makeChar(parseCharExp(), isInsensitive);
}
final int parseCharExp() throws IllegalArgumentException {
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
index 7d24939c34..a6f6d6fbce 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java
@@ -16,7 +16,6 @@
*/
package org.apache.lucene.util.automaton;
-
import org.apache.lucene.util.LuceneTestCase;
public class TestRegExp extends LuceneTestCase {
@@ -83,4 +82,36 @@ public class TestRegExp extends LuceneTestCase {
a = new RegExp("#?").toAutomaton(1000);
assertTrue(a.toString().length() > 0);
}
+
+ /** Checks that {@code (?i)} makes only the part of the pattern after it case insensitive. */
+ public void testWithCaseInsensitive() throws Exception {
+ // (?i) at the start: the whole pattern ignores case.
+ CharacterRunAutomaton run = new CharacterRunAutomaton(new RegExp("(?i)a-pa(zU|io)[O-R]apl").toAutomaton());
+ assertTrue(run.run("a-pazUQapl"));
+ assertTrue(run.run("a-pazUQaPl"));
+ assertTrue(run.run("a-pazUpaPl"));
+ assertTrue(run.run("a-paIOpaPl"));
+ assertTrue(run.run("A-paIOpaPl"));
+ assertFalse(run.run("o"));
+ // (?i) after the "a-pa" prefix: the prefix stays case sensitive.
+ run = new CharacterRunAutomaton(new RegExp("a-pa(?i)(zU|io)[O-R]apl").toAutomaton());
+ assertTrue(run.run("a-pazUQapl"));
+ assertTrue(run.run("a-pazUQaPl"));
+ assertTrue(run.run("a-pazUpaPl"));
+ assertTrue(run.run("a-paIOpaPl"));
+ assertFalse(run.run("A-paIOpaPl"));
+ assertFalse(run.run("o"));
+ // (?i) after the alternation: "(zU|io)" stays case sensitive as well.
+ run = new CharacterRunAutomaton(new RegExp("a-pa(zU|io)(?i)[O-R]apl").toAutomaton());
+ assertTrue(run.run("a-pazUQapl"));
+ assertTrue(run.run("a-pazUQaPl"));
+ assertTrue(run.run("a-pazUpaPl"));
+ assertFalse(run.run("a-paIOpaPl"));
+ assertFalse(run.run("A-paIOpaPl"));
+ assertFalse(run.run("o"));
+ // (?i) after the character class: only the trailing "apl" ignores case.
+ run = new CharacterRunAutomaton(new RegExp("a-pa(zU|io)[O-R](?i)apl").toAutomaton());
+ assertTrue(run.run("a-pazUQapl"));
+ assertTrue(run.run("a-pazUQaPl"));
+ assertFalse(run.run("a-pazUpaPl"));
+ assertFalse(run.run("a-paIOpaPl"));
+ assertFalse(run.run("A-paIOpaPl"));
+ assertFalse(run.run("o"));
+ }
+
}
--
2.11.0