blob: e20f791e632cd5af47703d585f1c07092da85ae9 [file] [log] [blame]
Index: lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 956028)
+++ lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (working copy)
@@ -103,7 +103,7 @@
// build a cache of sorted transitions for every state
allTransitions = new Transition[runAutomaton.getSize()][];
for (State state : this.automaton.getNumberedStates()) {
- state.sortTransitions(Transition.CompareByMinMaxThenDestUTF8InUTF16Order);
+ state.sortTransitions(Transition.CompareByMinMaxThenDest);
state.trimTransitionsArray();
allTransitions[state.getNumber()] = state.transitionsArray;
}
@@ -158,11 +158,7 @@
// seek to the next possible string;
if (nextString()) {
// reposition
-
- // FIXME: this is really bad to turn off
- // but it cannot work correctly until terms are in utf8 order.
- linear = false;
-
+
if (linear)
setLinear(infinitePosition);
return seekBytesRef;
@@ -188,15 +184,15 @@
}
for (int i = 0; i < allTransitions[state].length; i++) {
Transition t = allTransitions[state][i];
- if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 &&
- compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) {
+ if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) &&
+ (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) {
maxInterval = t.getMax();
break;
}
}
- // 0xef terms don't get the optimization... not worth the trouble.
- if (maxInterval != 0xef)
- maxInterval = incrementUTF16(maxInterval);
+ // 0xff terms don't get the optimization... not worth the trouble.
+ if (maxInterval != 0xff)
+ maxInterval = incrementUTF8(maxInterval);
int length = position + 1; /* position + maxTransition */
if (linearUpperBound.bytes.length < length)
linearUpperBound.bytes = new byte[length];
@@ -281,7 +277,7 @@
// if the next character is U+FFFF and is not part of the useful portion,
// then by definition it puts us in a reject state, and therefore this
// path is dead. there cannot be any higher transitions. backtrack.
- c = incrementUTF16(c);
+ c = incrementUTF8(c);
if (c == -1)
return false;
}
@@ -295,8 +291,8 @@
for (int i = 0; i < transitions.length; i++) {
Transition transition = transitions[i];
- if (compareToUTF16(transition.getMax(), c) >= 0) {
- int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? c : transition.getMin();
+ if (transition.getMax() >= c) {
+ int nextChar = Math.max(c, transition.getMin());
// append either the next sequential char, or the minimum transition
seekBytesRef.grow(seekBytesRef.length + 1);
seekBytesRef.length++;
@@ -342,9 +338,9 @@
private boolean backtrack(int position) {
while (position > 0) {
int nextChar = seekBytesRef.bytes[position - 1] & 0xff;
- // if a character is 0xef its a dead-end too,
- // because there is no higher character in UTF-16 sort order.
- nextChar = incrementUTF16(nextChar);
+ // if a byte is 0xff it's a dead-end too,
+ // because there is no higher byte in UTF-8 sort order.
+ nextChar = incrementUTF8(nextChar);
if (nextChar != -1) {
seekBytesRef.bytes[position - 1] = (byte) nextChar;
seekBytesRef.length = position;
@@ -355,34 +351,11 @@
return false; /* all solutions exhausted */
}
- /* return the next utf8 byte in utf16 order, or -1 if exhausted */
- private final int incrementUTF16(int utf8) {
+ /* return the next byte in UTF-8 (unsigned byte) order, or -1 if 0xff is exhausted */
+ private final int incrementUTF8(int utf8) {
switch(utf8) {
- case 0xed: return 0xf0;
- case 0xfd: return 0xee;
- case 0xee: return 0xef;
- case 0xef: return -1;
+ case 0xff: return -1;
default: return utf8 + 1;
}
}
-
- int compareToUTF16(int aByte, int bByte) {
- if (aByte != bByte) {
- // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
- // We know the terms are not equal, but, we may
- // have to carefully fixup the bytes at the
- // difference to match UTF16's sort order:
- if (aByte >= 0xee && bByte >= 0xee) {
- if ((aByte & 0xfe) == 0xee) {
- aByte += 0x10;
- }
- if ((bByte&0xfe) == 0xee) {
- bByte += 0x10;
- }
- }
- return aByte - bByte;
- }
- return 0;
- }
}
Index: lucene/src/java/org/apache/lucene/util/automaton/Transition.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/Transition.java (revision 956028)
+++ lucene/src/java/org/apache/lucene/util/automaton/Transition.java (working copy)
@@ -210,64 +210,4 @@
}
public static final Comparator<Transition> CompareByMinMaxThenDest = new CompareByMinMaxThenDestSingle();
-
- private static class UTF8InUTF16Order {
- protected int compareCodePoint(int aByte, int bByte) {
- if (aByte != bByte) {
- // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
- // We know the terms are not equal, but, we may
- // have to carefully fixup the bytes at the
- // difference to match UTF16's sort order:
- if (aByte >= 0xee && bByte >= 0xee) {
- if ((aByte & 0xfe) == 0xee) {
- aByte += 0x10;
- }
- if ((bByte&0xfe) == 0xee) {
- bByte += 0x10;
- }
- }
- return aByte - bByte;
- }
- return 0;
- }
- }
-
- private static final class CompareByDestThenMinMaxUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> {
- public int compare(Transition t1, Transition t2) {
- if (t1.to != t2.to) {
- if (t1.to == null) return -1;
- else if (t2.to == null) return 1;
- else if (t1.to.number < t2.to.number) return -1;
- else if (t1.to.number > t2.to.number) return 1;
- }
- int minComp = compareCodePoint(t1.min, t2.min);
- if (minComp != 0) return minComp;
- int maxComp = compareCodePoint(t1.max, t2.max);
- if (maxComp != 0) return maxComp;
- return 0;
- }
- }
-
- public static final Comparator<Transition> CompareByDestThenMinMaxUTF8InUTF16Order = new CompareByDestThenMinMaxUTF8InUTF16OrderSingle();
-
- private static final class CompareByMinMaxThenDestUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> {
- public int compare(Transition t1, Transition t2) {
- int minComp = compareCodePoint(t1.min, t2.min);
- if (minComp != 0) return minComp;
- int maxComp = compareCodePoint(t1.max, t2.max);
- if (maxComp != 0) return maxComp;
- if (t1.to != t2.to) {
- if (t1.to == null) return -1;
- else if (t2.to == null) return 1;
- else if (t1.to.number < t2.to.number) return -1;
- else if (t1.to.number > t2.to.number) return 1;
- }
- return 0;
- }
- }
-
- public static final Comparator<Transition> CompareByMinMaxThenDestUTF8InUTF16Order = new CompareByMinMaxThenDestUTF8InUTF16OrderSingle();
-
-
}