| Index: lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
|
| ===================================================================
|
| --- lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 956028)
|
| +++ lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (working copy)
|
| @@ -103,7 +103,7 @@
|
| // build a cache of sorted transitions for every state |
| allTransitions = new Transition[runAutomaton.getSize()][]; |
| for (State state : this.automaton.getNumberedStates()) { |
| - state.sortTransitions(Transition.CompareByMinMaxThenDestUTF8InUTF16Order); |
| + state.sortTransitions(Transition.CompareByMinMaxThenDest); |
| state.trimTransitionsArray(); |
| allTransitions[state.getNumber()] = state.transitionsArray; |
| } |
| @@ -158,11 +158,7 @@
|
| // seek to the next possible string; |
| if (nextString()) { |
| // reposition |
| - |
| - // FIXME: this is really bad to turn off |
| - // but it cannot work correctly until terms are in utf8 order. |
| - linear = false; |
| - |
| + |
| if (linear) |
| setLinear(infinitePosition); |
| return seekBytesRef; |
| @@ -188,15 +184,15 @@
|
| } |
| for (int i = 0; i < allTransitions[state].length; i++) { |
| Transition t = allTransitions[state][i]; |
| - if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 && |
| - compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) { |
| + if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) && |
| + (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) { |
| maxInterval = t.getMax(); |
| break; |
| } |
| } |
| - // 0xef terms don't get the optimization... not worth the trouble. |
| - if (maxInterval != 0xef) |
| - maxInterval = incrementUTF16(maxInterval); |
| + // 0xff terms don't get the optimization... not worth the trouble. |
| + if (maxInterval != 0xff) |
| + maxInterval = incrementUTF8(maxInterval); |
| int length = position + 1; /* position + maxTransition */ |
| if (linearUpperBound.bytes.length < length) |
| linearUpperBound.bytes = new byte[length]; |
| @@ -281,7 +277,7 @@
|
| // if the next character is U+FFFF and is not part of the useful portion, |
| // then by definition it puts us in a reject state, and therefore this |
| // path is dead. there cannot be any higher transitions. backtrack. |
| - c = incrementUTF16(c); |
| + c = incrementUTF8(c); |
| if (c == -1) |
| return false; |
| } |
| @@ -295,8 +291,8 @@
|
| |
| for (int i = 0; i < transitions.length; i++) { |
| Transition transition = transitions[i]; |
| - if (compareToUTF16(transition.getMax(), c) >= 0) { |
| - int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? c : transition.getMin(); |
| + if (transition.getMax() >= c) { |
| + int nextChar = Math.max(c, transition.getMin()); |
| // append either the next sequential char, or the minimum transition |
| seekBytesRef.grow(seekBytesRef.length + 1); |
| seekBytesRef.length++; |
| @@ -342,9 +338,9 @@
|
| private boolean backtrack(int position) { |
| while (position > 0) { |
| int nextChar = seekBytesRef.bytes[position - 1] & 0xff; |
| - // if a character is 0xef its a dead-end too, |
| - // because there is no higher character in UTF-16 sort order. |
| - nextChar = incrementUTF16(nextChar); |
| + // if a character is 0xff its a dead-end too, |
| + // because there is no higher character in UTF-8 sort order. |
| + nextChar = incrementUTF8(nextChar); |
| if (nextChar != -1) { |
| seekBytesRef.bytes[position - 1] = (byte) nextChar; |
| seekBytesRef.length = position; |
| @@ -355,34 +351,11 @@
|
| return false; /* all solutions exhausted */ |
| } |
| |
| - /* return the next utf8 byte in utf16 order, or -1 if exhausted */ |
| - private final int incrementUTF16(int utf8) { |
| + /* return the next utf8 byte in utf8 order, or -1 if exhausted */ |
| + private final int incrementUTF8(int utf8) { |
| switch(utf8) { |
| - case 0xed: return 0xf0; |
| - case 0xfd: return 0xee; |
| - case 0xee: return 0xef; |
| - case 0xef: return -1; |
| + case 0xff: return -1; |
| default: return utf8 + 1; |
| } |
| } |
| - |
| - int compareToUTF16(int aByte, int bByte) { |
| - if (aByte != bByte) { |
| - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order |
| - |
| - // We know the terms are not equal, but, we may |
| - // have to carefully fixup the bytes at the |
| - // difference to match UTF16's sort order: |
| - if (aByte >= 0xee && bByte >= 0xee) { |
| - if ((aByte & 0xfe) == 0xee) { |
| - aByte += 0x10; |
| - } |
| - if ((bByte&0xfe) == 0xee) { |
| - bByte += 0x10; |
| - } |
| - } |
| - return aByte - bByte; |
| - } |
| - return 0; |
| - } |
| } |
| Index: lucene/src/java/org/apache/lucene/util/automaton/Transition.java
|
| ===================================================================
|
| --- lucene/src/java/org/apache/lucene/util/automaton/Transition.java (revision 956028)
|
| +++ lucene/src/java/org/apache/lucene/util/automaton/Transition.java (working copy)
|
| @@ -210,64 +210,4 @@
|
| } |
| |
| public static final Comparator<Transition> CompareByMinMaxThenDest = new CompareByMinMaxThenDestSingle(); |
| - |
| - private static class UTF8InUTF16Order { |
| - protected int compareCodePoint(int aByte, int bByte) { |
| - if (aByte != bByte) { |
| - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order |
| - |
| - // We know the terms are not equal, but, we may |
| - // have to carefully fixup the bytes at the |
| - // difference to match UTF16's sort order: |
| - if (aByte >= 0xee && bByte >= 0xee) { |
| - if ((aByte & 0xfe) == 0xee) { |
| - aByte += 0x10; |
| - } |
| - if ((bByte&0xfe) == 0xee) { |
| - bByte += 0x10; |
| - } |
| - } |
| - return aByte - bByte; |
| - } |
| - return 0; |
| - } |
| - } |
| - |
| - private static final class CompareByDestThenMinMaxUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> { |
| - public int compare(Transition t1, Transition t2) { |
| - if (t1.to != t2.to) { |
| - if (t1.to == null) return -1; |
| - else if (t2.to == null) return 1; |
| - else if (t1.to.number < t2.to.number) return -1; |
| - else if (t1.to.number > t2.to.number) return 1; |
| - } |
| - int minComp = compareCodePoint(t1.min, t2.min); |
| - if (minComp != 0) return minComp; |
| - int maxComp = compareCodePoint(t1.max, t2.max); |
| - if (maxComp != 0) return maxComp; |
| - return 0; |
| - } |
| - } |
| - |
| - public static final Comparator<Transition> CompareByDestThenMinMaxUTF8InUTF16Order = new CompareByDestThenMinMaxUTF8InUTF16OrderSingle(); |
| - |
| - private static final class CompareByMinMaxThenDestUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> { |
| - public int compare(Transition t1, Transition t2) { |
| - int minComp = compareCodePoint(t1.min, t2.min); |
| - if (minComp != 0) return minComp; |
| - int maxComp = compareCodePoint(t1.max, t2.max); |
| - if (maxComp != 0) return maxComp; |
| - if (t1.to != t2.to) { |
| - if (t1.to == null) return -1; |
| - else if (t2.to == null) return 1; |
| - else if (t1.to.number < t2.to.number) return -1; |
| - else if (t1.to.number > t2.to.number) return 1; |
| - } |
| - return 0; |
| - } |
| - } |
| - |
| - public static final Comparator<Transition> CompareByMinMaxThenDestUTF8InUTF16Order = new CompareByMinMaxThenDestUTF8InUTF16OrderSingle(); |
| - |
| - |
| } |