blob: 26c420c4a3bb90a65dfb33d90c3e84102e52ac83 [file] [log] [blame]
Index: lucene/src/test/org/apache/lucene/TestExternalCodecs.java
===================================================================
--- lucene/src/test/org/apache/lucene/TestExternalCodecs.java (revision 956375)
+++ lucene/src/test/org/apache/lucene/TestExternalCodecs.java (working copy)
@@ -179,7 +179,7 @@
@Override
public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
@@ -263,7 +263,7 @@
@Override
public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
Index: lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (revision 956375)
+++ lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (working copy)
@@ -4621,38 +4621,22 @@
private void checkTermsOrder(IndexReader r, Set<String> allTerms, boolean isTop) throws IOException {
TermsEnum terms = MultiFields.getFields(r).terms("f").iterator();
- char[] last = new char[2];
- int lastLength = 0;
+ BytesRef last = new BytesRef();
Set<String> seenTerms = new HashSet<String>();
- UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
while(true) {
final BytesRef term = terms.next();
if (term == null) {
break;
}
- UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
- assertTrue(utf16.length <= 2);
- // Make sure last term comes before current one, in
- // UTF16 sort order
- int i = 0;
- for(i=0;i<lastLength && i<utf16.length;i++) {
- assertTrue("UTF16 code unit " + termDesc(new String(utf16.result, 0, utf16.length)) + " incorrectly sorted after code unit " + termDesc(new String(last, 0, lastLength)), last[i] <= utf16.result[i]);
- if (last[i] < utf16.result[i]) {
- break;
- }
- }
- // Terms should not have been identical
- assertTrue(lastLength != utf16.length || i < lastLength);
+ assertTrue(last.compareTo(term) < 0);
+ last.copy(term);
- final String s = new String(utf16.result, 0, utf16.length);
+ final String s = term.utf8ToString();
assertTrue("term " + termDesc(s) + " was not added to index (count=" + allTerms.size() + ")", allTerms.contains(s));
seenTerms.add(s);
-
- System.arraycopy(utf16.result, 0, last, 0, utf16.length);
- lastLength = utf16.length;
}
if (isTop) {
Index: lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java (revision 956375)
+++ lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java (working copy)
@@ -1,5 +1,22 @@
package org.apache.lucene.index.codecs.intblock;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.store.*;
import org.apache.lucene.index.codecs.sep.*;
@@ -34,7 +51,7 @@
out.close();
IntIndexInput in = new SimpleIntBlockIndexInput(dir, "test", 128);
- IntIndexInput.Reader r = in.reader();
+ in.reader();
// read no ints
in.close();
dir.close();
Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (revision 0)
+++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (revision 0)
@@ -0,0 +1,227 @@
+package org.apache.lucene.index.codecs.preflex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+import org.apache.lucene.store.*;
+import org.apache.lucene.index.*;
+import org.apache.lucene.util.*;
+
+
+/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
+ Directory. A TermInfos can be written once, in order. */
+
+final class TermInfosWriter {
+ /** The file format version, a negative number. */
+ public static final int FORMAT = -3;
+
+ // Changed strings to true utf8 with length-in-bytes not
+ // length-in-chars
+ public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4;
+
+ // NOTE: always change this if you switch to a new format!
+ public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
+
+ private FieldInfos fieldInfos;
+ private IndexOutput output;
+ private TermInfo lastTi = new TermInfo();
+ private long size;
+
+ // TODO: the default values for these two parameters should be settable from
+ // IndexWriter. However, once that's done, folks will start setting them to
+ // ridiculous values and complaining that things don't work well, as with
+ // mergeFactor. So, let's wait until a number of folks find that alternate
+ // values work better. Note that both of these values are stored in the
+ // segment, so that it's safe to change these w/o rebuilding all indexes.
+
+ /** Expert: The fraction of terms in the "dictionary" which should be stored
+ * in RAM. Smaller values use more memory, but make searching slightly
+ * faster, while larger values use less memory and make searching slightly
+ * slower. Searching is typically not dominated by dictionary lookup, so
+ * tweaking this is rarely useful.*/
+ int indexInterval = 128;
+
+ /** Expert: The fraction of {@link TermDocs} entries stored in skip tables,
+ * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in
+ * smaller indexes, greater acceleration, but fewer accelerable cases, while
+ * smaller values result in bigger indexes, less acceleration and more
+ * accelerable cases. More detailed experiments would be useful here. */
+ int skipInterval = 16;
+
+ /** Expert: The maximum number of skip levels. Smaller values result in
+ * slightly smaller indexes, but slower skipping in big posting lists.
+ */
+ int maxSkipLevels = 10;
+
+ private long lastIndexPointer;
+ private boolean isIndex;
+ private byte[] lastTermBytes = new byte[10];
+ private int lastTermBytesLength = 0;
+ private int lastFieldNumber = -1;
+
+ private TermInfosWriter other;
+ private BytesRef utf8Result = new BytesRef(10);
+
+ TermInfosWriter(Directory directory, String segment, FieldInfos fis,
+ int interval)
+ throws IOException {
+ initialize(directory, segment, fis, interval, false);
+ other = new TermInfosWriter(directory, segment, fis, interval, true);
+ other.other = this;
+ }
+
+ private TermInfosWriter(Directory directory, String segment, FieldInfos fis,
+ int interval, boolean isIndex) throws IOException {
+ initialize(directory, segment, fis, interval, isIndex);
+ }
+
+ private void initialize(Directory directory, String segment, FieldInfos fis,
+ int interval, boolean isi) throws IOException {
+ indexInterval = interval;
+ fieldInfos = fis;
+ isIndex = isi;
+ output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis"));
+ output.writeInt(FORMAT_CURRENT); // write format
+ output.writeLong(0); // leave space for size
+ output.writeInt(indexInterval); // write indexInterval
+ output.writeInt(skipInterval); // write skipInterval
+ output.writeInt(maxSkipLevels); // write maxSkipLevels
+ assert initUTF16Results();
+ }
+
+ void add(Term term, TermInfo ti) throws IOException {
+ UnicodeUtil.UTF16toUTF8(term.text(), 0, term.text().length(), utf8Result);
+ add(fieldInfos.fieldNumber(term.field()), utf8Result.bytes, utf8Result.length, ti);
+ }
+
+ // Currently used only by assert statements
+ UnicodeUtil.UTF16Result utf16Result1;
+ UnicodeUtil.UTF16Result utf16Result2;
+
+ // Currently used only by assert statements
+ private boolean initUTF16Results() {
+ utf16Result1 = new UnicodeUtil.UTF16Result();
+ utf16Result2 = new UnicodeUtil.UTF16Result();
+ return true;
+ }
+
+ // Currently used only by assert statement
+ private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) {
+
+ if (lastFieldNumber != fieldNumber) {
+ final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber));
+ // If there is a field named "" (empty string) then we
+ // will get 0 on this comparison, yet, it's "OK". But
+ // it's not OK if two different field numbers map to
+ // the same name.
+ if (cmp != 0 || lastFieldNumber != -1)
+ return cmp;
+ }
+
+ UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
+ UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);
+ final int len;
+ if (utf16Result1.length < utf16Result2.length)
+ len = utf16Result1.length;
+ else
+ len = utf16Result2.length;
+
+ for(int i=0;i<len;i++) {
+ final char ch1 = utf16Result1.result[i];
+ final char ch2 = utf16Result2.result[i];
+ if (ch1 != ch2)
+ return ch1-ch2;
+ }
+ return utf16Result1.length - utf16Result2.length;
+ }
+
+ /** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set.
+ Term must be lexicographically greater than all previous Terms added.
+ TermInfo pointers must be positive and greater than all previous.*/
+ void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
+ throws IOException {
+
+ assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
+ (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) :
+ "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" +
+ " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
+ " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8");
+
+ assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")";
+ assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")";
+
+ if (!isIndex && size % indexInterval == 0)
+ other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term
+
+ writeTerm(fieldNumber, termBytes, termBytesLength); // write term
+
+ output.writeVInt(ti.docFreq); // write doc freq
+ output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
+ output.writeVLong(ti.proxPointer - lastTi.proxPointer);
+
+ if (ti.docFreq >= skipInterval) {
+ output.writeVInt(ti.skipOffset);
+ }
+
+ if (isIndex) {
+ output.writeVLong(other.output.getFilePointer() - lastIndexPointer);
+ lastIndexPointer = other.output.getFilePointer(); // write pointer
+ }
+
+ lastFieldNumber = fieldNumber;
+ lastTi.set(ti);
+ size++;
+ }
+
+ private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
+ throws IOException {
+
+ // TODO: UTF16toUTF8 could tell us this prefix
+ // Compute prefix in common with last term:
+ int start = 0;
+ final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength;
+ while(start < limit) {
+ if (termBytes[start] != lastTermBytes[start])
+ break;
+ start++;
+ }
+
+ final int length = termBytesLength - start;
+ output.writeVInt(start); // write shared prefix length
+ output.writeVInt(length); // write delta length
+ output.writeBytes(termBytes, start, length); // write delta bytes
+ output.writeVInt(fieldNumber); // write field num
+ if (lastTermBytes.length < termBytesLength) {
+ lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength);
+ }
+ System.arraycopy(termBytes, start, lastTermBytes, start, length);
+ lastTermBytesLength = termBytesLength;
+ }
+
+ /** Called to complete TermInfos creation. */
+ void close() throws IOException {
+ output.seek(4); // write size after format
+ output.writeLong(size);
+ output.close();
+
+ if (!isIndex)
+ other.close();
+ }
+
+}
Property changes on: lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 0)
+++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 0)
@@ -0,0 +1,206 @@
+package org.apache.lucene.index.codecs.preflex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.store.*;
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.codecs.*;
+import org.apache.lucene.util.*;
+
+import java.util.*;
+import java.io.IOException;
+
+public class TestSurrogates extends LuceneTestCase {
+
+ private static final boolean DEBUG = false;
+
+ // like Term, but uses BytesRef for text
+ private static class FieldAndText implements Comparable<FieldAndText> {
+ String field;
+ BytesRef text;
+
+ public FieldAndText(Term t) {
+ field = t.field();
+ text = new BytesRef(t.text());
+ }
+
+ public int compareTo(FieldAndText other) {
+ if (other.field == field) {
+ return text.compareTo(other.text);
+ } else {
+ return field.compareTo(other.field);
+ }
+ }
+ }
+
+ // chooses from a very limited alphabet to exacerbate the
+ // surrogate seeking required
+ private static String makeDifficultRandomUnicodeString(Random r) {
+ final int end = r.nextInt(20);
+ if (end == 0) {
+ // allow 0 length
+ return "";
+ }
+ final char[] buffer = new char[end];
+ for (int i = 0; i < end; i++) {
+ int t = r.nextInt(5);
+
+ if (0 == t && i < end - 1) {
+ // hi
+ buffer[i++] = (char) 0xd800;
+ // lo
+ buffer[i] = (char) 0xdc00;
+ } else if (t <= 3) {
+ buffer[i] = 'a';
+ } else if (4 == t) {
+ buffer[i] = 0xe000;
+ }
+ }
+
+ return new String(buffer, 0, end);
+ }
+
+ private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List<FieldAndText> fieldTerms) throws IOException {
+
+ final int numField = _TestUtil.nextInt(r, 2, 5);
+
+ List<Term> terms = new ArrayList<Term>();
+
+ int tc = 0;
+
+ for(int f=0;f<numField;f++) {
+ String field = "f" + f;
+ Term protoTerm = new Term(field);
+
+ fieldInfos.add(field, true, false, false, false, false, false, false);
+ final int numTerms = 1000*_TestUtil.getRandomMultiplier();
+ for(int i=0;i<numTerms;i++) {
+ String s;
+ if (r.nextInt(3) == 1) {
+ s = makeDifficultRandomUnicodeString(r);
+ } else {
+ s = _TestUtil.randomUnicodeString(r);
+ }
+ terms.add(protoTerm.createTerm(s + "_" + (tc++)));
+ }
+ }
+
+ fieldInfos.write(dir, segName);
+
+ // sorts in UTF16 order, just like preflex:
+ Collections.sort(terms);
+
+ TermInfosWriter w = new TermInfosWriter(dir, segName, fieldInfos, 128);
+ TermInfo ti = new TermInfo();
+ BytesRef utf8 = new BytesRef(10);
+ String lastText = null;
+ int uniqueTermCount = 0;
+ if (DEBUG) {
+ System.out.println("TEST: utf16 order:");
+ }
+ for(Term t : terms) {
+ FieldInfo fi = fieldInfos.fieldInfo(t.field());
+
+ String text = t.text();
+ if (lastText != null && lastText.equals(text)) {
+ continue;
+ }
+ fieldTerms.add(new FieldAndText(t));
+ uniqueTermCount++;
+ lastText = text;
+ UnicodeUtil.UTF16toUTF8(text, 0, text.length(), utf8);
+
+ if (DEBUG) {
+ System.out.println(" " + toHexString(t));
+ }
+ w.add(fi.number, utf8.bytes, utf8.length, ti);
+ }
+ w.close();
+
+ Collections.sort(fieldTerms);
+ if (DEBUG) {
+ System.out.println("\nTEST: codepoint order");
+ for(FieldAndText t: fieldTerms) {
+ System.out.println(" " + t.field + ":" + UnicodeUtil.toHexString(t.text.utf8ToString()));
+ }
+ }
+
+ dir.createOutput(segName + ".prx").close();
+ dir.createOutput(segName + ".frq").close();
+
+ // !!hack alert!! stuffing uniqueTermCount in as docCount
+ return new SegmentInfo(segName, uniqueTermCount, dir, false, -1, null, false, true, codec);
+ }
+
+ private String toHexString(Term t) {
+ return t.field() + ":" + UnicodeUtil.toHexString(t.text());
+ }
+
+ public void testSurrogatesOrder() throws Exception {
+ Directory dir = new MockRAMDirectory();
+
+ Codec codec = new PreFlexCodec();
+
+ Random r = newRandom();
+ FieldInfos fieldInfos = new FieldInfos();
+ List<FieldAndText> fieldTerms = new ArrayList<FieldAndText>();
+ SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms);
+
+ // hack alert!!
+ int uniqueTermCount = si.docCount;
+
+ FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1));
+ assertNotNull(fields);
+
+ if (DEBUG) {
+ System.out.println("\nTEST: now enum");
+ }
+ FieldsEnum fieldsEnum = fields.iterator();
+ String field;
+ UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+
+ int termCount = 0;
+ while((field = fieldsEnum.next()) != null) {
+ TermsEnum termsEnum = fieldsEnum.terms();
+ BytesRef text;
+ BytesRef lastText = null;
+ while((text = termsEnum.next()) != null) {
+ UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16);
+ if (DEBUG) {
+ System.out.println("got term=" + field + ":" + UnicodeUtil.toHexString(new String(utf16.result, 0, utf16.length)));
+ System.out.println();
+ }
+ if (lastText == null) {
+ lastText = new BytesRef(text);
+ } else {
+ assertTrue(lastText.compareTo(text) < 0);
+ lastText.copy(text);
+ }
+ assertEquals(fieldTerms.get(termCount).field, field);
+ assertEquals(fieldTerms.get(termCount).text, text);
+ termCount++;
+ }
+ if (DEBUG) {
+ System.out.println(" no more terms for field=" + field);
+ }
+ }
+ assertEquals(uniqueTermCount, termCount);
+
+ fields.close();
+ }
+}
Property changes on: lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/src/test/org/apache/lucene/util/_TestUtil.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/_TestUtil.java (revision 956375)
+++ lucene/src/test/org/apache/lucene/util/_TestUtil.java (working copy)
@@ -141,7 +141,7 @@
else if (t <= 1) buffer[i] = (char) r.nextInt(0x80);
else if (2 == t) buffer[i] = (char) nextInt(r, 0x80, 0x800);
else if (3 == t) buffer[i] = (char) nextInt(r, 0x800, 0xd7ff);
- else if (4 == t) buffer[i] = (char) nextInt(r, 0xe000, 0xffff);
+ else if (4 == t) buffer[i] = (char) nextInt(r, 0xe000, 0xfffe);
}
return new String(buffer, 0, end);
}
Index: lucene/src/test/org/apache/lucene/util/TestNumericUtils.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/TestNumericUtils.java (revision 956375)
+++ lucene/src/test/org/apache/lucene/util/TestNumericUtils.java (working copy)
@@ -30,7 +30,7 @@
NumericUtils.longToPrefixCoded(l, 0, act);
if (last!=null) {
// test if smaller
- assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 );
+ assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 );
assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 );
}
// test is back and forward conversion works
@@ -48,7 +48,7 @@
NumericUtils.intToPrefixCoded(i, 0, act);
if (last!=null) {
// test if smaller
- assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 );
+ assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 );
assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 );
}
// test is back and forward conversion works
@@ -84,7 +84,7 @@
// check sort order (prefixVals should be ascending)
for (int i=1; i<prefixVals.length; i++) {
- assertTrue( "check sort order", BytesRef.getUTF8SortedAsUTF16Comparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
+ assertTrue( "check sort order", BytesRef.getUTF8SortedAsUnicodeComparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
}
// check the prefix encoding, lower precision should have the difference to original value equal to the lower removed bits
@@ -124,7 +124,7 @@
// check sort order (prefixVals should be ascending)
for (int i=1; i<prefixVals.length; i++) {
- assertTrue( "check sort order", BytesRef.getUTF8SortedAsUTF16Comparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
+ assertTrue( "check sort order", BytesRef.getUTF8SortedAsUnicodeComparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
}
// check the prefix encoding, lower precision should have the difference to original value equal to the lower removed bits
Index: lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (working copy)
@@ -103,7 +103,7 @@
// build a cache of sorted transitions for every state
allTransitions = new Transition[runAutomaton.getSize()][];
for (State state : this.automaton.getNumberedStates()) {
- state.sortTransitions(Transition.CompareByMinMaxThenDestUTF8InUTF16Order);
+ state.sortTransitions(Transition.CompareByMinMaxThenDest);
state.trimTransitionsArray();
allTransitions[state.getNumber()] = state.transitionsArray;
}
@@ -158,11 +158,7 @@
// seek to the next possible string;
if (nextString()) {
// reposition
-
- // FIXME: this is really bad to turn off
- // but it cannot work correctly until terms are in utf8 order.
- linear = false;
-
+
if (linear)
setLinear(infinitePosition);
return seekBytesRef;
@@ -188,15 +184,15 @@
}
for (int i = 0; i < allTransitions[state].length; i++) {
Transition t = allTransitions[state][i];
- if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 &&
- compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) {
+ if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) &&
+ (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) {
maxInterval = t.getMax();
break;
}
}
- // 0xef terms don't get the optimization... not worth the trouble.
- if (maxInterval != 0xef)
- maxInterval = incrementUTF16(maxInterval);
+ // 0xff terms don't get the optimization... not worth the trouble.
+ if (maxInterval != 0xff)
+ maxInterval = incrementUTF8(maxInterval);
int length = position + 1; /* position + maxTransition */
if (linearUpperBound.bytes.length < length)
linearUpperBound.bytes = new byte[length];
@@ -281,7 +277,7 @@
// if the next character is U+FFFF and is not part of the useful portion,
// then by definition it puts us in a reject state, and therefore this
// path is dead. there cannot be any higher transitions. backtrack.
- c = incrementUTF16(c);
+ c = incrementUTF8(c);
if (c == -1)
return false;
}
@@ -295,8 +291,8 @@
for (int i = 0; i < transitions.length; i++) {
Transition transition = transitions[i];
- if (compareToUTF16(transition.getMax(), c) >= 0) {
- int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? c : transition.getMin();
+ if (transition.getMax() >= c) {
+ int nextChar = Math.max(c, transition.getMin());
// append either the next sequential char, or the minimum transition
seekBytesRef.grow(seekBytesRef.length + 1);
seekBytesRef.length++;
@@ -342,9 +338,9 @@
private boolean backtrack(int position) {
while (position > 0) {
int nextChar = seekBytesRef.bytes[position - 1] & 0xff;
- // if a character is 0xef its a dead-end too,
- // because there is no higher character in UTF-16 sort order.
- nextChar = incrementUTF16(nextChar);
+ // if a character is 0xff it's a dead-end too,
+ // because there is no higher character in UTF-8 sort order.
+ nextChar = incrementUTF8(nextChar);
if (nextChar != -1) {
seekBytesRef.bytes[position - 1] = (byte) nextChar;
seekBytesRef.length = position;
@@ -355,34 +351,11 @@
return false; /* all solutions exhausted */
}
- /* return the next utf8 byte in utf16 order, or -1 if exhausted */
- private final int incrementUTF16(int utf8) {
+ /* return the next utf8 byte in utf8 order, or -1 if exhausted */
+ private final int incrementUTF8(int utf8) {
switch(utf8) {
- case 0xed: return 0xf0;
- case 0xfd: return 0xee;
- case 0xee: return 0xef;
- case 0xef: return -1;
+ case 0xff: return -1;
default: return utf8 + 1;
}
}
-
- int compareToUTF16(int aByte, int bByte) {
- if (aByte != bByte) {
- // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
- // We know the terms are not equal, but, we may
- // have to carefully fixup the bytes at the
- // difference to match UTF16's sort order:
- if (aByte >= 0xee && bByte >= 0xee) {
- if ((aByte & 0xfe) == 0xee) {
- aByte += 0x10;
- }
- if ((bByte&0xfe) == 0xee) {
- bByte += 0x10;
- }
- }
- return aByte - bByte;
- }
- return 0;
- }
}
Index: lucene/src/java/org/apache/lucene/index/FieldInfos.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/FieldInfos.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/FieldInfos.java (working copy)
@@ -53,7 +53,7 @@
private final HashMap<String,FieldInfo> byName = new HashMap<String,FieldInfo>();
private int format;
- FieldInfos() { }
+ public FieldInfos() { }
/**
* Construct a FieldInfos object using the directory and the name of the file
@@ -62,7 +62,7 @@
* @param name The name of the file to open the IndexInput from in the Directory
* @throws IOException
*/
- FieldInfos(Directory d, String name) throws IOException {
+ public FieldInfos(Directory d, String name) throws IOException {
IndexInput input = d.openInput(name);
try {
read(input, name);
Index: lucene/src/java/org/apache/lucene/index/TermsEnum.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/TermsEnum.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/TermsEnum.java (working copy)
@@ -144,8 +144,7 @@
@Override
public Comparator<BytesRef> getComparator() {
- // return an unused dummy to prevent NPE
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ return null;
}
@Override
Index: lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (working copy)
@@ -130,7 +130,7 @@
// TODO: we may want to make this sort in same order
// as Codec's terms dict?
- final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUTF16Comparator());
+ final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator());
tvf.writeVInt(numPostings);
byte bits = 0x0;
Index: lucene/src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/IndexWriter.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/IndexWriter.java (working copy)
@@ -3964,7 +3964,7 @@
// commit merged deletes
SegmentReader reader = merge.readers[i] = readerPool.get(info, merge.mergeDocStores,
MERGE_READ_BUFFER_SIZE,
- -1);
+ -config.getReaderTermsIndexDivisor());
// We clone the segment readers because other
// deletes may come in while we're merging so we
Index: lucene/src/java/org/apache/lucene/index/CheckIndex.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/CheckIndex.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/CheckIndex.java (working copy)
@@ -32,7 +32,7 @@
import java.io.IOException;
import java.io.File;
import java.util.Collection;
-
+import java.util.Comparator;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
@@ -596,6 +596,10 @@
boolean hasOrd = true;
final long termCountStart = status.termCount;
+ BytesRef lastTerm = null;
+
+ Comparator<BytesRef> termComp = terms.getComparator();
+
while(true) {
final BytesRef term = terms.next();
@@ -603,6 +607,17 @@
break;
}
+ // make sure terms arrive in order according to
+ // the comparator
+ if (lastTerm == null) {
+ lastTerm = new BytesRef(term);
+ } else {
+ if (termComp.compare(lastTerm, term) >= 0) {
+ throw new RuntimeException("terms out of order: lastTerm=" + lastTerm + " term=" + term);
+ }
+ lastTerm.copy(term);
+ }
+
final int docFreq = terms.docFreq();
status.totFreq += docFreq;
Index: lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (working copy)
@@ -80,7 +80,7 @@
// Terms dict
success = false;
try {
- FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
+ FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -111,7 +111,7 @@
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
- BytesRef.getUTF8SortedAsUTF16Comparator());
+ BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
} finally {
if (!success) {
@@ -126,7 +126,7 @@
state.dir, state.fieldInfos, state.segmentInfo.name,
pulsingReader,
state.readBufferSize,
- BytesRef.getUTF8SortedAsUTF16Comparator(),
+ BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE);
success = true;
return ret;
Index: lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (working copy)
@@ -63,7 +63,7 @@
success = false;
try {
- FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
+ FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -95,7 +95,7 @@
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
- BytesRef.getUTF8SortedAsUTF16Comparator());
+ BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
} finally {
if (!success) {
@@ -111,7 +111,7 @@
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
- BytesRef.getUTF8SortedAsUTF16Comparator(),
+ BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE);
success = true;
return ret;
Index: lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (working copy)
@@ -104,7 +104,7 @@
indexInterval = in.readInt();
this.indexDivisor = indexDivisor;
- if (indexDivisor == -1) {
+ if (indexDivisor < 0) {
totalIndexInterval = indexInterval;
} else {
// In case terms index gets loaded, later, on demand
@@ -131,7 +131,7 @@
}
success = true;
} finally {
- if (indexDivisor != -1) {
+ if (indexDivisor > 0) {
in.close();
this.in = null;
if (success) {
Index: lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (working copy)
@@ -58,7 +58,7 @@
success = false;
try {
- FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUTF16Comparator());
+ FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -85,7 +85,7 @@
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
- BytesRef.getUTF8SortedAsUTF16Comparator());
+ BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
} finally {
if (!success) {
@@ -101,7 +101,7 @@
state.segmentInfo.name,
postings,
state.readBufferSize,
- BytesRef.getUTF8SortedAsUTF16Comparator(),
+ BytesRef.getUTF8SortedAsUnicodeComparator(),
TERMS_CACHE_SIZE);
success = true;
return ret;
Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (working copy)
@@ -53,6 +53,7 @@
long indexPointer = 0;
int indexInterval;
int skipInterval;
+ int newSuffixStart;
int maxSkipLevels;
private int formatM1SkipInterval;
@@ -136,6 +137,7 @@
prevBuffer.set(termBuffer);
termBuffer.read(input, fieldInfos);
+ newSuffixStart = termBuffer.newSuffixStart;
termInfo.docFreq = input.readVInt(); // read doc freq
termInfo.freqPointer += input.readVLong(); // read freq pointer
Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (working copy)
@@ -19,7 +19,6 @@
import java.io.IOException;
import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.Term;
@@ -34,6 +33,8 @@
private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result();
private BytesRef bytes = new BytesRef(10);
+ int newSuffixStart;
+
public final int compareTo(TermBuffer other) {
if (field == other.field) // fields are interned
return compareChars(text.result, text.length, other.text.result, other.text.length);
@@ -60,23 +61,33 @@
int start = input.readVInt();
int length = input.readVInt();
int totalLength = start + length;
+ if (bytes.bytes.length < totalLength) {
+ bytes.grow(totalLength);
+ }
if (dirty) {
// Fully convert all bytes since bytes is dirty
UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
- if (bytes.bytes.length < totalLength)
- bytes.bytes = new byte[totalLength];
bytes.length = totalLength;
input.readBytes(bytes.bytes, start, length);
UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text);
dirty = false;
} else {
// Incrementally convert only the UTF8 bytes that are new:
- if (bytes.bytes.length < totalLength)
- bytes.bytes = ArrayUtil.grow(bytes.bytes, totalLength);
bytes.length = totalLength;
input.readBytes(bytes.bytes, start, length);
UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text);
}
+
+ while(true) {
+ newSuffixStart = text.offsets[start];
+ if (newSuffixStart != -1) {
+ break;
+ }
+ if (--start == 0) {
+ newSuffixStart = 0;
+ break;
+ }
+ }
this.field = fieldInfos.fieldName(input.readVInt());
}
@@ -124,10 +135,11 @@
try {
clone = (TermBuffer)super.clone();
} catch (CloneNotSupportedException e) {}
-
clone.dirty = true;
clone.bytes = new BytesRef(10);
clone.text = new UnicodeUtil.UTF16Result();
+ clone.text.offsets = new int[text.offsets.length];
+ System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length);
clone.text.copyText(text);
return clone;
}
Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (working copy)
@@ -39,11 +39,15 @@
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.ArrayUtil;
/** Exposes flex API on a pre-flex index, as a codec.
* @lucene.experimental */
public class PreFlexFields extends FieldsProducer {
+ private static final boolean DEBUG_SURROGATES = false;
+
public TermInfosReader tis;
public final TermInfosReader tisNoIndex;
@@ -60,6 +64,16 @@
throws IOException {
si = info;
+
+ // NOTE: we must always load terms index, even for
+ // "sequential" scan during merging, because what is
+ // sequential to merger may not be to TermInfosReader
+ // since we do the surrogates dance:
+ // nocommit -- how to pull right value from IW?
+ if (indexDivisor < 0) {
+ indexDivisor = -indexDivisor;
+ }
+
TermInfosReader r = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize, indexDivisor);
if (indexDivisor == -1) {
tisNoIndex = r;
@@ -174,7 +188,6 @@
private class PreFlexFieldsEnum extends FieldsEnum {
final Iterator<FieldInfo> it;
private final PreTermsEnum termsEnum;
- private int count;
FieldInfo current;
public PreFlexFieldsEnum() throws IOException {
@@ -185,7 +198,6 @@
@Override
public String next() {
if (it.hasNext()) {
- count++;
current = it.next();
return current.name;
} else {
@@ -195,7 +207,7 @@
@Override
public TermsEnum terms() throws IOException {
- termsEnum.reset(current, count == 1);
+ termsEnum.reset(current);
return termsEnum;
}
}
@@ -209,14 +221,15 @@
@Override
public TermsEnum iterator() throws IOException {
PreTermsEnum termsEnum = new PreTermsEnum();
- termsEnum.reset(fieldInfo, false);
+ termsEnum.reset(fieldInfo);
return termsEnum;
}
@Override
public Comparator<BytesRef> getComparator() {
- // Pre-flex indexes always sorted in UTF16 order
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ // Pre-flex indexes always sorted in UTF16 order, but
+ // we remap on-the-fly to unicode order
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}
@@ -227,37 +240,229 @@
private BytesRef current;
private final BytesRef scratchBytesRef = new BytesRef();
- void reset(FieldInfo fieldInfo, boolean isFirstField) throws IOException {
+ private int[] surrogateSeekPending = new int[1];
+ private boolean[] surrogateDidSeekBack = new boolean[1];
+ private int surrogateSeekUpto;
+ private char[] pendingPrefix;
+
+ private SegmentTermEnum seekTermEnum;
+ private Term protoTerm;
+ private int newSuffixStart;
+
+ void reset(FieldInfo fieldInfo) throws IOException {
this.fieldInfo = fieldInfo;
+ protoTerm = new Term(fieldInfo.name);
if (termEnum == null) {
- // First time reset is called
- if (isFirstField) {
- termEnum = getTermsDict().terms();
- skipNext = false;
- } else {
- termEnum = getTermsDict().terms(new Term(fieldInfo.name, ""));
- skipNext = true;
- }
+ termEnum = getTermsDict().terms(protoTerm);
+ seekTermEnum = getTermsDict().terms(protoTerm);
} else {
- final Term t = termEnum.term();
- if (t != null && t.field() == fieldInfo.name) {
- // No need to seek -- we have already advanced onto
- // this field. We must be @ first term because
- // flex API will not advance this enum further, on
- // seeing a different field.
- } else {
- assert t == null || !t.field().equals(fieldInfo.name); // make sure field name is interned
- final TermInfosReader tis = getTermsDict();
- tis.seekEnum(termEnum, new Term(fieldInfo.name, ""));
+ getTermsDict().seekEnum(termEnum, protoTerm);
+ }
+ skipNext = true;
+
+ surrogateSeekUpto = 0;
+ newSuffixStart = 0;
+
+ surrogatesDance();
+ }
+
+ private void surrogatesDance() throws IOException {
+
+ // Tricky: prior to 4.0, Lucene index sorted terms in
+ // UTF16 order, but as of 4.0 we sort by Unicode code
+ // point order. These orders differ because of the
+ // surrrogates; so we have to fixup our enum, here, by
+ // carefully first seeking past the surrogates and
+ // then back again at the end. The process is
+ // recursive, since any given term could have multiple
+ // new occurrences of surrogate pairs, so we use a
+ // stack to record the pending seek-backs.
+ if (DEBUG_SURROGATES) {
+ System.out.println(" dance start term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
+ }
+
+ while(popPendingSeek());
+ while(pushNewSurrogate());
+ }
+
+ // only for debugging
+ private String getStack() {
+ if (surrogateSeekUpto == 0) {
+ return "null";
+ } else {
+ StringBuffer sb = new StringBuffer();
+ for(int i=0;i<surrogateSeekUpto;i++) {
+ if (i > 0) {
+ sb.append(' ');
+ }
+ sb.append(surrogateSeekPending[i]);
}
- skipNext = true;
+ sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1]));
+ return sb.toString();
}
}
+ private boolean popPendingSeek() throws IOException {
+ if (DEBUG_SURROGATES) {
+ System.out.println(" check pop newSuffix=" + newSuffixStart + " stack=" + getStack());
+ }
+ // if a .next() has advanced beyond the
+ // after-surrogates range we had last seeked to, we
+ // must seek back to the start and resume .next from
+ // there. this pops the pending seek off the stack.
+ final Term t = termEnum.term();
+ if (surrogateSeekUpto > 0) {
+ final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1];
+ if (DEBUG_SURROGATES) {
+ System.out.println(" seekPrefix=" + seekPrefix);
+ }
+ if (newSuffixStart < seekPrefix) {
+ assert pendingPrefix != null;
+ assert pendingPrefix.length > seekPrefix;
+ pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START;
+ Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix));
+ if (DEBUG_SURROGATES) {
+ System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text()));
+ }
+ getTermsDict().seekEnum(termEnum, t2);
+ surrogateDidSeekBack[surrogateSeekUpto-1] = true;
+
+ // +2 because we don't want to re-check the
+ // surrogates we just seek'd back to
+ newSuffixStart = seekPrefix + 2;
+ return true;
+ } else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) {
+ assert pendingPrefix != null;
+ assert pendingPrefix.length > seekPrefix;
+ pendingPrefix[seekPrefix] = 0xffff;
+ Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix));
+ if (DEBUG_SURROGATES) {
+ System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text()));
+ }
+ getTermsDict().seekEnum(termEnum, t2);
+ if (DEBUG_SURROGATES) {
+ System.out.println(" found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
+ }
+ surrogateSeekUpto--;
+
+ if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
+ // force pop
+ newSuffixStart = -1;
+ } else {
+ newSuffixStart = termEnum.newSuffixStart;
+ }
+
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ private boolean pushNewSurrogate() throws IOException {
+ if (DEBUG_SURROGATES) {
+ System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack());
+ }
+ final Term t = termEnum.term();
+ if (t == null || t.field() != fieldInfo.name) {
+ return false;
+ }
+ final String text = t.text();
+ final int textLen = text.length();
+
+ for(int i=Math.max(0,newSuffixStart);i<textLen;i++) {
+ final char ch = text.charAt(i);
+ if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) {
+
+ if (DEBUG_SURROGATES) {
+ System.out.println(" found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i);
+ }
+
+ // the next() that we just did read in a new
+ // suffix, containing a surrogate pair
+
+ // seek forward to see if there are any terms with
+ // this same prefix, but with characters after the
+ // surrogate range; if so, we must first iterate
+ // them, then seek back to the surrogates
+
+ char[] testPrefix = new char[i+1];
+ for(int j=0;j<i;j++) {
+ testPrefix[j] = text.charAt(j);
+ }
+ testPrefix[i] = 1+UnicodeUtil.UNI_SUR_LOW_END;
+
+ getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(new String(testPrefix)));
+
+ Term t2 = seekTermEnum.term();
+ boolean isPrefix;
+ if (t2 != null && t2.field() == fieldInfo.name) {
+ String seekText = t2.text();
+ isPrefix = true;
+ if (DEBUG_SURROGATES) {
+ System.out.println(" seek found " + UnicodeUtil.toHexString(seekText));
+ }
+ for(int j=0;j<i;j++) {
+ if (testPrefix[j] != seekText.charAt(j)) {
+ isPrefix = false;
+ break;
+ }
+ }
+ if (DEBUG_SURROGATES && !isPrefix) {
+ System.out.println(" no end terms");
+ }
+ } else {
+ if (DEBUG_SURROGATES) {
+ System.out.println(" no end terms");
+ }
+ isPrefix = false;
+ }
+
+ if (isPrefix) {
+ // we found a term, sharing the same prefix,
+ // with characters after the surrogates, so we
+ // must first enum those, and then return the
+ // the surrogates afterwards. push that pending
+ // seek on the surrogates stack now:
+ pendingPrefix = testPrefix;
+
+ getTermsDict().seekEnum(termEnum, t2);
+
+ if (surrogateSeekUpto == surrogateSeekPending.length) {
+ surrogateSeekPending = ArrayUtil.grow(surrogateSeekPending);
+ }
+ if (surrogateSeekUpto == surrogateDidSeekBack.length) {
+ surrogateDidSeekBack = ArrayUtil.grow(surrogateDidSeekBack);
+ }
+ surrogateSeekPending[surrogateSeekUpto] = i;
+ surrogateDidSeekBack[surrogateSeekUpto] = false;
+ surrogateSeekUpto++;
+
+ if (DEBUG_SURROGATES) {
+ System.out.println(" do push " + i + "; end term=" + UnicodeUtil.toHexString(t2.text()));
+ }
+
+ newSuffixStart = i+1;
+
+ return true;
+ } else {
+ // there are no terms after the surrogates, so
+ // we do nothing to the enum and just step
+ // through the surrogates like normal. but we
+ // must keep iterating through the term, in case
+ // another surrogate pair appears later
+ }
+ }
+ }
+
+ return false;
+ }
+
@Override
public Comparator<BytesRef> getComparator() {
- // Pre-flex indexes always sorted in UTF16 order
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ // Pre-flex indexes always sorted in UTF16 order, but
+ // we remap on-the-fly to unicode order
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
@@ -272,14 +477,24 @@
@Override
public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
+ if (DEBUG_SURROGATES) {
+ System.out.println("TE.seek() term=" + term.utf8ToString());
+ }
skipNext = false;
final TermInfosReader tis = getTermsDict();
- final Term t0 = new Term(fieldInfo.name, term.utf8ToString());
+ final Term t0 = protoTerm.createTerm(term.utf8ToString());
+
+ assert termEnum != null;
+
if (termEnum == null) {
termEnum = tis.terms(t0);
} else {
tis.seekEnum(termEnum, t0);
}
+
+ surrogateSeekUpto = 0;
+ surrogatesDance();
+
final Term t = termEnum.term();
final BytesRef tr;
@@ -304,6 +519,9 @@
@Override
public BytesRef next() throws IOException {
+ if (DEBUG_SURROGATES) {
+ System.out.println("TE.next() skipNext=" + skipNext);
+ }
if (skipNext) {
skipNext = false;
if (termEnum.term() == null) {
@@ -313,19 +531,37 @@
return current = scratchBytesRef;
}
}
- if (termEnum.next()) {
+ if (termEnum.next() && termEnum.term().field() == fieldInfo.name) {
+ newSuffixStart = termEnum.newSuffixStart;
+ if (DEBUG_SURROGATES) {
+ System.out.println(" set newSuffixStart=" + newSuffixStart);
+ }
+ surrogatesDance();
final Term t = termEnum.term();
- if (t.field() == fieldInfo.name) {
+ if (t == null || t.field() != fieldInfo.name) {
+ assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
+ current = null;
+ } else {
scratchBytesRef.copy(t.text());
current = scratchBytesRef;
+ }
+ return current;
+ } else {
+ if (DEBUG_SURROGATES) {
+ System.out.println(" force pop");
+ }
+ // force pop
+ newSuffixStart = -1;
+ surrogatesDance();
+ final Term t = termEnum.term();
+ if (t == null || t.field() != fieldInfo.name) {
+ assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
+ return null;
+ } else {
+ scratchBytesRef.copy(t.text());
+ current = scratchBytesRef;
return current;
- } else {
- assert !t.field().equals(fieldInfo.name); // make sure field name is interned
- // Crossed into new field
- return null;
}
- } else {
- return null;
}
}
Index: lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java (working copy)
@@ -67,7 +67,7 @@
success = false;
try {
- FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
+ FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -95,7 +95,7 @@
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
- BytesRef.getUTF8SortedAsUTF16Comparator());
+ BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
} finally {
if (!success) {
@@ -111,7 +111,7 @@
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
- BytesRef.getUTF8SortedAsUTF16Comparator(),
+ BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE);
success = true;
return ret;
Index: lucene/src/java/org/apache/lucene/util/automaton/Transition.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/Transition.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/util/automaton/Transition.java (working copy)
@@ -210,64 +210,4 @@
}
public static final Comparator<Transition> CompareByMinMaxThenDest = new CompareByMinMaxThenDestSingle();
-
- private static class UTF8InUTF16Order {
- protected int compareCodePoint(int aByte, int bByte) {
- if (aByte != bByte) {
- // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
- // We know the terms are not equal, but, we may
- // have to carefully fixup the bytes at the
- // difference to match UTF16's sort order:
- if (aByte >= 0xee && bByte >= 0xee) {
- if ((aByte & 0xfe) == 0xee) {
- aByte += 0x10;
- }
- if ((bByte&0xfe) == 0xee) {
- bByte += 0x10;
- }
- }
- return aByte - bByte;
- }
- return 0;
- }
- }
-
- private static final class CompareByDestThenMinMaxUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> {
- public int compare(Transition t1, Transition t2) {
- if (t1.to != t2.to) {
- if (t1.to == null) return -1;
- else if (t2.to == null) return 1;
- else if (t1.to.number < t2.to.number) return -1;
- else if (t1.to.number > t2.to.number) return 1;
- }
- int minComp = compareCodePoint(t1.min, t2.min);
- if (minComp != 0) return minComp;
- int maxComp = compareCodePoint(t1.max, t2.max);
- if (maxComp != 0) return maxComp;
- return 0;
- }
- }
-
- public static final Comparator<Transition> CompareByDestThenMinMaxUTF8InUTF16Order = new CompareByDestThenMinMaxUTF8InUTF16OrderSingle();
-
- private static final class CompareByMinMaxThenDestUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> {
- public int compare(Transition t1, Transition t2) {
- int minComp = compareCodePoint(t1.min, t2.min);
- if (minComp != 0) return minComp;
- int maxComp = compareCodePoint(t1.max, t2.max);
- if (maxComp != 0) return maxComp;
- if (t1.to != t2.to) {
- if (t1.to == null) return -1;
- else if (t2.to == null) return 1;
- else if (t1.to.number < t2.to.number) return -1;
- else if (t1.to.number > t2.to.number) return 1;
- }
- return 0;
- }
- }
-
- public static final Comparator<Transition> CompareByMinMaxThenDestUTF8InUTF16Order = new CompareByMinMaxThenDestUTF8InUTF16OrderSingle();
-
-
}
Index: lucene/src/java/org/apache/lucene/util/ArrayUtil.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/ArrayUtil.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/util/ArrayUtil.java (working copy)
@@ -327,6 +327,29 @@
return array;
}
+ public static boolean[] grow(boolean[] array, int minSize) {
+ if (array.length < minSize) {
+ boolean[] newArray = new boolean[oversize(minSize, 1)];
+ System.arraycopy(array, 0, newArray, 0, array.length);
+ return newArray;
+ } else
+ return array;
+ }
+
+ public static boolean[] grow(boolean[] array) {
+ return grow(array, 1 + array.length);
+ }
+
+ public static boolean[] shrink(boolean[] array, int targetSize) {
+ final int newSize = getShrinkSize(array.length, targetSize, 1);
+ if (newSize != array.length) {
+ boolean[] newArray = new boolean[newSize];
+ System.arraycopy(array, 0, newArray, 0, newSize);
+ return newArray;
+ } else
+ return array;
+ }
+
public static char[] grow(char[] array, int minSize) {
if (array.length < minSize) {
char[] newArray = new char[oversize(minSize, RamUsageEstimator.NUM_BYTES_CHAR)];
Index: lucene/src/java/org/apache/lucene/util/BytesRef.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/BytesRef.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/util/BytesRef.java (working copy)
@@ -217,14 +217,7 @@
bytes = ArrayUtil.grow(bytes, newLength);
}
- private final static Comparator<BytesRef> utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator();
-
- public static Comparator<BytesRef> getUTF8SortedAsUTF16Comparator() {
- return utf8SortedAsUTF16SortOrder;
- }
-
/** Unsigned byte order comparison */
- /*
public int compareTo(BytesRef other) {
if (this == other) return 0;
@@ -245,52 +238,18 @@
// One is a prefix of the other, or, they are equal:
return this.length - other.length;
}
- */
- /** Lucene default index order. Currently the same as String.compareTo() (UTF16) but will change
- * in the future to unsigned byte comparison. */
- public int compareTo(BytesRef other) {
- if (this == other) return 0;
+ private final static Comparator<BytesRef> utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator();
- final byte[] aBytes = this.bytes;
- int aUpto = this.offset;
- final byte[] bBytes = other.bytes;
- int bUpto = other.offset;
-
- final int aStop = aUpto + Math.min(this.length, other.length);
-
- while(aUpto < aStop) {
- int aByte = aBytes[aUpto++] & 0xff;
- int bByte = bBytes[bUpto++] & 0xff;
- if (aByte != bByte) {
-
- // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
- // We know the terms are not equal, but, we may
- // have to carefully fixup the bytes at the
- // difference to match UTF16's sort order:
- if (aByte >= 0xee && bByte >= 0xee) {
- if ((aByte & 0xfe) == 0xee) {
- aByte += 0x10;
- }
- if ((bByte&0xfe) == 0xee) {
- bByte += 0x10;
- }
- }
- return aByte - bByte;
- }
- }
-
- // One is a prefix of the other, or, they are equal:
- return this.length - other.length;
+ public static Comparator<BytesRef> getUTF8SortedAsUnicodeComparator() {
+ return utf8SortedAsUnicodeSortOrder;
}
- private static class UTF8SortedAsUTF16Comparator implements Comparator<BytesRef> {
+ private static class UTF8SortedAsUnicodeComparator implements Comparator<BytesRef> {
// Only singleton
- private UTF8SortedAsUTF16Comparator() {};
+ private UTF8SortedAsUnicodeComparator() {};
public int compare(BytesRef a, BytesRef b) {
-
final byte[] aBytes = a.bytes;
int aUpto = a.offset;
final byte[] bBytes = b.bytes;
@@ -307,32 +266,15 @@
int aByte = aBytes[aUpto++] & 0xff;
int bByte = bBytes[bUpto++] & 0xff;
- if (aByte != bByte) {
-
- // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
- // We know the terms are not equal, but, we may
- // have to carefully fixup the bytes at the
- // difference to match UTF16's sort order:
- if (aByte >= 0xee && bByte >= 0xee) {
- if ((aByte & 0xfe) == 0xee) {
- aByte += 0x10;
- }
- if ((bByte&0xfe) == 0xee) {
- bByte += 0x10;
- }
- }
- return aByte - bByte;
+ int diff = aByte - bByte;
+ if (diff != 0) {
+ return diff;
}
}
// One is a prefix of the other, or, they are equal:
return a.length - b.length;
- }
-
- public boolean equals(Object other) {
- return this == other;
- }
+ }
}
public void writeExternal(ObjectOutput out)
Index: lucene/src/java/org/apache/lucene/util/UnicodeUtil.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (working copy)
@@ -358,7 +358,6 @@
out[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START);
}
}
-
offsets[upto] = outUpto;
result.length = outUpto;
}
@@ -483,7 +482,7 @@
}
}
*/
- public static final boolean validUTF16String(CharSequence s) {
+ public static boolean validUTF16String(CharSequence s) {
final int size = s.length();
for(int i=0;i<size;i++) {
char ch = s.charAt(i);
@@ -507,7 +506,7 @@
return true;
}
- public static final boolean validUTF16String(char[] s, int size) {
+ public static boolean validUTF16String(char[] s, int size) {
for(int i=0;i<size;i++) {
char ch = s[i];
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
@@ -559,7 +558,7 @@
/** Returns the number of code points in this utf8
* sequence. Behavior is undefined if the utf8 sequence
* is invalid.*/
- public static final int codePointCount(BytesRef utf8) {
+ public static int codePointCount(BytesRef utf8) {
int upto = utf8.offset;
final int limit = utf8.offset + utf8.length;
final byte[] bytes = utf8.bytes;
@@ -673,4 +672,33 @@
}
return new String(chars, 0, w);
}
+
+ // for debugging
+ public static String toHexString(String s) {
+ StringBuilder sb = new StringBuilder();
+ for(int i=0;i<s.length();i++) {
+ char ch = s.charAt(i);
+ if (i > 0) {
+ sb.append(' ');
+ }
+ if (ch < 128) {
+ sb.append(ch);
+ } else {
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
+ sb.append("H:");
+ } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
+ sb.append("L:");
+ } else if (ch > UNI_SUR_LOW_END) {
+ if (ch == 0xffff) {
+ sb.append("F:");
+ } else {
+ sb.append("E:");
+ }
+ }
+
+ sb.append("0x" + Integer.toHexString(ch));
+ }
+ }
+ return sb.toString();
+ }
}
Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java
===================================================================
--- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (revision 956375)
+++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (working copy)
@@ -426,7 +426,7 @@
@Override
public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
};
}
Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java
===================================================================
--- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (revision 956375)
+++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (working copy)
@@ -123,7 +123,7 @@
@Override
public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}
Index: lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
===================================================================
--- lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision 956375)
+++ lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (working copy)
@@ -808,7 +808,7 @@
@Override
public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
@@ -903,7 +903,7 @@
@Override
public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}