| Index: lucene/src/test/org/apache/lucene/TestExternalCodecs.java |
| =================================================================== |
| --- lucene/src/test/org/apache/lucene/TestExternalCodecs.java (revision 956375) |
| +++ lucene/src/test/org/apache/lucene/TestExternalCodecs.java (working copy) |
| @@ -179,7 +179,7 @@ |
| |
| @Override |
| public Comparator<BytesRef> getComparator() { |
| - return BytesRef.getUTF8SortedAsUTF16Comparator(); |
| + return BytesRef.getUTF8SortedAsUnicodeComparator(); |
| } |
| |
| @Override |
| @@ -263,7 +263,7 @@ |
| |
| @Override |
| public Comparator<BytesRef> getComparator() { |
| - return BytesRef.getUTF8SortedAsUTF16Comparator(); |
| + return BytesRef.getUTF8SortedAsUnicodeComparator(); |
| } |
| |
| @Override |
| Index: lucene/src/test/org/apache/lucene/index/TestIndexWriter.java |
| =================================================================== |
| --- lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (revision 956375) |
| +++ lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (working copy) |
| @@ -4621,38 +4621,22 @@ |
| private void checkTermsOrder(IndexReader r, Set<String> allTerms, boolean isTop) throws IOException { |
| TermsEnum terms = MultiFields.getFields(r).terms("f").iterator(); |
| |
| - char[] last = new char[2]; |
| - int lastLength = 0; |
| + BytesRef last = new BytesRef(); |
| |
| Set<String> seenTerms = new HashSet<String>(); |
| |
| - UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); |
| while(true) { |
| final BytesRef term = terms.next(); |
| if (term == null) { |
| break; |
| } |
| - UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16); |
| - assertTrue(utf16.length <= 2); |
| |
| - // Make sure last term comes before current one, in |
| - // UTF16 sort order |
| - int i = 0; |
| - for(i=0;i<lastLength && i<utf16.length;i++) { |
| - assertTrue("UTF16 code unit " + termDesc(new String(utf16.result, 0, utf16.length)) + " incorrectly sorted after code unit " + termDesc(new String(last, 0, lastLength)), last[i] <= utf16.result[i]); |
| - if (last[i] < utf16.result[i]) { |
| - break; |
| - } |
| - } |
| - // Terms should not have been identical |
| - assertTrue(lastLength != utf16.length || i < lastLength); |
| + assertTrue(last.compareTo(term) < 0); |
| + last.copy(term); |
| |
| - final String s = new String(utf16.result, 0, utf16.length); |
| + final String s = term.utf8ToString(); |
| assertTrue("term " + termDesc(s) + " was not added to index (count=" + allTerms.size() + ")", allTerms.contains(s)); |
| seenTerms.add(s); |
| - |
| - System.arraycopy(utf16.result, 0, last, 0, utf16.length); |
| - lastLength = utf16.length; |
| } |
| |
| if (isTop) { |
| Index: lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java |
| =================================================================== |
| --- lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java (revision 956375) |
| +++ lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java (working copy) |
| @@ -1,5 +1,22 @@ |
| package org.apache.lucene.index.codecs.intblock; |
| |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.store.*; |
| import org.apache.lucene.index.codecs.sep.*; |
| @@ -34,7 +51,7 @@ |
| out.close(); |
| |
| IntIndexInput in = new SimpleIntBlockIndexInput(dir, "test", 128); |
| - IntIndexInput.Reader r = in.reader(); |
| + in.reader(); |
| // read no ints |
| in.close(); |
| dir.close(); |
| Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java |
| =================================================================== |
| --- lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (revision 0) |
| +++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (revision 0) |
| @@ -0,0 +1,227 @@ |
| +package org.apache.lucene.index.codecs.preflex; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| + |
| +import java.io.IOException; |
| +import org.apache.lucene.store.*; |
| +import org.apache.lucene.index.*; |
| +import org.apache.lucene.util.*; |
| + |
| + |
| +/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a |
| + Directory. A TermInfos can be written once, in order. */ |
| + |
| +final class TermInfosWriter { |
| + /** The file format version, a negative number. */ |
| + public static final int FORMAT = -3; |
| + |
| + // Changed strings to true utf8 with length-in-bytes not |
| + // length-in-chars |
| + public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4; |
| + |
| + // NOTE: always change this if you switch to a new format! |
| + public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES; |
| + |
| + private FieldInfos fieldInfos; |
| + private IndexOutput output; |
| + private TermInfo lastTi = new TermInfo(); |
| + private long size; |
| + |
| + // TODO: the default values for these two parameters should be settable from |
| + // IndexWriter. However, once that's done, folks will start setting them to |
| + // ridiculous values and complaining that things don't work well, as with |
| + // mergeFactor. So, let's wait until a number of folks find that alternate |
| + // values work better. Note that both of these values are stored in the |
| + // segment, so that it's safe to change these w/o rebuilding all indexes. |
| + |
| + /** Expert: The fraction of terms in the "dictionary" which should be stored |
| + * in RAM. Smaller values use more memory, but make searching slightly |
| + * faster, while larger values use less memory and make searching slightly |
| + * slower. Searching is typically not dominated by dictionary lookup, so |
| + * tweaking this is rarely useful.*/ |
| + int indexInterval = 128; |
| + |
| + /** Expert: The fraction of {@link TermDocs} entries stored in skip tables, |
| + * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in |
| + * smaller indexes, greater acceleration, but fewer accelerable cases, while |
| + * smaller values result in bigger indexes, less acceleration and more |
| + * accelerable cases. More detailed experiments would be useful here. */ |
| + int skipInterval = 16; |
| + |
| + /** Expert: The maximum number of skip levels. Smaller values result in |
| + * slightly smaller indexes, but slower skipping in big posting lists. |
| + */ |
| + int maxSkipLevels = 10; |
| + |
| + private long lastIndexPointer; |
| + private boolean isIndex; |
| + private byte[] lastTermBytes = new byte[10]; |
| + private int lastTermBytesLength = 0; |
| + private int lastFieldNumber = -1; |
| + |
| + private TermInfosWriter other; |
| + private BytesRef utf8Result = new BytesRef(10); |
| + |
| + TermInfosWriter(Directory directory, String segment, FieldInfos fis, |
| + int interval) |
| + throws IOException { |
| + initialize(directory, segment, fis, interval, false); |
| + other = new TermInfosWriter(directory, segment, fis, interval, true); |
| + other.other = this; |
| + } |
| + |
| + private TermInfosWriter(Directory directory, String segment, FieldInfos fis, |
| + int interval, boolean isIndex) throws IOException { |
| + initialize(directory, segment, fis, interval, isIndex); |
| + } |
| + |
| + private void initialize(Directory directory, String segment, FieldInfos fis, |
| + int interval, boolean isi) throws IOException { |
| + indexInterval = interval; |
| + fieldInfos = fis; |
| + isIndex = isi; |
| + output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis")); |
| + output.writeInt(FORMAT_CURRENT); // write format |
| + output.writeLong(0); // leave space for size |
| + output.writeInt(indexInterval); // write indexInterval |
| + output.writeInt(skipInterval); // write skipInterval |
| + output.writeInt(maxSkipLevels); // write maxSkipLevels |
| + assert initUTF16Results(); |
| + } |
| + |
| + void add(Term term, TermInfo ti) throws IOException { |
| + UnicodeUtil.UTF16toUTF8(term.text(), 0, term.text().length(), utf8Result); |
| + add(fieldInfos.fieldNumber(term.field()), utf8Result.bytes, utf8Result.length, ti); |
| + } |
| + |
| + // Currently used only by assert statements |
| + UnicodeUtil.UTF16Result utf16Result1; |
| + UnicodeUtil.UTF16Result utf16Result2; |
| + |
| + // Currently used only by assert statements |
| + private boolean initUTF16Results() { |
| + utf16Result1 = new UnicodeUtil.UTF16Result(); |
| + utf16Result2 = new UnicodeUtil.UTF16Result(); |
| + return true; |
| + } |
| + |
| + // Currently used only by assert statement |
| + private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) { |
| + |
| + if (lastFieldNumber != fieldNumber) { |
| + final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber)); |
| + // If there is a field named "" (empty string) then we |
| + // will get 0 on this comparison, yet, it's "OK". But |
| + // it's not OK if two different field numbers map to |
| + // the same name. |
| + if (cmp != 0 || lastFieldNumber != -1) |
| + return cmp; |
| + } |
| + |
| + UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1); |
| + UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2); |
| + final int len; |
| + if (utf16Result1.length < utf16Result2.length) |
| + len = utf16Result1.length; |
| + else |
| + len = utf16Result2.length; |
| + |
| + for(int i=0;i<len;i++) { |
| + final char ch1 = utf16Result1.result[i]; |
| + final char ch2 = utf16Result2.result[i]; |
| + if (ch1 != ch2) |
| + return ch1-ch2; |
| + } |
| + return utf16Result1.length - utf16Result2.length; |
| + } |
| + |
| + /** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set. |
| + Term must be lexicographically greater than all previous Terms added. |
| + TermInfo pointers must be positive and greater than all previous.*/ |
| + void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti) |
| + throws IOException { |
| + |
| + assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 || |
| + (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) : |
| + "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" + |
| + " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" + |
| + " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8"); |
| + |
| + assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")"; |
| + assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")"; |
| + |
| + if (!isIndex && size % indexInterval == 0) |
| + other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term |
| + |
| + writeTerm(fieldNumber, termBytes, termBytesLength); // write term |
| + |
| + output.writeVInt(ti.docFreq); // write doc freq |
| + output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers |
| + output.writeVLong(ti.proxPointer - lastTi.proxPointer); |
| + |
| + if (ti.docFreq >= skipInterval) { |
| + output.writeVInt(ti.skipOffset); |
| + } |
| + |
| + if (isIndex) { |
| + output.writeVLong(other.output.getFilePointer() - lastIndexPointer); |
| + lastIndexPointer = other.output.getFilePointer(); // write pointer |
| + } |
| + |
| + lastFieldNumber = fieldNumber; |
| + lastTi.set(ti); |
| + size++; |
| + } |
| + |
| + private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength) |
| + throws IOException { |
| + |
| + // TODO: UTF16toUTF8 could tell us this prefix |
| + // Compute prefix in common with last term: |
| + int start = 0; |
| + final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength; |
| + while(start < limit) { |
| + if (termBytes[start] != lastTermBytes[start]) |
| + break; |
| + start++; |
| + } |
| + |
| + final int length = termBytesLength - start; |
| + output.writeVInt(start); // write shared prefix length |
| + output.writeVInt(length); // write delta length |
| + output.writeBytes(termBytes, start, length); // write delta bytes |
| + output.writeVInt(fieldNumber); // write field num |
| + if (lastTermBytes.length < termBytesLength) { |
| + lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength); |
| + } |
| + System.arraycopy(termBytes, start, lastTermBytes, start, length); |
| + lastTermBytesLength = termBytesLength; |
| + } |
| + |
| + /** Called to complete TermInfos creation. */ |
| + void close() throws IOException { |
| + output.seek(4); // write size after format |
| + output.writeLong(size); |
| + output.close(); |
| + |
| + if (!isIndex) |
| + other.close(); |
| + } |
| + |
| +} |
| |
| Property changes on: lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java |
| ___________________________________________________________________ |
| Added: svn:eol-style |
| + native |
| |
| Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java |
| =================================================================== |
| --- lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 0) |
| +++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 0) |
| @@ -0,0 +1,206 @@ |
| +package org.apache.lucene.index.codecs.preflex; |
| + |
| +/** |
| + * Licensed to the Apache Software Foundation (ASF) under one or more |
| + * contributor license agreements. See the NOTICE file distributed with |
| + * this work for additional information regarding copyright ownership. |
| + * The ASF licenses this file to You under the Apache License, Version 2.0 |
| + * (the "License"); you may not use this file except in compliance with |
| + * the License. You may obtain a copy of the License at |
| + * |
| + * http://www.apache.org/licenses/LICENSE-2.0 |
| + * |
| + * Unless required by applicable law or agreed to in writing, software |
| + * distributed under the License is distributed on an "AS IS" BASIS, |
| + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| + * See the License for the specific language governing permissions and |
| + * limitations under the License. |
| + */ |
| + |
| +import org.apache.lucene.store.*; |
| +import org.apache.lucene.index.*; |
| +import org.apache.lucene.index.codecs.*; |
| +import org.apache.lucene.util.*; |
| + |
| +import java.util.*; |
| +import java.io.IOException; |
| + |
| +public class TestSurrogates extends LuceneTestCase { |
| + |
| + private static final boolean DEBUG = false; |
| + |
| + // like Term, but uses BytesRef for text |
| + private static class FieldAndText implements Comparable<FieldAndText> { |
| + String field; |
| + BytesRef text; |
| + |
| + public FieldAndText(Term t) { |
| + field = t.field(); |
| + text = new BytesRef(t.text()); |
| + } |
| + |
| + public int compareTo(FieldAndText other) { |
| + if (other.field == field) { |
| + return text.compareTo(other.text); |
| + } else { |
| + return field.compareTo(other.field); |
| + } |
| + } |
| + } |
| + |
| + // chooses from a very limited alphabet to exacerbate the |
| + // surrogate seeking required |
| + private static String makeDifficultRandomUnicodeString(Random r) { |
| + final int end = r.nextInt(20); |
| + if (end == 0) { |
| + // allow 0 length |
| + return ""; |
| + } |
| + final char[] buffer = new char[end]; |
| + for (int i = 0; i < end; i++) { |
| + int t = r.nextInt(5); |
| + |
| + if (0 == t && i < end - 1) { |
| + // hi |
| + buffer[i++] = (char) 0xd800; |
| + // lo |
| + buffer[i] = (char) 0xdc00; |
| + } else if (t <= 3) { |
| + buffer[i] = 'a'; |
| + } else if (4 == t) { |
| + buffer[i] = 0xe000; |
| + } |
| + } |
| + |
| + return new String(buffer, 0, end); |
| + } |
| + |
| + private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List<FieldAndText> fieldTerms) throws IOException { |
| + |
| + final int numField = _TestUtil.nextInt(r, 2, 5); |
| + |
| + List<Term> terms = new ArrayList<Term>(); |
| + |
| + int tc = 0; |
| + |
| + for(int f=0;f<numField;f++) { |
| + String field = "f" + f; |
| + Term protoTerm = new Term(field); |
| + |
| + fieldInfos.add(field, true, false, false, false, false, false, false); |
| + final int numTerms = 1000*_TestUtil.getRandomMultiplier(); |
| + for(int i=0;i<numTerms;i++) { |
| + String s; |
| + if (r.nextInt(3) == 1) { |
| + s = makeDifficultRandomUnicodeString(r); |
| + } else { |
| + s = _TestUtil.randomUnicodeString(r); |
| + } |
| + terms.add(protoTerm.createTerm(s + "_" + (tc++))); |
| + } |
| + } |
| + |
| + fieldInfos.write(dir, segName); |
| + |
| + // sorts in UTF16 order, just like preflex: |
| + Collections.sort(terms); |
| + |
| + TermInfosWriter w = new TermInfosWriter(dir, segName, fieldInfos, 128); |
| + TermInfo ti = new TermInfo(); |
| + BytesRef utf8 = new BytesRef(10); |
| + String lastText = null; |
| + int uniqueTermCount = 0; |
| + if (DEBUG) { |
| + System.out.println("TEST: utf16 order:"); |
| + } |
| + for(Term t : terms) { |
| + FieldInfo fi = fieldInfos.fieldInfo(t.field()); |
| + |
| + String text = t.text(); |
| + if (lastText != null && lastText.equals(text)) { |
| + continue; |
| + } |
| + fieldTerms.add(new FieldAndText(t)); |
| + uniqueTermCount++; |
| + lastText = text; |
| + UnicodeUtil.UTF16toUTF8(text, 0, text.length(), utf8); |
| + |
| + if (DEBUG) { |
| + System.out.println(" " + toHexString(t)); |
| + } |
| + w.add(fi.number, utf8.bytes, utf8.length, ti); |
| + } |
| + w.close(); |
| + |
| + Collections.sort(fieldTerms); |
| + if (DEBUG) { |
| + System.out.println("\nTEST: codepoint order"); |
| + for(FieldAndText t: fieldTerms) { |
| + System.out.println(" " + t.field + ":" + UnicodeUtil.toHexString(t.text.utf8ToString())); |
| + } |
| + } |
| + |
| + dir.createOutput(segName + ".prx").close(); |
| + dir.createOutput(segName + ".frq").close(); |
| + |
| + // !!hack alert!! stuffing uniqueTermCount in as docCount |
| + return new SegmentInfo(segName, uniqueTermCount, dir, false, -1, null, false, true, codec); |
| + } |
| + |
| + private String toHexString(Term t) { |
| + return t.field() + ":" + UnicodeUtil.toHexString(t.text()); |
| + } |
| + |
| + public void testSurrogatesOrder() throws Exception { |
| + Directory dir = new MockRAMDirectory(); |
| + |
| + Codec codec = new PreFlexCodec(); |
| + |
| + Random r = newRandom(); |
| + FieldInfos fieldInfos = new FieldInfos(); |
| + List<FieldAndText> fieldTerms = new ArrayList<FieldAndText>(); |
| + SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms); |
| + |
| + // hack alert!! |
| + int uniqueTermCount = si.docCount; |
| + |
| + FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1)); |
| + assertNotNull(fields); |
| + |
| + if (DEBUG) { |
| + System.out.println("\nTEST: now enum"); |
| + } |
| + FieldsEnum fieldsEnum = fields.iterator(); |
| + String field; |
| + UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result(); |
| + |
| + int termCount = 0; |
| + while((field = fieldsEnum.next()) != null) { |
| + TermsEnum termsEnum = fieldsEnum.terms(); |
| + BytesRef text; |
| + BytesRef lastText = null; |
| + while((text = termsEnum.next()) != null) { |
| + UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16); |
| + if (DEBUG) { |
| + System.out.println("got term=" + field + ":" + UnicodeUtil.toHexString(new String(utf16.result, 0, utf16.length))); |
| + System.out.println(); |
| + } |
| + if (lastText == null) { |
| + lastText = new BytesRef(text); |
| + } else { |
| + assertTrue(lastText.compareTo(text) < 0); |
| + lastText.copy(text); |
| + } |
| + assertEquals(fieldTerms.get(termCount).field, field); |
| + assertEquals(fieldTerms.get(termCount).text, text); |
| + termCount++; |
| + } |
| + if (DEBUG) { |
| + System.out.println(" no more terms for field=" + field); |
| + } |
| + } |
| + assertEquals(uniqueTermCount, termCount); |
| + |
| + fields.close(); |
| + } |
| +} |
| |
| Property changes on: lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java |
| ___________________________________________________________________ |
| Added: svn:eol-style |
| + native |
| |
| Index: lucene/src/test/org/apache/lucene/util/_TestUtil.java |
| =================================================================== |
| --- lucene/src/test/org/apache/lucene/util/_TestUtil.java (revision 956375) |
| +++ lucene/src/test/org/apache/lucene/util/_TestUtil.java (working copy) |
| @@ -141,7 +141,7 @@ |
| else if (t <= 1) buffer[i] = (char) r.nextInt(0x80); |
| else if (2 == t) buffer[i] = (char) nextInt(r, 0x80, 0x800); |
| else if (3 == t) buffer[i] = (char) nextInt(r, 0x800, 0xd7ff); |
| - else if (4 == t) buffer[i] = (char) nextInt(r, 0xe000, 0xffff); |
| + else if (4 == t) buffer[i] = (char) nextInt(r, 0xe000, 0xfffe); |
| } |
| return new String(buffer, 0, end); |
| } |
| Index: lucene/src/test/org/apache/lucene/util/TestNumericUtils.java |
| =================================================================== |
| --- lucene/src/test/org/apache/lucene/util/TestNumericUtils.java (revision 956375) |
| +++ lucene/src/test/org/apache/lucene/util/TestNumericUtils.java (working copy) |
| @@ -30,7 +30,7 @@ |
| NumericUtils.longToPrefixCoded(l, 0, act); |
| if (last!=null) { |
| // test if smaller |
| - assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 ); |
| + assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 ); |
| assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 ); |
| } |
| // test is back and forward conversion works |
| @@ -48,7 +48,7 @@ |
| NumericUtils.intToPrefixCoded(i, 0, act); |
| if (last!=null) { |
| // test if smaller |
| - assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 ); |
| + assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 ); |
| assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 ); |
| } |
| // test is back and forward conversion works |
| @@ -84,7 +84,7 @@ |
| |
| // check sort order (prefixVals should be ascending) |
| for (int i=1; i<prefixVals.length; i++) { |
| - assertTrue( "check sort order", BytesRef.getUTF8SortedAsUTF16Comparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 ); |
| + assertTrue( "check sort order", BytesRef.getUTF8SortedAsUnicodeComparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 ); |
| } |
| |
| // check the prefix encoding, lower precision should have the difference to original value equal to the lower removed bits |
| @@ -124,7 +124,7 @@ |
| |
| // check sort order (prefixVals should be ascending) |
| for (int i=1; i<prefixVals.length; i++) { |
| - assertTrue( "check sort order", BytesRef.getUTF8SortedAsUTF16Comparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 ); |
| + assertTrue( "check sort order", BytesRef.getUTF8SortedAsUnicodeComparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 ); |
| } |
| |
| // check the prefix encoding, lower precision should have the difference to original value equal to the lower removed bits |
| Index: lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (working copy) |
| @@ -103,7 +103,7 @@ |
| // build a cache of sorted transitions for every state |
| allTransitions = new Transition[runAutomaton.getSize()][]; |
| for (State state : this.automaton.getNumberedStates()) { |
| - state.sortTransitions(Transition.CompareByMinMaxThenDestUTF8InUTF16Order); |
| + state.sortTransitions(Transition.CompareByMinMaxThenDest); |
| state.trimTransitionsArray(); |
| allTransitions[state.getNumber()] = state.transitionsArray; |
| } |
| @@ -158,11 +158,7 @@ |
| // seek to the next possible string; |
| if (nextString()) { |
| // reposition |
| - |
| - // FIXME: this is really bad to turn off |
| - // but it cannot work correctly until terms are in utf8 order. |
| - linear = false; |
| - |
| + |
| if (linear) |
| setLinear(infinitePosition); |
| return seekBytesRef; |
| @@ -188,15 +184,15 @@ |
| } |
| for (int i = 0; i < allTransitions[state].length; i++) { |
| Transition t = allTransitions[state][i]; |
| - if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 && |
| - compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) { |
| + if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) && |
| + (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) { |
| maxInterval = t.getMax(); |
| break; |
| } |
| } |
| - // 0xef terms don't get the optimization... not worth the trouble. |
| - if (maxInterval != 0xef) |
| - maxInterval = incrementUTF16(maxInterval); |
| + // 0xff terms don't get the optimization... not worth the trouble. |
| + if (maxInterval != 0xff) |
| + maxInterval = incrementUTF8(maxInterval); |
| int length = position + 1; /* position + maxTransition */ |
| if (linearUpperBound.bytes.length < length) |
| linearUpperBound.bytes = new byte[length]; |
| @@ -281,7 +277,7 @@ |
| // if the next character is U+FFFF and is not part of the useful portion, |
| // then by definition it puts us in a reject state, and therefore this |
| // path is dead. there cannot be any higher transitions. backtrack. |
| - c = incrementUTF16(c); |
| + c = incrementUTF8(c); |
| if (c == -1) |
| return false; |
| } |
| @@ -295,8 +291,8 @@ |
| |
| for (int i = 0; i < transitions.length; i++) { |
| Transition transition = transitions[i]; |
| - if (compareToUTF16(transition.getMax(), c) >= 0) { |
| - int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? c : transition.getMin(); |
| + if (transition.getMax() >= c) { |
| + int nextChar = Math.max(c, transition.getMin()); |
| // append either the next sequential char, or the minimum transition |
| seekBytesRef.grow(seekBytesRef.length + 1); |
| seekBytesRef.length++; |
| @@ -342,9 +338,9 @@ |
| private boolean backtrack(int position) { |
| while (position > 0) { |
| int nextChar = seekBytesRef.bytes[position - 1] & 0xff; |
| - // if a character is 0xef its a dead-end too, |
| - // because there is no higher character in UTF-16 sort order. |
| - nextChar = incrementUTF16(nextChar); |
| + // if a character is 0xff it's a dead-end too, |
| + // because there is no higher character in UTF-8 sort order. |
| + nextChar = incrementUTF8(nextChar); |
| if (nextChar != -1) { |
| seekBytesRef.bytes[position - 1] = (byte) nextChar; |
| seekBytesRef.length = position; |
| @@ -355,34 +351,11 @@ |
| return false; /* all solutions exhausted */ |
| } |
| |
| - /* return the next utf8 byte in utf16 order, or -1 if exhausted */ |
| - private final int incrementUTF16(int utf8) { |
| + /* return the next utf8 byte in utf8 order, or -1 if exhausted */ |
| + private final int incrementUTF8(int utf8) { |
| switch(utf8) { |
| - case 0xed: return 0xf0; |
| - case 0xfd: return 0xee; |
| - case 0xee: return 0xef; |
| - case 0xef: return -1; |
| + case 0xff: return -1; |
| default: return utf8 + 1; |
| } |
| } |
| - |
| - int compareToUTF16(int aByte, int bByte) { |
| - if (aByte != bByte) { |
| - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order |
| - |
| - // We know the terms are not equal, but, we may |
| - // have to carefully fixup the bytes at the |
| - // difference to match UTF16's sort order: |
| - if (aByte >= 0xee && bByte >= 0xee) { |
| - if ((aByte & 0xfe) == 0xee) { |
| - aByte += 0x10; |
| - } |
| - if ((bByte&0xfe) == 0xee) { |
| - bByte += 0x10; |
| - } |
| - } |
| - return aByte - bByte; |
| - } |
| - return 0; |
| - } |
| } |
| Index: lucene/src/java/org/apache/lucene/index/FieldInfos.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/index/FieldInfos.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/index/FieldInfos.java (working copy) |
| @@ -53,7 +53,7 @@ |
| private final HashMap<String,FieldInfo> byName = new HashMap<String,FieldInfo>(); |
| private int format; |
| |
| - FieldInfos() { } |
| + public FieldInfos() { } |
| |
| /** |
| * Construct a FieldInfos object using the directory and the name of the file |
| @@ -62,7 +62,7 @@ |
| * @param name The name of the file to open the IndexInput from in the Directory |
| * @throws IOException |
| */ |
| - FieldInfos(Directory d, String name) throws IOException { |
| + public FieldInfos(Directory d, String name) throws IOException { |
| IndexInput input = d.openInput(name); |
| try { |
| read(input, name); |
| Index: lucene/src/java/org/apache/lucene/index/TermsEnum.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/index/TermsEnum.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/index/TermsEnum.java (working copy) |
| @@ -144,8 +144,7 @@ |
| |
| @Override |
| public Comparator<BytesRef> getComparator() { |
| - // return an unused dummy to prevent NPE |
| - return BytesRef.getUTF8SortedAsUTF16Comparator(); |
| + return null; |
| } |
| |
| @Override |
| Index: lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (working copy) |
| @@ -130,7 +130,7 @@ |
| |
| // TODO: we may want to make this sort in same order |
| // as Codec's terms dict? |
| - final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUTF16Comparator()); |
| + final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator()); |
| |
| tvf.writeVInt(numPostings); |
| byte bits = 0x0; |
| Index: lucene/src/java/org/apache/lucene/index/IndexWriter.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/index/IndexWriter.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/index/IndexWriter.java (working copy) |
| @@ -3964,7 +3964,7 @@ |
| // commit merged deletes |
| SegmentReader reader = merge.readers[i] = readerPool.get(info, merge.mergeDocStores, |
| MERGE_READ_BUFFER_SIZE, |
| - -1); |
| + -config.getReaderTermsIndexDivisor()); |
| |
| // We clone the segment readers because other |
| // deletes may come in while we're merging so we |
| Index: lucene/src/java/org/apache/lucene/index/CheckIndex.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/index/CheckIndex.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/index/CheckIndex.java (working copy) |
| @@ -32,7 +32,7 @@ |
| import java.io.IOException; |
| import java.io.File; |
| import java.util.Collection; |
| - |
| +import java.util.Comparator; |
| import java.util.List; |
| import java.util.ArrayList; |
| import java.util.Map; |
| @@ -596,6 +596,10 @@ |
| boolean hasOrd = true; |
| final long termCountStart = status.termCount; |
| |
| + BytesRef lastTerm = null; |
| + |
| + Comparator<BytesRef> termComp = terms.getComparator(); |
| + |
| while(true) { |
| |
| final BytesRef term = terms.next(); |
| @@ -603,6 +607,17 @@ |
| break; |
| } |
| |
| + // make sure terms arrive in order according to |
| + // the comp |
| + if (lastTerm == null) { |
| + lastTerm = new BytesRef(term); |
| + } else { |
| + if (termComp.compare(lastTerm, term) >= 0) { |
| + throw new RuntimeException("terms out of order: lastTerm=" + lastTerm + " term=" + term); |
| + } |
| + lastTerm.copy(term); |
| + } |
| + |
| final int docFreq = terms.docFreq(); |
| status.totFreq += docFreq; |
| |
| Index: lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (working copy) |
| @@ -80,7 +80,7 @@ |
| // Terms dict |
| success = false; |
| try { |
| - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUTF16Comparator()); |
| + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); |
| success = true; |
| return ret; |
| } finally { |
| @@ -111,7 +111,7 @@ |
| state.fieldInfos, |
| state.segmentInfo.name, |
| state.termsIndexDivisor, |
| - BytesRef.getUTF8SortedAsUTF16Comparator()); |
| + BytesRef.getUTF8SortedAsUnicodeComparator()); |
| success = true; |
| } finally { |
| if (!success) { |
| @@ -126,7 +126,7 @@ |
| state.dir, state.fieldInfos, state.segmentInfo.name, |
| pulsingReader, |
| state.readBufferSize, |
| - BytesRef.getUTF8SortedAsUTF16Comparator(), |
| + BytesRef.getUTF8SortedAsUnicodeComparator(), |
| StandardCodec.TERMS_CACHE_SIZE); |
| success = true; |
| return ret; |
| Index: lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (working copy) |
| @@ -63,7 +63,7 @@ |
| |
| success = false; |
| try { |
| - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator()); |
| + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); |
| success = true; |
| return ret; |
| } finally { |
| @@ -95,7 +95,7 @@ |
| state.fieldInfos, |
| state.segmentInfo.name, |
| state.termsIndexDivisor, |
| - BytesRef.getUTF8SortedAsUTF16Comparator()); |
| + BytesRef.getUTF8SortedAsUnicodeComparator()); |
| success = true; |
| } finally { |
| if (!success) { |
| @@ -111,7 +111,7 @@ |
| state.segmentInfo.name, |
| postingsReader, |
| state.readBufferSize, |
| - BytesRef.getUTF8SortedAsUTF16Comparator(), |
| + BytesRef.getUTF8SortedAsUnicodeComparator(), |
| StandardCodec.TERMS_CACHE_SIZE); |
| success = true; |
| return ret; |
| Index: lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (working copy) |
| @@ -104,7 +104,7 @@ |
| indexInterval = in.readInt(); |
| this.indexDivisor = indexDivisor; |
| |
| - if (indexDivisor == -1) { |
| + if (indexDivisor < 0) { |
| totalIndexInterval = indexInterval; |
| } else { |
| // In case terms index gets loaded, later, on demand |
| @@ -131,7 +131,7 @@ |
| } |
| success = true; |
| } finally { |
| - if (indexDivisor != -1) { |
| + if (indexDivisor > 0) { |
| in.close(); |
| this.in = null; |
| if (success) { |
| Index: lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (working copy) |
| @@ -58,7 +58,7 @@ |
| |
| success = false; |
| try { |
| - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUTF16Comparator()); |
| + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator()); |
| success = true; |
| return ret; |
| } finally { |
| @@ -85,7 +85,7 @@ |
| state.fieldInfos, |
| state.segmentInfo.name, |
| state.termsIndexDivisor, |
| - BytesRef.getUTF8SortedAsUTF16Comparator()); |
| + BytesRef.getUTF8SortedAsUnicodeComparator()); |
| success = true; |
| } finally { |
| if (!success) { |
| @@ -101,7 +101,7 @@ |
| state.segmentInfo.name, |
| postings, |
| state.readBufferSize, |
| - BytesRef.getUTF8SortedAsUTF16Comparator(), |
| + BytesRef.getUTF8SortedAsUnicodeComparator(), |
| TERMS_CACHE_SIZE); |
| success = true; |
| return ret; |
| Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (working copy) |
| @@ -53,6 +53,7 @@ |
| long indexPointer = 0; |
| int indexInterval; |
| int skipInterval; |
| + int newSuffixStart; |
| int maxSkipLevels; |
| private int formatM1SkipInterval; |
| |
| @@ -136,6 +137,7 @@ |
| |
| prevBuffer.set(termBuffer); |
| termBuffer.read(input, fieldInfos); |
| + newSuffixStart = termBuffer.newSuffixStart; |
| |
| termInfo.docFreq = input.readVInt(); // read doc freq |
| termInfo.freqPointer += input.readVLong(); // read freq pointer |
| Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (working copy) |
| @@ -19,7 +19,6 @@ |
| |
| import java.io.IOException; |
| import org.apache.lucene.store.IndexInput; |
| -import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.UnicodeUtil; |
| import org.apache.lucene.index.Term; |
| @@ -34,6 +33,8 @@ |
| private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result(); |
| private BytesRef bytes = new BytesRef(10); |
| |
| + int newSuffixStart; |
| + |
| public final int compareTo(TermBuffer other) { |
| if (field == other.field) // fields are interned |
| return compareChars(text.result, text.length, other.text.result, other.text.length); |
| @@ -60,23 +61,33 @@ |
| int start = input.readVInt(); |
| int length = input.readVInt(); |
| int totalLength = start + length; |
| + if (bytes.bytes.length < totalLength) { |
| + bytes.grow(totalLength); |
| + } |
| if (dirty) { |
| // Fully convert all bytes since bytes is dirty |
| UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes); |
| - if (bytes.bytes.length < totalLength) |
| - bytes.bytes = new byte[totalLength]; |
| bytes.length = totalLength; |
| input.readBytes(bytes.bytes, start, length); |
| UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text); |
| dirty = false; |
| } else { |
| // Incrementally convert only the UTF8 bytes that are new: |
| - if (bytes.bytes.length < totalLength) |
| - bytes.bytes = ArrayUtil.grow(bytes.bytes, totalLength); |
| bytes.length = totalLength; |
| input.readBytes(bytes.bytes, start, length); |
| UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text); |
| } |
| + |
| + while(true) { |
| + newSuffixStart = text.offsets[start]; |
| + if (newSuffixStart != -1) { |
| + break; |
| + } |
| + if (--start == 0) { |
| + newSuffixStart = 0; |
| + break; |
| + } |
| + } |
| this.field = fieldInfos.fieldName(input.readVInt()); |
| } |
| |
| @@ -124,10 +135,11 @@ |
| try { |
| clone = (TermBuffer)super.clone(); |
| } catch (CloneNotSupportedException e) {} |
| - |
| clone.dirty = true; |
| clone.bytes = new BytesRef(10); |
| clone.text = new UnicodeUtil.UTF16Result(); |
| + clone.text.offsets = new int[text.offsets.length]; |
| + System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length); |
| clone.text.copyText(text); |
| return clone; |
| } |
| Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (working copy) |
| @@ -39,11 +39,15 @@ |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.util.Bits; |
| import org.apache.lucene.util.BytesRef; |
| +import org.apache.lucene.util.UnicodeUtil; |
| +import org.apache.lucene.util.ArrayUtil; |
| |
| /** Exposes flex API on a pre-flex index, as a codec. |
| * @lucene.experimental */ |
| public class PreFlexFields extends FieldsProducer { |
| |
| + private static final boolean DEBUG_SURROGATES = false; |
| + |
| public TermInfosReader tis; |
| public final TermInfosReader tisNoIndex; |
| |
| @@ -60,6 +64,16 @@ |
| throws IOException { |
| |
| si = info; |
| + |
| + // NOTE: we must always load terms index, even for |
| + // "sequential" scan during merging, because what is |
| + // sequential to merger may not be to TermInfosReader |
| + // since we do the surrogates dance: |
| + // nocommit -- how to pull right value from IW? |
| + if (indexDivisor < 0) { |
| + indexDivisor = -indexDivisor; |
| + } |
| + |
| TermInfosReader r = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize, indexDivisor); |
| if (indexDivisor == -1) { |
| tisNoIndex = r; |
| @@ -174,7 +188,6 @@ |
| private class PreFlexFieldsEnum extends FieldsEnum { |
| final Iterator<FieldInfo> it; |
| private final PreTermsEnum termsEnum; |
| - private int count; |
| FieldInfo current; |
| |
| public PreFlexFieldsEnum() throws IOException { |
| @@ -185,7 +198,6 @@ |
| @Override |
| public String next() { |
| if (it.hasNext()) { |
| - count++; |
| current = it.next(); |
| return current.name; |
| } else { |
| @@ -195,7 +207,7 @@ |
| |
| @Override |
| public TermsEnum terms() throws IOException { |
| - termsEnum.reset(current, count == 1); |
| + termsEnum.reset(current); |
| return termsEnum; |
| } |
| } |
| @@ -209,14 +221,15 @@ |
| @Override |
| public TermsEnum iterator() throws IOException { |
| PreTermsEnum termsEnum = new PreTermsEnum(); |
| - termsEnum.reset(fieldInfo, false); |
| + termsEnum.reset(fieldInfo); |
| return termsEnum; |
| } |
| |
| @Override |
| public Comparator<BytesRef> getComparator() { |
| - // Pre-flex indexes always sorted in UTF16 order |
| - return BytesRef.getUTF8SortedAsUTF16Comparator(); |
| + // Pre-flex indexes always sorted in UTF16 order, but |
| + // we remap on-the-fly to unicode order |
| + return BytesRef.getUTF8SortedAsUnicodeComparator(); |
| } |
| } |
| |
| @@ -227,37 +240,229 @@ |
| private BytesRef current; |
| private final BytesRef scratchBytesRef = new BytesRef(); |
| |
| - void reset(FieldInfo fieldInfo, boolean isFirstField) throws IOException { |
| + private int[] surrogateSeekPending = new int[1]; |
| + private boolean[] surrogateDidSeekBack = new boolean[1]; |
| + private int surrogateSeekUpto; |
| + private char[] pendingPrefix; |
| + |
| + private SegmentTermEnum seekTermEnum; |
| + private Term protoTerm; |
| + private int newSuffixStart; |
| + |
| + void reset(FieldInfo fieldInfo) throws IOException { |
| this.fieldInfo = fieldInfo; |
| + protoTerm = new Term(fieldInfo.name); |
| if (termEnum == null) { |
| - // First time reset is called |
| - if (isFirstField) { |
| - termEnum = getTermsDict().terms(); |
| - skipNext = false; |
| - } else { |
| - termEnum = getTermsDict().terms(new Term(fieldInfo.name, "")); |
| - skipNext = true; |
| - } |
| + termEnum = getTermsDict().terms(protoTerm); |
| + seekTermEnum = getTermsDict().terms(protoTerm); |
| } else { |
| - final Term t = termEnum.term(); |
| - if (t != null && t.field() == fieldInfo.name) { |
| - // No need to seek -- we have already advanced onto |
| - // this field. We must be @ first term because |
| - // flex API will not advance this enum further, on |
| - // seeing a different field. |
| - } else { |
| - assert t == null || !t.field().equals(fieldInfo.name); // make sure field name is interned |
| - final TermInfosReader tis = getTermsDict(); |
| - tis.seekEnum(termEnum, new Term(fieldInfo.name, "")); |
| + getTermsDict().seekEnum(termEnum, protoTerm); |
| + } |
| + skipNext = true; |
| + |
| + surrogateSeekUpto = 0; |
| + newSuffixStart = 0; |
| + |
| + surrogatesDance(); |
| + } |
| + |
| + private void surrogatesDance() throws IOException { |
| + |
| + // Tricky: prior to 4.0, Lucene index sorted terms in |
| + // UTF16 order, but as of 4.0 we sort by Unicode code |
| + // point order. These orders differ because of the |
| +    // surrogates; so we have to fixup our enum, here, by |
| + // carefully first seeking past the surrogates and |
| + // then back again at the end. The process is |
| + // recursive, since any given term could have multiple |
| + // new occurrences of surrogate pairs, so we use a |
| + // stack to record the pending seek-backs. |
| + if (DEBUG_SURROGATES) { |
| + System.out.println(" dance start term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text()))); |
| + } |
| + |
| + while(popPendingSeek()); |
| + while(pushNewSurrogate()); |
| + } |
| + |
| + // only for debugging |
| + private String getStack() { |
| + if (surrogateSeekUpto == 0) { |
| + return "null"; |
| + } else { |
| + StringBuffer sb = new StringBuffer(); |
| + for(int i=0;i<surrogateSeekUpto;i++) { |
| + if (i > 0) { |
| + sb.append(' '); |
| + } |
| + sb.append(surrogateSeekPending[i]); |
| } |
| - skipNext = true; |
| + sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1])); |
| + return sb.toString(); |
| } |
| } |
| |
| + private boolean popPendingSeek() throws IOException { |
| + if (DEBUG_SURROGATES) { |
| + System.out.println(" check pop newSuffix=" + newSuffixStart + " stack=" + getStack()); |
| + } |
| + // if a .next() has advanced beyond the |
| + // after-surrogates range we had last seeked to, we |
| + // must seek back to the start and resume .next from |
| + // there. this pops the pending seek off the stack. |
| + final Term t = termEnum.term(); |
| + if (surrogateSeekUpto > 0) { |
| + final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1]; |
| + if (DEBUG_SURROGATES) { |
| + System.out.println(" seekPrefix=" + seekPrefix); |
| + } |
| + if (newSuffixStart < seekPrefix) { |
| + assert pendingPrefix != null; |
| + assert pendingPrefix.length > seekPrefix; |
| + pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START; |
| + Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix)); |
| + if (DEBUG_SURROGATES) { |
| + System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text())); |
| + } |
| + getTermsDict().seekEnum(termEnum, t2); |
| + surrogateDidSeekBack[surrogateSeekUpto-1] = true; |
| + |
| + // +2 because we don't want to re-check the |
| + // surrogates we just seek'd back to |
| + newSuffixStart = seekPrefix + 2; |
| + return true; |
| + } else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) { |
| + assert pendingPrefix != null; |
| + assert pendingPrefix.length > seekPrefix; |
| + pendingPrefix[seekPrefix] = 0xffff; |
| + Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix)); |
| + if (DEBUG_SURROGATES) { |
| + System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text())); |
| + } |
| + getTermsDict().seekEnum(termEnum, t2); |
| + if (DEBUG_SURROGATES) { |
| + System.out.println(" found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text()))); |
| + } |
| + surrogateSeekUpto--; |
| + |
| + if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) { |
| + // force pop |
| + newSuffixStart = -1; |
| + } else { |
| + newSuffixStart = termEnum.newSuffixStart; |
| + } |
| + |
| + return true; |
| + } |
| + } |
| + |
| + return false; |
| + } |
| + |
| + private boolean pushNewSurrogate() throws IOException { |
| + if (DEBUG_SURROGATES) { |
| + System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack()); |
| + } |
| + final Term t = termEnum.term(); |
| + if (t == null || t.field() != fieldInfo.name) { |
| + return false; |
| + } |
| + final String text = t.text(); |
| + final int textLen = text.length(); |
| + |
| + for(int i=Math.max(0,newSuffixStart);i<textLen;i++) { |
| + final char ch = text.charAt(i); |
| + if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) { |
| + |
| + if (DEBUG_SURROGATES) { |
| + System.out.println(" found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i); |
| + } |
| + |
| + // the next() that we just did read in a new |
| + // suffix, containing a surrogate pair |
| + |
| + // seek forward to see if there are any terms with |
| + // this same prefix, but with characters after the |
| + // surrogate range; if so, we must first iterate |
| + // them, then seek back to the surrogates |
| + |
| + char[] testPrefix = new char[i+1]; |
| + for(int j=0;j<i;j++) { |
| + testPrefix[j] = text.charAt(j); |
| + } |
| + testPrefix[i] = 1+UnicodeUtil.UNI_SUR_LOW_END; |
| + |
| + getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(new String(testPrefix))); |
| + |
| + Term t2 = seekTermEnum.term(); |
| + boolean isPrefix; |
| + if (t2 != null && t2.field() == fieldInfo.name) { |
| + String seekText = t2.text(); |
| + isPrefix = true; |
| + if (DEBUG_SURROGATES) { |
| + System.out.println(" seek found " + UnicodeUtil.toHexString(seekText)); |
| + } |
| + for(int j=0;j<i;j++) { |
| + if (testPrefix[j] != seekText.charAt(j)) { |
| + isPrefix = false; |
| + break; |
| + } |
| + } |
| + if (DEBUG_SURROGATES && !isPrefix) { |
| + System.out.println(" no end terms"); |
| + } |
| + } else { |
| + if (DEBUG_SURROGATES) { |
| + System.out.println(" no end terms"); |
| + } |
| + isPrefix = false; |
| + } |
| + |
| + if (isPrefix) { |
| + // we found a term, sharing the same prefix, |
| + // with characters after the surrogates, so we |
| +            // must first enum those, and then return |
| +            // the surrogates afterwards.  push that pending |
| + // seek on the surrogates stack now: |
| + pendingPrefix = testPrefix; |
| + |
| + getTermsDict().seekEnum(termEnum, t2); |
| + |
| + if (surrogateSeekUpto == surrogateSeekPending.length) { |
| + surrogateSeekPending = ArrayUtil.grow(surrogateSeekPending); |
| + } |
| + if (surrogateSeekUpto == surrogateDidSeekBack.length) { |
| + surrogateDidSeekBack = ArrayUtil.grow(surrogateDidSeekBack); |
| + } |
| + surrogateSeekPending[surrogateSeekUpto] = i; |
| + surrogateDidSeekBack[surrogateSeekUpto] = false; |
| + surrogateSeekUpto++; |
| + |
| + if (DEBUG_SURROGATES) { |
| + System.out.println(" do push " + i + "; end term=" + UnicodeUtil.toHexString(t2.text())); |
| + } |
| + |
| + newSuffixStart = i+1; |
| + |
| + return true; |
| + } else { |
| + // there are no terms after the surrogates, so |
| + // we do nothing to the enum and just step |
| + // through the surrogates like normal. but we |
| + // must keep iterating through the term, in case |
| + // another surrogate pair appears later |
| + } |
| + } |
| + } |
| + |
| + return false; |
| + } |
| + |
| @Override |
| public Comparator<BytesRef> getComparator() { |
| - // Pre-flex indexes always sorted in UTF16 order |
| - return BytesRef.getUTF8SortedAsUTF16Comparator(); |
| + // Pre-flex indexes always sorted in UTF16 order, but |
| + // we remap on-the-fly to unicode order |
| + return BytesRef.getUTF8SortedAsUnicodeComparator(); |
| } |
| |
| @Override |
| @@ -272,14 +477,24 @@ |
| |
| @Override |
| public SeekStatus seek(BytesRef term, boolean useCache) throws IOException { |
| + if (DEBUG_SURROGATES) { |
| + System.out.println("TE.seek() term=" + term.utf8ToString()); |
| + } |
| skipNext = false; |
| final TermInfosReader tis = getTermsDict(); |
| - final Term t0 = new Term(fieldInfo.name, term.utf8ToString()); |
| + final Term t0 = protoTerm.createTerm(term.utf8ToString()); |
| + |
| + assert termEnum != null; |
| + |
| if (termEnum == null) { |
| termEnum = tis.terms(t0); |
| } else { |
| tis.seekEnum(termEnum, t0); |
| } |
| + |
| + surrogateSeekUpto = 0; |
| + surrogatesDance(); |
| + |
| final Term t = termEnum.term(); |
| |
| final BytesRef tr; |
| @@ -304,6 +519,9 @@ |
| |
| @Override |
| public BytesRef next() throws IOException { |
| + if (DEBUG_SURROGATES) { |
| + System.out.println("TE.next() skipNext=" + skipNext); |
| + } |
| if (skipNext) { |
| skipNext = false; |
| if (termEnum.term() == null) { |
| @@ -313,19 +531,37 @@ |
| return current = scratchBytesRef; |
| } |
| } |
| - if (termEnum.next()) { |
| + if (termEnum.next() && termEnum.term().field() == fieldInfo.name) { |
| + newSuffixStart = termEnum.newSuffixStart; |
| + if (DEBUG_SURROGATES) { |
| + System.out.println(" set newSuffixStart=" + newSuffixStart); |
| + } |
| + surrogatesDance(); |
| final Term t = termEnum.term(); |
| - if (t.field() == fieldInfo.name) { |
| + if (t == null || t.field() != fieldInfo.name) { |
| + assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned |
| + current = null; |
| + } else { |
| scratchBytesRef.copy(t.text()); |
| current = scratchBytesRef; |
| + } |
| + return current; |
| + } else { |
| + if (DEBUG_SURROGATES) { |
| + System.out.println(" force pop"); |
| + } |
| + // force pop |
| + newSuffixStart = -1; |
| + surrogatesDance(); |
| + final Term t = termEnum.term(); |
| + if (t == null || t.field() != fieldInfo.name) { |
| + assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned |
| + return null; |
| + } else { |
| + scratchBytesRef.copy(t.text()); |
| + current = scratchBytesRef; |
| return current; |
| - } else { |
| - assert !t.field().equals(fieldInfo.name); // make sure field name is interned |
| - // Crossed into new field |
| - return null; |
| } |
| - } else { |
| - return null; |
| } |
| } |
| |
| Index: lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java (working copy) |
| @@ -67,7 +67,7 @@ |
| |
| success = false; |
| try { |
| - FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator()); |
| + FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); |
| success = true; |
| return ret; |
| } finally { |
| @@ -95,7 +95,7 @@ |
| state.fieldInfos, |
| state.segmentInfo.name, |
| state.termsIndexDivisor, |
| - BytesRef.getUTF8SortedAsUTF16Comparator()); |
| + BytesRef.getUTF8SortedAsUnicodeComparator()); |
| success = true; |
| } finally { |
| if (!success) { |
| @@ -111,7 +111,7 @@ |
| state.segmentInfo.name, |
| postingsReader, |
| state.readBufferSize, |
| - BytesRef.getUTF8SortedAsUTF16Comparator(), |
| + BytesRef.getUTF8SortedAsUnicodeComparator(), |
| StandardCodec.TERMS_CACHE_SIZE); |
| success = true; |
| return ret; |
| Index: lucene/src/java/org/apache/lucene/util/automaton/Transition.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/util/automaton/Transition.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/util/automaton/Transition.java (working copy) |
| @@ -210,64 +210,4 @@ |
| } |
| |
| public static final Comparator<Transition> CompareByMinMaxThenDest = new CompareByMinMaxThenDestSingle(); |
| - |
| - private static class UTF8InUTF16Order { |
| - protected int compareCodePoint(int aByte, int bByte) { |
| - if (aByte != bByte) { |
| - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order |
| - |
| - // We know the terms are not equal, but, we may |
| - // have to carefully fixup the bytes at the |
| - // difference to match UTF16's sort order: |
| - if (aByte >= 0xee && bByte >= 0xee) { |
| - if ((aByte & 0xfe) == 0xee) { |
| - aByte += 0x10; |
| - } |
| - if ((bByte&0xfe) == 0xee) { |
| - bByte += 0x10; |
| - } |
| - } |
| - return aByte - bByte; |
| - } |
| - return 0; |
| - } |
| - } |
| - |
| - private static final class CompareByDestThenMinMaxUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> { |
| - public int compare(Transition t1, Transition t2) { |
| - if (t1.to != t2.to) { |
| - if (t1.to == null) return -1; |
| - else if (t2.to == null) return 1; |
| - else if (t1.to.number < t2.to.number) return -1; |
| - else if (t1.to.number > t2.to.number) return 1; |
| - } |
| - int minComp = compareCodePoint(t1.min, t2.min); |
| - if (minComp != 0) return minComp; |
| - int maxComp = compareCodePoint(t1.max, t2.max); |
| - if (maxComp != 0) return maxComp; |
| - return 0; |
| - } |
| - } |
| - |
| - public static final Comparator<Transition> CompareByDestThenMinMaxUTF8InUTF16Order = new CompareByDestThenMinMaxUTF8InUTF16OrderSingle(); |
| - |
| - private static final class CompareByMinMaxThenDestUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> { |
| - public int compare(Transition t1, Transition t2) { |
| - int minComp = compareCodePoint(t1.min, t2.min); |
| - if (minComp != 0) return minComp; |
| - int maxComp = compareCodePoint(t1.max, t2.max); |
| - if (maxComp != 0) return maxComp; |
| - if (t1.to != t2.to) { |
| - if (t1.to == null) return -1; |
| - else if (t2.to == null) return 1; |
| - else if (t1.to.number < t2.to.number) return -1; |
| - else if (t1.to.number > t2.to.number) return 1; |
| - } |
| - return 0; |
| - } |
| - } |
| - |
| - public static final Comparator<Transition> CompareByMinMaxThenDestUTF8InUTF16Order = new CompareByMinMaxThenDestUTF8InUTF16OrderSingle(); |
| - |
| - |
| } |
| Index: lucene/src/java/org/apache/lucene/util/ArrayUtil.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/util/ArrayUtil.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/util/ArrayUtil.java (working copy) |
| @@ -327,6 +327,29 @@ |
| return array; |
| } |
| |
| + public static boolean[] grow(boolean[] array, int minSize) { |
| + if (array.length < minSize) { |
| + boolean[] newArray = new boolean[oversize(minSize, 1)]; |
| + System.arraycopy(array, 0, newArray, 0, array.length); |
| + return newArray; |
| + } else |
| + return array; |
| + } |
| + |
| + public static boolean[] grow(boolean[] array) { |
| + return grow(array, 1 + array.length); |
| + } |
| + |
| + public static boolean[] shrink(boolean[] array, int targetSize) { |
| + final int newSize = getShrinkSize(array.length, targetSize, 1); |
| + if (newSize != array.length) { |
| + boolean[] newArray = new boolean[newSize]; |
| + System.arraycopy(array, 0, newArray, 0, newSize); |
| + return newArray; |
| + } else |
| + return array; |
| + } |
| + |
| public static char[] grow(char[] array, int minSize) { |
| if (array.length < minSize) { |
| char[] newArray = new char[oversize(minSize, RamUsageEstimator.NUM_BYTES_CHAR)]; |
| Index: lucene/src/java/org/apache/lucene/util/BytesRef.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/util/BytesRef.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/util/BytesRef.java (working copy) |
| @@ -217,14 +217,7 @@ |
| bytes = ArrayUtil.grow(bytes, newLength); |
| } |
| |
| - private final static Comparator<BytesRef> utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator(); |
| - |
| - public static Comparator<BytesRef> getUTF8SortedAsUTF16Comparator() { |
| - return utf8SortedAsUTF16SortOrder; |
| - } |
| - |
| /** Unsigned byte order comparison */ |
| - /* |
| public int compareTo(BytesRef other) { |
| if (this == other) return 0; |
| |
| @@ -245,52 +238,18 @@ |
| // One is a prefix of the other, or, they are equal: |
| return this.length - other.length; |
| } |
| - */ |
| |
| - /** Lucene default index order. Currently the same as String.compareTo() (UTF16) but will change |
| - * in the future to unsigned byte comparison. */ |
| - public int compareTo(BytesRef other) { |
| - if (this == other) return 0; |
| + private final static Comparator<BytesRef> utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator(); |
| |
| - final byte[] aBytes = this.bytes; |
| - int aUpto = this.offset; |
| - final byte[] bBytes = other.bytes; |
| - int bUpto = other.offset; |
| - |
| - final int aStop = aUpto + Math.min(this.length, other.length); |
| - |
| - while(aUpto < aStop) { |
| - int aByte = aBytes[aUpto++] & 0xff; |
| - int bByte = bBytes[bUpto++] & 0xff; |
| - if (aByte != bByte) { |
| - |
| - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order |
| - |
| - // We know the terms are not equal, but, we may |
| - // have to carefully fixup the bytes at the |
| - // difference to match UTF16's sort order: |
| - if (aByte >= 0xee && bByte >= 0xee) { |
| - if ((aByte & 0xfe) == 0xee) { |
| - aByte += 0x10; |
| - } |
| - if ((bByte&0xfe) == 0xee) { |
| - bByte += 0x10; |
| - } |
| - } |
| - return aByte - bByte; |
| - } |
| - } |
| - |
| - // One is a prefix of the other, or, they are equal: |
| - return this.length - other.length; |
| + public static Comparator<BytesRef> getUTF8SortedAsUnicodeComparator() { |
| + return utf8SortedAsUnicodeSortOrder; |
| } |
| |
| - private static class UTF8SortedAsUTF16Comparator implements Comparator<BytesRef> { |
| + private static class UTF8SortedAsUnicodeComparator implements Comparator<BytesRef> { |
| // Only singleton |
| - private UTF8SortedAsUTF16Comparator() {}; |
| + private UTF8SortedAsUnicodeComparator() {}; |
| |
| public int compare(BytesRef a, BytesRef b) { |
| - |
| final byte[] aBytes = a.bytes; |
| int aUpto = a.offset; |
| final byte[] bBytes = b.bytes; |
| @@ -307,32 +266,15 @@ |
| int aByte = aBytes[aUpto++] & 0xff; |
| int bByte = bBytes[bUpto++] & 0xff; |
| |
| - if (aByte != bByte) { |
| - |
| - // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order |
| - |
| - // We know the terms are not equal, but, we may |
| - // have to carefully fixup the bytes at the |
| - // difference to match UTF16's sort order: |
| - if (aByte >= 0xee && bByte >= 0xee) { |
| - if ((aByte & 0xfe) == 0xee) { |
| - aByte += 0x10; |
| - } |
| - if ((bByte&0xfe) == 0xee) { |
| - bByte += 0x10; |
| - } |
| - } |
| - return aByte - bByte; |
| + int diff = aByte - bByte; |
| + if (diff != 0) { |
| + return diff; |
| } |
| } |
| |
| // One is a prefix of the other, or, they are equal: |
| return a.length - b.length; |
| - } |
| - |
| - public boolean equals(Object other) { |
| - return this == other; |
| - } |
| + } |
| } |
| |
| public void writeExternal(ObjectOutput out) |
| Index: lucene/src/java/org/apache/lucene/util/UnicodeUtil.java |
| =================================================================== |
| --- lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (revision 956375) |
| +++ lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (working copy) |
| @@ -358,7 +358,6 @@ |
| out[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START); |
| } |
| } |
| - |
| offsets[upto] = outUpto; |
| result.length = outUpto; |
| } |
| @@ -483,7 +482,7 @@ |
| } |
| } |
| */ |
| - public static final boolean validUTF16String(CharSequence s) { |
| + public static boolean validUTF16String(CharSequence s) { |
| final int size = s.length(); |
| for(int i=0;i<size;i++) { |
| char ch = s.charAt(i); |
| @@ -507,7 +506,7 @@ |
| return true; |
| } |
| |
| - public static final boolean validUTF16String(char[] s, int size) { |
| + public static boolean validUTF16String(char[] s, int size) { |
| for(int i=0;i<size;i++) { |
| char ch = s[i]; |
| if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { |
| @@ -559,7 +558,7 @@ |
| /** Returns the number of code points in this utf8 |
| * sequence. Behavior is undefined if the utf8 sequence |
| * is invalid.*/ |
| - public static final int codePointCount(BytesRef utf8) { |
| + public static int codePointCount(BytesRef utf8) { |
| int upto = utf8.offset; |
| final int limit = utf8.offset + utf8.length; |
| final byte[] bytes = utf8.bytes; |
| @@ -673,4 +672,33 @@ |
| } |
| return new String(chars, 0, w); |
| } |
| + |
| + // for debugging |
| + public static String toHexString(String s) { |
| + StringBuilder sb = new StringBuilder(); |
| + for(int i=0;i<s.length();i++) { |
| + char ch = s.charAt(i); |
| + if (i > 0) { |
| + sb.append(' '); |
| + } |
| + if (ch < 128) { |
| + sb.append(ch); |
| + } else { |
| + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { |
| + sb.append("H:"); |
| + } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { |
| + sb.append("L:"); |
| + } else if (ch > UNI_SUR_LOW_END) { |
| + if (ch == 0xffff) { |
| + sb.append("F:"); |
| + } else { |
| + sb.append("E:"); |
| + } |
| + } |
| + |
| + sb.append("0x" + Integer.toHexString(ch)); |
| + } |
| + } |
| + return sb.toString(); |
| + } |
| } |
| Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java |
| =================================================================== |
| --- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (revision 956375) |
| +++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (working copy) |
| @@ -426,7 +426,7 @@ |
| |
| @Override |
| public Comparator<BytesRef> getComparator() { |
| - return BytesRef.getUTF8SortedAsUTF16Comparator(); |
| + return BytesRef.getUTF8SortedAsUnicodeComparator(); |
| } |
| }; |
| } |
| Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java |
| =================================================================== |
| --- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (revision 956375) |
| +++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (working copy) |
| @@ -123,7 +123,7 @@ |
| |
| @Override |
| public Comparator<BytesRef> getComparator() { |
| - return BytesRef.getUTF8SortedAsUTF16Comparator(); |
| + return BytesRef.getUTF8SortedAsUnicodeComparator(); |
| } |
| } |
| |
| Index: lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java |
| =================================================================== |
| --- lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision 956375) |
| +++ lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (working copy) |
| @@ -808,7 +808,7 @@ |
| |
| @Override |
| public Comparator<BytesRef> getComparator() { |
| - return BytesRef.getUTF8SortedAsUTF16Comparator(); |
| + return BytesRef.getUTF8SortedAsUnicodeComparator(); |
| } |
| |
| @Override |
| @@ -903,7 +903,7 @@ |
| |
| @Override |
| public Comparator<BytesRef> getComparator() { |
| - return BytesRef.getUTF8SortedAsUTF16Comparator(); |
| + return BytesRef.getUTF8SortedAsUnicodeComparator(); |
| } |
| } |
| |