blob: 26c420c4a3bb90a65dfb33d90c3e84102e52ac83 [file] [log] [blame]
Index: lucene/src/test/org/apache/lucene/TestExternalCodecs.java
===================================================================
--- lucene/src/test/org/apache/lucene/TestExternalCodecs.java (revision 956375)
+++ lucene/src/test/org/apache/lucene/TestExternalCodecs.java (working copy)
@@ -179,7 +179,7 @@
@Override
public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
@@ -263,7 +263,7 @@
@Override
public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
Index: lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (revision 956375)
+++ lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (working copy)
@@ -4621,38 +4621,22 @@
private void checkTermsOrder(IndexReader r, Set<String> allTerms, boolean isTop) throws IOException {
TermsEnum terms = MultiFields.getFields(r).terms("f").iterator();
- char[] last = new char[2];
- int lastLength = 0;
+ BytesRef last = new BytesRef();
Set<String> seenTerms = new HashSet<String>();
- UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
while(true) {
final BytesRef term = terms.next();
if (term == null) {
break;
}
- UnicodeUtil.UTF8toUTF16(term.bytes, term.offset, term.length, utf16);
- assertTrue(utf16.length <= 2);
- // Make sure last term comes before current one, in
- // UTF16 sort order
- int i = 0;
- for(i=0;i<lastLength && i<utf16.length;i++) {
- assertTrue("UTF16 code unit " + termDesc(new String(utf16.result, 0, utf16.length)) + " incorrectly sorted after code unit " + termDesc(new String(last, 0, lastLength)), last[i] <= utf16.result[i]);
- if (last[i] < utf16.result[i]) {
- break;
- }
- }
- // Terms should not have been identical
- assertTrue(lastLength != utf16.length || i < lastLength);
+ assertTrue(last.compareTo(term) < 0);
+ last.copy(term);
- final String s = new String(utf16.result, 0, utf16.length);
+ final String s = term.utf8ToString();
assertTrue("term " + termDesc(s) + " was not added to index (count=" + allTerms.size() + ")", allTerms.contains(s));
seenTerms.add(s);
-
- System.arraycopy(utf16.result, 0, last, 0, utf16.length);
- lastLength = utf16.length;
}
if (isTop) {
Index: lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java (revision 956375)
+++ lucene/src/test/org/apache/lucene/index/codecs/intblock/TestIntBlockCodec.java (working copy)
@@ -1,5 +1,22 @@
package org.apache.lucene.index.codecs.intblock;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.store.*;
import org.apache.lucene.index.codecs.sep.*;
@@ -34,7 +51,7 @@
out.close();
IntIndexInput in = new SimpleIntBlockIndexInput(dir, "test", 128);
- IntIndexInput.Reader r = in.reader();
+ in.reader();
// read no ints
in.close();
dir.close();
Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (revision 0)
+++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java (revision 0)
@@ -0,0 +1,227 @@
+package org.apache.lucene.index.codecs.preflex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+import org.apache.lucene.store.*;
+import org.apache.lucene.index.*;
+import org.apache.lucene.util.*;
+
+
+/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
+ Directory. A TermInfos can be written once, in order. */
+
+final class TermInfosWriter {
+ /** The file format version, a negative number. */
+ public static final int FORMAT = -3;
+
+ // Changed strings to true utf8 with length-in-bytes not
+ // length-in-chars
+ public static final int FORMAT_VERSION_UTF8_LENGTH_IN_BYTES = -4;
+
+ // NOTE: always change this if you switch to a new format!
+ public static final int FORMAT_CURRENT = FORMAT_VERSION_UTF8_LENGTH_IN_BYTES;
+
+ private FieldInfos fieldInfos;
+ private IndexOutput output;
+ private TermInfo lastTi = new TermInfo();
+ private long size;
+
+ // TODO: the default values for these two parameters should be settable from
+ // IndexWriter. However, once that's done, folks will start setting them to
+ // ridiculous values and complaining that things don't work well, as with
+ // mergeFactor. So, let's wait until a number of folks find that alternate
+ // values work better. Note that both of these values are stored in the
+ // segment, so that it's safe to change these w/o rebuilding all indexes.
+
+ /** Expert: The fraction of terms in the "dictionary" which should be stored
+ * in RAM. Smaller values use more memory, but make searching slightly
+ * faster, while larger values use less memory and make searching slightly
+ * slower. Searching is typically not dominated by dictionary lookup, so
+ * tweaking this is rarely useful.*/
+ int indexInterval = 128;
+
+ /** Expert: The fraction of {@link TermDocs} entries stored in skip tables,
+ * used to accelerate {@link TermDocs#skipTo(int)}. Larger values result in
+ * smaller indexes, greater acceleration, but fewer accelerable cases, while
+ * smaller values result in bigger indexes, less acceleration and more
+ * accelerable cases. More detailed experiments would be useful here. */
+ int skipInterval = 16;
+
+ /** Expert: The maximum number of skip levels. Smaller values result in
+ * slightly smaller indexes, but slower skipping in big posting lists.
+ */
+ int maxSkipLevels = 10;
+
+ private long lastIndexPointer;
+ private boolean isIndex;
+ private byte[] lastTermBytes = new byte[10];
+ private int lastTermBytesLength = 0;
+ private int lastFieldNumber = -1;
+
+ private TermInfosWriter other;
+ private BytesRef utf8Result = new BytesRef(10);
+
+ TermInfosWriter(Directory directory, String segment, FieldInfos fis,
+ int interval)
+ throws IOException {
+ initialize(directory, segment, fis, interval, false);
+ other = new TermInfosWriter(directory, segment, fis, interval, true);
+ other.other = this;
+ }
+
+ private TermInfosWriter(Directory directory, String segment, FieldInfos fis,
+ int interval, boolean isIndex) throws IOException {
+ initialize(directory, segment, fis, interval, isIndex);
+ }
+
+ private void initialize(Directory directory, String segment, FieldInfos fis,
+ int interval, boolean isi) throws IOException {
+ indexInterval = interval;
+ fieldInfos = fis;
+ isIndex = isi;
+ output = directory.createOutput(segment + (isIndex ? ".tii" : ".tis"));
+ output.writeInt(FORMAT_CURRENT); // write format
+ output.writeLong(0); // leave space for size
+ output.writeInt(indexInterval); // write indexInterval
+ output.writeInt(skipInterval); // write skipInterval
+ output.writeInt(maxSkipLevels); // write maxSkipLevels
+ assert initUTF16Results();
+ }
+
+ void add(Term term, TermInfo ti) throws IOException {
+ UnicodeUtil.UTF16toUTF8(term.text(), 0, term.text().length(), utf8Result);
+ add(fieldInfos.fieldNumber(term.field()), utf8Result.bytes, utf8Result.length, ti);
+ }
+
+ // Currently used only by assert statements
+ UnicodeUtil.UTF16Result utf16Result1;
+ UnicodeUtil.UTF16Result utf16Result2;
+
+ // Currently used only by assert statements
+ private boolean initUTF16Results() {
+ utf16Result1 = new UnicodeUtil.UTF16Result();
+ utf16Result2 = new UnicodeUtil.UTF16Result();
+ return true;
+ }
+
+ // Currently used only by assert statement
+ private int compareToLastTerm(int fieldNumber, byte[] termBytes, int termBytesLength) {
+
+ if (lastFieldNumber != fieldNumber) {
+ final int cmp = fieldInfos.fieldName(lastFieldNumber).compareTo(fieldInfos.fieldName(fieldNumber));
+ // If there is a field named "" (empty string) then we
+ // will get 0 on this comparison, yet, it's "OK". But
+ // it's not OK if two different field numbers map to
+ // the same name.
+ if (cmp != 0 || lastFieldNumber != -1)
+ return cmp;
+ }
+
+ UnicodeUtil.UTF8toUTF16(lastTermBytes, 0, lastTermBytesLength, utf16Result1);
+ UnicodeUtil.UTF8toUTF16(termBytes, 0, termBytesLength, utf16Result2);
+ final int len;
+ if (utf16Result1.length < utf16Result2.length)
+ len = utf16Result1.length;
+ else
+ len = utf16Result2.length;
+
+ for(int i=0;i<len;i++) {
+ final char ch1 = utf16Result1.result[i];
+ final char ch2 = utf16Result2.result[i];
+ if (ch1 != ch2)
+ return ch1-ch2;
+ }
+ return utf16Result1.length - utf16Result2.length;
+ }
+
+ /** Adds a new <<fieldNumber, termBytes>, TermInfo> pair to the set.
+ Term must be lexicographically greater than all previous Terms added.
+ TermInfo pointers must be positive and greater than all previous.*/
+ void add(int fieldNumber, byte[] termBytes, int termBytesLength, TermInfo ti)
+ throws IOException {
+
+ assert compareToLastTerm(fieldNumber, termBytes, termBytesLength) < 0 ||
+ (isIndex && termBytesLength == 0 && lastTermBytesLength == 0) :
+ "Terms are out of order: field=" + fieldInfos.fieldName(fieldNumber) + " (number " + fieldNumber + ")" +
+ " lastField=" + fieldInfos.fieldName(lastFieldNumber) + " (number " + lastFieldNumber + ")" +
+ " text=" + new String(termBytes, 0, termBytesLength, "UTF-8") + " lastText=" + new String(lastTermBytes, 0, lastTermBytesLength, "UTF-8");
+
+ assert ti.freqPointer >= lastTi.freqPointer: "freqPointer out of order (" + ti.freqPointer + " < " + lastTi.freqPointer + ")";
+ assert ti.proxPointer >= lastTi.proxPointer: "proxPointer out of order (" + ti.proxPointer + " < " + lastTi.proxPointer + ")";
+
+ if (!isIndex && size % indexInterval == 0)
+ other.add(lastFieldNumber, lastTermBytes, lastTermBytesLength, lastTi); // add an index term
+
+ writeTerm(fieldNumber, termBytes, termBytesLength); // write term
+
+ output.writeVInt(ti.docFreq); // write doc freq
+ output.writeVLong(ti.freqPointer - lastTi.freqPointer); // write pointers
+ output.writeVLong(ti.proxPointer - lastTi.proxPointer);
+
+ if (ti.docFreq >= skipInterval) {
+ output.writeVInt(ti.skipOffset);
+ }
+
+ if (isIndex) {
+ output.writeVLong(other.output.getFilePointer() - lastIndexPointer);
+ lastIndexPointer = other.output.getFilePointer(); // write pointer
+ }
+
+ lastFieldNumber = fieldNumber;
+ lastTi.set(ti);
+ size++;
+ }
+
+ private void writeTerm(int fieldNumber, byte[] termBytes, int termBytesLength)
+ throws IOException {
+
+ // TODO: UTF16toUTF8 could tell us this prefix
+ // Compute prefix in common with last term:
+ int start = 0;
+ final int limit = termBytesLength < lastTermBytesLength ? termBytesLength : lastTermBytesLength;
+ while(start < limit) {
+ if (termBytes[start] != lastTermBytes[start])
+ break;
+ start++;
+ }
+
+ final int length = termBytesLength - start;
+ output.writeVInt(start); // write shared prefix length
+ output.writeVInt(length); // write delta length
+ output.writeBytes(termBytes, start, length); // write delta bytes
+ output.writeVInt(fieldNumber); // write field num
+ if (lastTermBytes.length < termBytesLength) {
+ lastTermBytes = ArrayUtil.grow(lastTermBytes, termBytesLength);
+ }
+ System.arraycopy(termBytes, start, lastTermBytes, start, length);
+ lastTermBytesLength = termBytesLength;
+ }
+
+ /** Called to complete TermInfos creation. */
+ void close() throws IOException {
+ output.seek(4); // write size after format
+ output.writeLong(size);
+ output.close();
+
+ if (!isIndex)
+ other.close();
+ }
+
+}
Property changes on: lucene/src/test/org/apache/lucene/index/codecs/preflex/TermInfosWriter.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java
===================================================================
--- lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 0)
+++ lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java (revision 0)
@@ -0,0 +1,206 @@
+package org.apache.lucene.index.codecs.preflex;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.store.*;
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.codecs.*;
+import org.apache.lucene.util.*;
+
+import java.util.*;
+import java.io.IOException;
+
+public class TestSurrogates extends LuceneTestCase {
+
+ private static final boolean DEBUG = false;
+
+ // like Term, but uses BytesRef for text
+ private static class FieldAndText implements Comparable<FieldAndText> {
+ String field;
+ BytesRef text;
+
+ public FieldAndText(Term t) {
+ field = t.field();
+ text = new BytesRef(t.text());
+ }
+
+ public int compareTo(FieldAndText other) {
+ if (other.field == field) {
+ return text.compareTo(other.text);
+ } else {
+ return field.compareTo(other.field);
+ }
+ }
+ }
+
+ // chooses from a very limited alphabet to exacerbate the
+ // surrogate seeking required
+ private static String makeDifficultRandomUnicodeString(Random r) {
+ final int end = r.nextInt(20);
+ if (end == 0) {
+ // allow 0 length
+ return "";
+ }
+ final char[] buffer = new char[end];
+ for (int i = 0; i < end; i++) {
+ int t = r.nextInt(5);
+
+ if (0 == t && i < end - 1) {
+ // hi
+ buffer[i++] = (char) 0xd800;
+ // lo
+ buffer[i] = (char) 0xdc00;
+ } else if (t <= 3) {
+ buffer[i] = 'a';
+ } else if (4 == t) {
+ buffer[i] = 0xe000;
+ }
+ }
+
+ return new String(buffer, 0, end);
+ }
+
+ private SegmentInfo makePreFlexSegment(Random r, String segName, Directory dir, FieldInfos fieldInfos, Codec codec, List<FieldAndText> fieldTerms) throws IOException {
+
+ final int numField = _TestUtil.nextInt(r, 2, 5);
+
+ List<Term> terms = new ArrayList<Term>();
+
+ int tc = 0;
+
+ for(int f=0;f<numField;f++) {
+ String field = "f" + f;
+ Term protoTerm = new Term(field);
+
+ fieldInfos.add(field, true, false, false, false, false, false, false);
+ final int numTerms = 1000*_TestUtil.getRandomMultiplier();
+ for(int i=0;i<numTerms;i++) {
+ String s;
+ if (r.nextInt(3) == 1) {
+ s = makeDifficultRandomUnicodeString(r);
+ } else {
+ s = _TestUtil.randomUnicodeString(r);
+ }
+ terms.add(protoTerm.createTerm(s + "_" + (tc++)));
+ }
+ }
+
+ fieldInfos.write(dir, segName);
+
+ // sorts in UTF16 order, just like preflex:
+ Collections.sort(terms);
+
+ TermInfosWriter w = new TermInfosWriter(dir, segName, fieldInfos, 128);
+ TermInfo ti = new TermInfo();
+ BytesRef utf8 = new BytesRef(10);
+ String lastText = null;
+ int uniqueTermCount = 0;
+ if (DEBUG) {
+ System.out.println("TEST: utf16 order:");
+ }
+ for(Term t : terms) {
+ FieldInfo fi = fieldInfos.fieldInfo(t.field());
+
+ String text = t.text();
+ if (lastText != null && lastText.equals(text)) {
+ continue;
+ }
+ fieldTerms.add(new FieldAndText(t));
+ uniqueTermCount++;
+ lastText = text;
+ UnicodeUtil.UTF16toUTF8(text, 0, text.length(), utf8);
+
+ if (DEBUG) {
+ System.out.println(" " + toHexString(t));
+ }
+ w.add(fi.number, utf8.bytes, utf8.length, ti);
+ }
+ w.close();
+
+ Collections.sort(fieldTerms);
+ if (DEBUG) {
+ System.out.println("\nTEST: codepoint order");
+ for(FieldAndText t: fieldTerms) {
+ System.out.println(" " + t.field + ":" + UnicodeUtil.toHexString(t.text.utf8ToString()));
+ }
+ }
+
+ dir.createOutput(segName + ".prx").close();
+ dir.createOutput(segName + ".frq").close();
+
+ // !!hack alert!! stuffing uniqueTermCount in as docCount
+ return new SegmentInfo(segName, uniqueTermCount, dir, false, -1, null, false, true, codec);
+ }
+
+ private String toHexString(Term t) {
+ return t.field() + ":" + UnicodeUtil.toHexString(t.text());
+ }
+
+ public void testSurrogatesOrder() throws Exception {
+ Directory dir = new MockRAMDirectory();
+
+ Codec codec = new PreFlexCodec();
+
+ Random r = newRandom();
+ FieldInfos fieldInfos = new FieldInfos();
+ List<FieldAndText> fieldTerms = new ArrayList<FieldAndText>();
+ SegmentInfo si = makePreFlexSegment(r, "_0", dir, fieldInfos, codec, fieldTerms);
+
+ // hack alert!!
+ int uniqueTermCount = si.docCount;
+
+ FieldsProducer fields = codec.fieldsProducer(new SegmentReadState(dir, si, fieldInfos, 1024, 1));
+ assertNotNull(fields);
+
+ if (DEBUG) {
+ System.out.println("\nTEST: now enum");
+ }
+ FieldsEnum fieldsEnum = fields.iterator();
+ String field;
+ UnicodeUtil.UTF16Result utf16 = new UnicodeUtil.UTF16Result();
+
+ int termCount = 0;
+ while((field = fieldsEnum.next()) != null) {
+ TermsEnum termsEnum = fieldsEnum.terms();
+ BytesRef text;
+ BytesRef lastText = null;
+ while((text = termsEnum.next()) != null) {
+ UnicodeUtil.UTF8toUTF16(text.bytes, text.offset, text.length, utf16);
+ if (DEBUG) {
+ System.out.println("got term=" + field + ":" + UnicodeUtil.toHexString(new String(utf16.result, 0, utf16.length)));
+ System.out.println();
+ }
+ if (lastText == null) {
+ lastText = new BytesRef(text);
+ } else {
+ assertTrue(lastText.compareTo(text) < 0);
+ lastText.copy(text);
+ }
+ assertEquals(fieldTerms.get(termCount).field, field);
+ assertEquals(fieldTerms.get(termCount).text, text);
+ termCount++;
+ }
+ if (DEBUG) {
+ System.out.println(" no more terms for field=" + field);
+ }
+ }
+ assertEquals(uniqueTermCount, termCount);
+
+ fields.close();
+ }
+}
Property changes on: lucene/src/test/org/apache/lucene/index/codecs/preflex/TestSurrogates.java
___________________________________________________________________
Added: svn:eol-style
+ native
Index: lucene/src/test/org/apache/lucene/util/_TestUtil.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/_TestUtil.java (revision 956375)
+++ lucene/src/test/org/apache/lucene/util/_TestUtil.java (working copy)
@@ -141,7 +141,7 @@
else if (t <= 1) buffer[i] = (char) r.nextInt(0x80);
else if (2 == t) buffer[i] = (char) nextInt(r, 0x80, 0x800);
else if (3 == t) buffer[i] = (char) nextInt(r, 0x800, 0xd7ff);
- else if (4 == t) buffer[i] = (char) nextInt(r, 0xe000, 0xffff);
+ else if (4 == t) buffer[i] = (char) nextInt(r, 0xe000, 0xfffe);
}
return new String(buffer, 0, end);
}
Index: lucene/src/test/org/apache/lucene/util/TestNumericUtils.java
===================================================================
--- lucene/src/test/org/apache/lucene/util/TestNumericUtils.java (revision 956375)
+++ lucene/src/test/org/apache/lucene/util/TestNumericUtils.java (working copy)
@@ -30,7 +30,7 @@
NumericUtils.longToPrefixCoded(l, 0, act);
if (last!=null) {
// test if smaller
- assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 );
+ assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 );
assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 );
}
// test is back and forward conversion works
@@ -48,7 +48,7 @@
NumericUtils.intToPrefixCoded(i, 0, act);
if (last!=null) {
// test if smaller
- assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUTF16Comparator().compare(last, act) < 0 );
+ assertTrue("actual bigger than last (BytesRef)", BytesRef.getUTF8SortedAsUnicodeComparator().compare(last, act) < 0 );
assertTrue("actual bigger than last (as String)", last.utf8ToString().compareTo(act.utf8ToString()) < 0 );
}
// test is back and forward conversion works
@@ -84,7 +84,7 @@
// check sort order (prefixVals should be ascending)
for (int i=1; i<prefixVals.length; i++) {
- assertTrue( "check sort order", BytesRef.getUTF8SortedAsUTF16Comparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
+ assertTrue( "check sort order", BytesRef.getUTF8SortedAsUnicodeComparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
}
// check the prefix encoding, lower precision should have the difference to original value equal to the lower removed bits
@@ -124,7 +124,7 @@
// check sort order (prefixVals should be ascending)
for (int i=1; i<prefixVals.length; i++) {
- assertTrue( "check sort order", BytesRef.getUTF8SortedAsUTF16Comparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
+ assertTrue( "check sort order", BytesRef.getUTF8SortedAsUnicodeComparator().compare(prefixVals[i-1], prefixVals[i] ) < 0 );
}
// check the prefix encoding, lower precision should have the difference to original value equal to the lower removed bits
Index: lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java
===================================================================
--- lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/search/AutomatonTermsEnum.java (working copy)
@@ -103,7 +103,7 @@
// build a cache of sorted transitions for every state
allTransitions = new Transition[runAutomaton.getSize()][];
for (State state : this.automaton.getNumberedStates()) {
- state.sortTransitions(Transition.CompareByMinMaxThenDestUTF8InUTF16Order);
+ state.sortTransitions(Transition.CompareByMinMaxThenDest);
state.trimTransitionsArray();
allTransitions[state.getNumber()] = state.transitionsArray;
}
@@ -158,11 +158,7 @@
// seek to the next possible string;
if (nextString()) {
// reposition
-
- // FIXME: this is really bad to turn off
- // but it cannot work correctly until terms are in utf8 order.
- linear = false;
-
+
if (linear)
setLinear(infinitePosition);
return seekBytesRef;
@@ -188,15 +184,15 @@
}
for (int i = 0; i < allTransitions[state].length; i++) {
Transition t = allTransitions[state][i];
- if (compareToUTF16(t.getMin(), (seekBytesRef.bytes[position] & 0xff)) <= 0 &&
- compareToUTF16((seekBytesRef.bytes[position] & 0xff), t.getMax()) <= 0) {
+ if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff) &&
+ (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) {
maxInterval = t.getMax();
break;
}
}
- // 0xef terms don't get the optimization... not worth the trouble.
- if (maxInterval != 0xef)
- maxInterval = incrementUTF16(maxInterval);
+ // 0xff terms don't get the optimization... not worth the trouble.
+ if (maxInterval != 0xff)
+ maxInterval = incrementUTF8(maxInterval);
int length = position + 1; /* position + maxTransition */
if (linearUpperBound.bytes.length < length)
linearUpperBound.bytes = new byte[length];
@@ -281,7 +277,7 @@
// if the next character is U+FFFF and is not part of the useful portion,
// then by definition it puts us in a reject state, and therefore this
// path is dead. there cannot be any higher transitions. backtrack.
- c = incrementUTF16(c);
+ c = incrementUTF8(c);
if (c == -1)
return false;
}
@@ -295,8 +291,8 @@
for (int i = 0; i < transitions.length; i++) {
Transition transition = transitions[i];
- if (compareToUTF16(transition.getMax(), c) >= 0) {
- int nextChar = compareToUTF16(c, transition.getMin()) > 0 ? c : transition.getMin();
+ if (transition.getMax() >= c) {
+ int nextChar = Math.max(c, transition.getMin());
// append either the next sequential char, or the minimum transition
seekBytesRef.grow(seekBytesRef.length + 1);
seekBytesRef.length++;
@@ -342,9 +338,9 @@
private boolean backtrack(int position) {
while (position > 0) {
int nextChar = seekBytesRef.bytes[position - 1] & 0xff;
- // if a character is 0xef its a dead-end too,
- // because there is no higher character in UTF-16 sort order.
- nextChar = incrementUTF16(nextChar);
+ // if a character is 0xff it's a dead-end too,
+ // because there is no higher character in UTF-8 sort order.
+ nextChar = incrementUTF8(nextChar);
if (nextChar != -1) {
seekBytesRef.bytes[position - 1] = (byte) nextChar;
seekBytesRef.length = position;
@@ -355,34 +351,11 @@
return false; /* all solutions exhausted */
}
- /* return the next utf8 byte in utf16 order, or -1 if exhausted */
- private final int incrementUTF16(int utf8) {
+ /* return the next utf8 byte in utf8 order, or -1 if exhausted */
+ private final int incrementUTF8(int utf8) {
switch(utf8) {
- case 0xed: return 0xf0;
- case 0xfd: return 0xee;
- case 0xee: return 0xef;
- case 0xef: return -1;
+ case 0xff: return -1;
default: return utf8 + 1;
}
}
-
- int compareToUTF16(int aByte, int bByte) {
- if (aByte != bByte) {
- // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
- // We know the terms are not equal, but, we may
- // have to carefully fixup the bytes at the
- // difference to match UTF16's sort order:
- if (aByte >= 0xee && bByte >= 0xee) {
- if ((aByte & 0xfe) == 0xee) {
- aByte += 0x10;
- }
- if ((bByte&0xfe) == 0xee) {
- bByte += 0x10;
- }
- }
- return aByte - bByte;
- }
- return 0;
- }
}
Index: lucene/src/java/org/apache/lucene/index/FieldInfos.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/FieldInfos.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/FieldInfos.java (working copy)
@@ -53,7 +53,7 @@
private final HashMap<String,FieldInfo> byName = new HashMap<String,FieldInfo>();
private int format;
- FieldInfos() { }
+ public FieldInfos() { }
/**
* Construct a FieldInfos object using the directory and the name of the file
@@ -62,7 +62,7 @@
* @param name The name of the file to open the IndexInput from in the Directory
* @throws IOException
*/
- FieldInfos(Directory d, String name) throws IOException {
+ public FieldInfos(Directory d, String name) throws IOException {
IndexInput input = d.openInput(name);
try {
read(input, name);
Index: lucene/src/java/org/apache/lucene/index/TermsEnum.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/TermsEnum.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/TermsEnum.java (working copy)
@@ -144,8 +144,7 @@
@Override
public Comparator<BytesRef> getComparator() {
- // return an unused dummy to prevent NPE
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ return null;
}
@Override
Index: lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (working copy)
@@ -130,7 +130,7 @@
// TODO: we may want to make this sort in same order
// as Codec's terms dict?
- final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUTF16Comparator());
+ final int[] termIDs = termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator());
tvf.writeVInt(numPostings);
byte bits = 0x0;
Index: lucene/src/java/org/apache/lucene/index/IndexWriter.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/IndexWriter.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/IndexWriter.java (working copy)
@@ -3964,7 +3964,7 @@
// commit merged deletes
SegmentReader reader = merge.readers[i] = readerPool.get(info, merge.mergeDocStores,
MERGE_READ_BUFFER_SIZE,
- -1);
+ -config.getReaderTermsIndexDivisor());
// We clone the segment readers because other
// deletes may come in while we're merging so we
Index: lucene/src/java/org/apache/lucene/index/CheckIndex.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/CheckIndex.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/CheckIndex.java (working copy)
@@ -32,7 +32,7 @@
import java.io.IOException;
import java.io.File;
import java.util.Collection;
-
+import java.util.Comparator;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
@@ -596,6 +596,10 @@
boolean hasOrd = true;
final long termCountStart = status.termCount;
+ BytesRef lastTerm = null;
+
+ Comparator<BytesRef> termComp = terms.getComparator();
+
while(true) {
final BytesRef term = terms.next();
@@ -603,6 +607,17 @@
break;
}
+ // make sure terms arrive in order according to
+ // the comparator
+ if (lastTerm == null) {
+ lastTerm = new BytesRef(term);
+ } else {
+ if (termComp.compare(lastTerm, term) >= 0) {
+ throw new RuntimeException("terms out of order: lastTerm=" + lastTerm + " term=" + term);
+ }
+ lastTerm.copy(term);
+ }
+
final int docFreq = terms.docFreq();
status.totFreq += docFreq;
Index: lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingCodec.java (working copy)
@@ -80,7 +80,7 @@
// Terms dict
success = false;
try {
- FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
+ FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, pulsingWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -111,7 +111,7 @@
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
- BytesRef.getUTF8SortedAsUTF16Comparator());
+ BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
} finally {
if (!success) {
@@ -126,7 +126,7 @@
state.dir, state.fieldInfos, state.segmentInfo.name,
pulsingReader,
state.readBufferSize,
- BytesRef.getUTF8SortedAsUTF16Comparator(),
+ BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE);
success = true;
return ret;
Index: lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/sep/SepCodec.java (working copy)
@@ -63,7 +63,7 @@
success = false;
try {
- FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
+ FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -95,7 +95,7 @@
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
- BytesRef.getUTF8SortedAsUTF16Comparator());
+ BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
} finally {
if (!success) {
@@ -111,7 +111,7 @@
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
- BytesRef.getUTF8SortedAsUTF16Comparator(),
+ BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE);
success = true;
return ret;
Index: lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java (working copy)
@@ -104,7 +104,7 @@
indexInterval = in.readInt();
this.indexDivisor = indexDivisor;
- if (indexDivisor == -1) {
+ if (indexDivisor < 0) {
totalIndexInterval = indexInterval;
} else {
// In case terms index gets loaded, later, on demand
@@ -131,7 +131,7 @@
}
success = true;
} finally {
- if (indexDivisor != -1) {
+ if (indexDivisor > 0) {
in.close();
this.in = null;
if (success) {
Index: lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/standard/StandardCodec.java (working copy)
@@ -58,7 +58,7 @@
success = false;
try {
- FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUTF16Comparator());
+ FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, docs, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -85,7 +85,7 @@
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
- BytesRef.getUTF8SortedAsUTF16Comparator());
+ BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
} finally {
if (!success) {
@@ -101,7 +101,7 @@
state.segmentInfo.name,
postings,
state.readBufferSize,
- BytesRef.getUTF8SortedAsUTF16Comparator(),
+ BytesRef.getUTF8SortedAsUnicodeComparator(),
TERMS_CACHE_SIZE);
success = true;
return ret;
Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermEnum.java (working copy)
@@ -53,6 +53,7 @@
long indexPointer = 0;
int indexInterval;
int skipInterval;
+ int newSuffixStart;
int maxSkipLevels;
private int formatM1SkipInterval;
@@ -136,6 +137,7 @@
prevBuffer.set(termBuffer);
termBuffer.read(input, fieldInfos);
+ newSuffixStart = termBuffer.newSuffixStart;
termInfo.docFreq = input.readVInt(); // read doc freq
termInfo.freqPointer += input.readVLong(); // read freq pointer
Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/preflex/TermBuffer.java (working copy)
@@ -19,7 +19,6 @@
import java.io.IOException;
import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.index.Term;
@@ -34,6 +33,8 @@
private UnicodeUtil.UTF16Result text = new UnicodeUtil.UTF16Result();
private BytesRef bytes = new BytesRef(10);
+ int newSuffixStart;
+
public final int compareTo(TermBuffer other) {
if (field == other.field) // fields are interned
return compareChars(text.result, text.length, other.text.result, other.text.length);
@@ -60,23 +61,33 @@
int start = input.readVInt();
int length = input.readVInt();
int totalLength = start + length;
+ if (bytes.bytes.length < totalLength) {
+ bytes.grow(totalLength);
+ }
if (dirty) {
// Fully convert all bytes since bytes is dirty
UnicodeUtil.UTF16toUTF8(text.result, 0, text.length, bytes);
- if (bytes.bytes.length < totalLength)
- bytes.bytes = new byte[totalLength];
bytes.length = totalLength;
input.readBytes(bytes.bytes, start, length);
UnicodeUtil.UTF8toUTF16(bytes.bytes, 0, totalLength, text);
dirty = false;
} else {
// Incrementally convert only the UTF8 bytes that are new:
- if (bytes.bytes.length < totalLength)
- bytes.bytes = ArrayUtil.grow(bytes.bytes, totalLength);
bytes.length = totalLength;
input.readBytes(bytes.bytes, start, length);
UnicodeUtil.UTF8toUTF16(bytes.bytes, start, length, text);
}
+
+ while(true) {
+ newSuffixStart = text.offsets[start];
+ if (newSuffixStart != -1) {
+ break;
+ }
+ if (--start == 0) {
+ newSuffixStart = 0;
+ break;
+ }
+ }
this.field = fieldInfos.fieldName(input.readVInt());
}
@@ -124,10 +135,11 @@
try {
clone = (TermBuffer)super.clone();
} catch (CloneNotSupportedException e) {}
-
clone.dirty = true;
clone.bytes = new BytesRef(10);
clone.text = new UnicodeUtil.UTF16Result();
+ clone.text.offsets = new int[text.offsets.length];
+ System.arraycopy(text.offsets, 0, clone.text.offsets, 0, text.offsets.length);
clone.text.copyText(text);
return clone;
}
Index: lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java (working copy)
@@ -39,11 +39,15 @@
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.ArrayUtil;
/** Exposes flex API on a pre-flex index, as a codec.
* @lucene.experimental */
public class PreFlexFields extends FieldsProducer {
+ private static final boolean DEBUG_SURROGATES = false;
+
public TermInfosReader tis;
public final TermInfosReader tisNoIndex;
@@ -60,6 +64,16 @@
throws IOException {
si = info;
+
+ // NOTE: we must always load terms index, even for
+ // "sequential" scan during merging, because what is
+ // sequential to merger may not be to TermInfosReader
+ // since we do the surrogates dance:
+ // nocommit -- how to pull right value from IW?
+ if (indexDivisor < 0) {
+ indexDivisor = -indexDivisor;
+ }
+
TermInfosReader r = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize, indexDivisor);
if (indexDivisor == -1) {
tisNoIndex = r;
@@ -174,7 +188,6 @@
private class PreFlexFieldsEnum extends FieldsEnum {
final Iterator<FieldInfo> it;
private final PreTermsEnum termsEnum;
- private int count;
FieldInfo current;
public PreFlexFieldsEnum() throws IOException {
@@ -185,7 +198,6 @@
@Override
public String next() {
if (it.hasNext()) {
- count++;
current = it.next();
return current.name;
} else {
@@ -195,7 +207,7 @@
@Override
public TermsEnum terms() throws IOException {
- termsEnum.reset(current, count == 1);
+ termsEnum.reset(current);
return termsEnum;
}
}
@@ -209,14 +221,15 @@
@Override
public TermsEnum iterator() throws IOException {
PreTermsEnum termsEnum = new PreTermsEnum();
- termsEnum.reset(fieldInfo, false);
+ termsEnum.reset(fieldInfo);
return termsEnum;
}
@Override
public Comparator<BytesRef> getComparator() {
- // Pre-flex indexes always sorted in UTF16 order
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ // Pre-flex indexes always sorted in UTF16 order, but
+ // we remap on-the-fly to unicode order
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}
@@ -227,37 +240,229 @@
private BytesRef current;
private final BytesRef scratchBytesRef = new BytesRef();
- void reset(FieldInfo fieldInfo, boolean isFirstField) throws IOException {
+ private int[] surrogateSeekPending = new int[1];
+ private boolean[] surrogateDidSeekBack = new boolean[1];
+ private int surrogateSeekUpto;
+ private char[] pendingPrefix;
+
+ private SegmentTermEnum seekTermEnum;
+ private Term protoTerm;
+ private int newSuffixStart;
+
+ void reset(FieldInfo fieldInfo) throws IOException {
this.fieldInfo = fieldInfo;
+ protoTerm = new Term(fieldInfo.name);
if (termEnum == null) {
- // First time reset is called
- if (isFirstField) {
- termEnum = getTermsDict().terms();
- skipNext = false;
- } else {
- termEnum = getTermsDict().terms(new Term(fieldInfo.name, ""));
- skipNext = true;
- }
+ termEnum = getTermsDict().terms(protoTerm);
+ seekTermEnum = getTermsDict().terms(protoTerm);
} else {
- final Term t = termEnum.term();
- if (t != null && t.field() == fieldInfo.name) {
- // No need to seek -- we have already advanced onto
- // this field. We must be @ first term because
- // flex API will not advance this enum further, on
- // seeing a different field.
- } else {
- assert t == null || !t.field().equals(fieldInfo.name); // make sure field name is interned
- final TermInfosReader tis = getTermsDict();
- tis.seekEnum(termEnum, new Term(fieldInfo.name, ""));
+ getTermsDict().seekEnum(termEnum, protoTerm);
+ }
+ skipNext = true;
+
+ surrogateSeekUpto = 0;
+ newSuffixStart = 0;
+
+ surrogatesDance();
+ }
+
+ private void surrogatesDance() throws IOException {
+
+ // Tricky: prior to 4.0, Lucene index sorted terms in
+ // UTF16 order, but as of 4.0 we sort by Unicode code
+ // point order. These orders differ because of the
+ // surrrogates; so we have to fixup our enum, here, by
+ // carefully first seeking past the surrogates and
+ // then back again at the end. The process is
+ // recursive, since any given term could have multiple
+ // new occurrences of surrogate pairs, so we use a
+ // stack to record the pending seek-backs.
+ if (DEBUG_SURROGATES) {
+ System.out.println(" dance start term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
+ }
+
+ while(popPendingSeek());
+ while(pushNewSurrogate());
+ }
+
+ // only for debugging
+ private String getStack() {
+ if (surrogateSeekUpto == 0) {
+ return "null";
+ } else {
+ StringBuffer sb = new StringBuffer();
+ for(int i=0;i<surrogateSeekUpto;i++) {
+ if (i > 0) {
+ sb.append(' ');
+ }
+ sb.append(surrogateSeekPending[i]);
}
- skipNext = true;
+ sb.append(" pendingSeekText=" + new String(pendingPrefix, 0, surrogateSeekPending[surrogateSeekUpto-1]));
+ return sb.toString();
}
}
+ private boolean popPendingSeek() throws IOException {
+ if (DEBUG_SURROGATES) {
+ System.out.println(" check pop newSuffix=" + newSuffixStart + " stack=" + getStack());
+ }
+ // if a .next() has advanced beyond the
+ // after-surrogates range we had last seeked to, we
+ // must seek back to the start and resume .next from
+ // there. this pops the pending seek off the stack.
+ final Term t = termEnum.term();
+ if (surrogateSeekUpto > 0) {
+ final int seekPrefix = surrogateSeekPending[surrogateSeekUpto-1];
+ if (DEBUG_SURROGATES) {
+ System.out.println(" seekPrefix=" + seekPrefix);
+ }
+ if (newSuffixStart < seekPrefix) {
+ assert pendingPrefix != null;
+ assert pendingPrefix.length > seekPrefix;
+ pendingPrefix[seekPrefix] = UnicodeUtil.UNI_SUR_HIGH_START;
+ Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix));
+ if (DEBUG_SURROGATES) {
+ System.out.println(" do pop; seek back to " + UnicodeUtil.toHexString(t2.text()));
+ }
+ getTermsDict().seekEnum(termEnum, t2);
+ surrogateDidSeekBack[surrogateSeekUpto-1] = true;
+
+ // +2 because we don't want to re-check the
+ // surrogates we just seek'd back to
+ newSuffixStart = seekPrefix + 2;
+ return true;
+ } else if (newSuffixStart == seekPrefix && surrogateDidSeekBack[surrogateSeekUpto-1] && t != null && t.field() == fieldInfo.name && t.text().charAt(seekPrefix) > UnicodeUtil.UNI_SUR_LOW_END) {
+ assert pendingPrefix != null;
+ assert pendingPrefix.length > seekPrefix;
+ pendingPrefix[seekPrefix] = 0xffff;
+ Term t2 = protoTerm.createTerm(new String(pendingPrefix, 0, 1+seekPrefix));
+ if (DEBUG_SURROGATES) {
+ System.out.println(" finish pop; seek fwd to " + UnicodeUtil.toHexString(t2.text()));
+ }
+ getTermsDict().seekEnum(termEnum, t2);
+ if (DEBUG_SURROGATES) {
+ System.out.println(" found term=" + (termEnum.term() == null ? null : UnicodeUtil.toHexString(termEnum.term().text())));
+ }
+ surrogateSeekUpto--;
+
+ if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
+ // force pop
+ newSuffixStart = -1;
+ } else {
+ newSuffixStart = termEnum.newSuffixStart;
+ }
+
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ private boolean pushNewSurrogate() throws IOException {
+ if (DEBUG_SURROGATES) {
+ System.out.println(" check push newSuffix=" + newSuffixStart + " stack=" + getStack());
+ }
+ final Term t = termEnum.term();
+ if (t == null || t.field() != fieldInfo.name) {
+ return false;
+ }
+ final String text = t.text();
+ final int textLen = text.length();
+
+ for(int i=Math.max(0,newSuffixStart);i<textLen;i++) {
+ final char ch = text.charAt(i);
+ if (ch >= UnicodeUtil.UNI_SUR_HIGH_START && ch <= UnicodeUtil.UNI_SUR_HIGH_END && (surrogateSeekUpto == 0 || i > surrogateSeekPending[surrogateSeekUpto-1])) {
+
+ if (DEBUG_SURROGATES) {
+ System.out.println(" found high surr 0x" + Integer.toHexString(ch) + " at pos=" + i);
+ }
+
+ // the next() that we just did read in a new
+ // suffix, containing a surrogate pair
+
+ // seek forward to see if there are any terms with
+ // this same prefix, but with characters after the
+ // surrogate range; if so, we must first iterate
+ // them, then seek back to the surrogates
+
+ char[] testPrefix = new char[i+1];
+ for(int j=0;j<i;j++) {
+ testPrefix[j] = text.charAt(j);
+ }
+ testPrefix[i] = 1+UnicodeUtil.UNI_SUR_LOW_END;
+
+ getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(new String(testPrefix)));
+
+ Term t2 = seekTermEnum.term();
+ boolean isPrefix;
+ if (t2 != null && t2.field() == fieldInfo.name) {
+ String seekText = t2.text();
+ isPrefix = true;
+ if (DEBUG_SURROGATES) {
+ System.out.println(" seek found " + UnicodeUtil.toHexString(seekText));
+ }
+ for(int j=0;j<i;j++) {
+ if (testPrefix[j] != seekText.charAt(j)) {
+ isPrefix = false;
+ break;
+ }
+ }
+ if (DEBUG_SURROGATES && !isPrefix) {
+ System.out.println(" no end terms");
+ }
+ } else {
+ if (DEBUG_SURROGATES) {
+ System.out.println(" no end terms");
+ }
+ isPrefix = false;
+ }
+
+ if (isPrefix) {
+ // we found a term, sharing the same prefix,
+ // with characters after the surrogates, so we
+ // must first enum those, and then return the
+ // the surrogates afterwards. push that pending
+ // seek on the surrogates stack now:
+ pendingPrefix = testPrefix;
+
+ getTermsDict().seekEnum(termEnum, t2);
+
+ if (surrogateSeekUpto == surrogateSeekPending.length) {
+ surrogateSeekPending = ArrayUtil.grow(surrogateSeekPending);
+ }
+ if (surrogateSeekUpto == surrogateDidSeekBack.length) {
+ surrogateDidSeekBack = ArrayUtil.grow(surrogateDidSeekBack);
+ }
+ surrogateSeekPending[surrogateSeekUpto] = i;
+ surrogateDidSeekBack[surrogateSeekUpto] = false;
+ surrogateSeekUpto++;
+
+ if (DEBUG_SURROGATES) {
+ System.out.println(" do push " + i + "; end term=" + UnicodeUtil.toHexString(t2.text()));
+ }
+
+ newSuffixStart = i+1;
+
+ return true;
+ } else {
+ // there are no terms after the surrogates, so
+ // we do nothing to the enum and just step
+ // through the surrogates like normal. but we
+ // must keep iterating through the term, in case
+ // another surrogate pair appears later
+ }
+ }
+ }
+
+ return false;
+ }
+
@Override
public Comparator<BytesRef> getComparator() {
- // Pre-flex indexes always sorted in UTF16 order
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ // Pre-flex indexes always sorted in UTF16 order, but
+ // we remap on-the-fly to unicode order
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
@@ -272,14 +477,24 @@
@Override
public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
+ if (DEBUG_SURROGATES) {
+ System.out.println("TE.seek() term=" + term.utf8ToString());
+ }
skipNext = false;
final TermInfosReader tis = getTermsDict();
- final Term t0 = new Term(fieldInfo.name, term.utf8ToString());
+ final Term t0 = protoTerm.createTerm(term.utf8ToString());
+
+ assert termEnum != null;
+
if (termEnum == null) {
termEnum = tis.terms(t0);
} else {
tis.seekEnum(termEnum, t0);
}
+
+ surrogateSeekUpto = 0;
+ surrogatesDance();
+
final Term t = termEnum.term();
final BytesRef tr;
@@ -304,6 +519,9 @@
@Override
public BytesRef next() throws IOException {
+ if (DEBUG_SURROGATES) {
+ System.out.println("TE.next() skipNext=" + skipNext);
+ }
if (skipNext) {
skipNext = false;
if (termEnum.term() == null) {
@@ -313,19 +531,37 @@
return current = scratchBytesRef;
}
}
- if (termEnum.next()) {
+ if (termEnum.next() && termEnum.term().field() == fieldInfo.name) {
+ newSuffixStart = termEnum.newSuffixStart;
+ if (DEBUG_SURROGATES) {
+ System.out.println(" set newSuffixStart=" + newSuffixStart);
+ }
+ surrogatesDance();
final Term t = termEnum.term();
- if (t.field() == fieldInfo.name) {
+ if (t == null || t.field() != fieldInfo.name) {
+ assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
+ current = null;
+ } else {
scratchBytesRef.copy(t.text());
current = scratchBytesRef;
+ }
+ return current;
+ } else {
+ if (DEBUG_SURROGATES) {
+ System.out.println(" force pop");
+ }
+ // force pop
+ newSuffixStart = -1;
+ surrogatesDance();
+ final Term t = termEnum.term();
+ if (t == null || t.field() != fieldInfo.name) {
+ assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
+ return null;
+ } else {
+ scratchBytesRef.copy(t.text());
+ current = scratchBytesRef;
return current;
- } else {
- assert !t.field().equals(fieldInfo.name); // make sure field name is interned
- // Crossed into new field
- return null;
}
- } else {
- return null;
}
}
Index: lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java
===================================================================
--- lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/index/codecs/intblock/IntBlockCodec.java (working copy)
@@ -67,7 +67,7 @@
success = false;
try {
- FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUTF16Comparator());
+ FieldsConsumer ret = new StandardTermsDictWriter(indexWriter, state, postingsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
@@ -95,7 +95,7 @@
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
- BytesRef.getUTF8SortedAsUTF16Comparator());
+ BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
} finally {
if (!success) {
@@ -111,7 +111,7 @@
state.segmentInfo.name,
postingsReader,
state.readBufferSize,
- BytesRef.getUTF8SortedAsUTF16Comparator(),
+ BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE);
success = true;
return ret;
Index: lucene/src/java/org/apache/lucene/util/automaton/Transition.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/automaton/Transition.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/util/automaton/Transition.java (working copy)
@@ -210,64 +210,4 @@
}
public static final Comparator<Transition> CompareByMinMaxThenDest = new CompareByMinMaxThenDestSingle();
-
- private static class UTF8InUTF16Order {
- protected int compareCodePoint(int aByte, int bByte) {
- if (aByte != bByte) {
- // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
- // We know the terms are not equal, but, we may
- // have to carefully fixup the bytes at the
- // difference to match UTF16's sort order:
- if (aByte >= 0xee && bByte >= 0xee) {
- if ((aByte & 0xfe) == 0xee) {
- aByte += 0x10;
- }
- if ((bByte&0xfe) == 0xee) {
- bByte += 0x10;
- }
- }
- return aByte - bByte;
- }
- return 0;
- }
- }
-
- private static final class CompareByDestThenMinMaxUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> {
- public int compare(Transition t1, Transition t2) {
- if (t1.to != t2.to) {
- if (t1.to == null) return -1;
- else if (t2.to == null) return 1;
- else if (t1.to.number < t2.to.number) return -1;
- else if (t1.to.number > t2.to.number) return 1;
- }
- int minComp = compareCodePoint(t1.min, t2.min);
- if (minComp != 0) return minComp;
- int maxComp = compareCodePoint(t1.max, t2.max);
- if (maxComp != 0) return maxComp;
- return 0;
- }
- }
-
- public static final Comparator<Transition> CompareByDestThenMinMaxUTF8InUTF16Order = new CompareByDestThenMinMaxUTF8InUTF16OrderSingle();
-
- private static final class CompareByMinMaxThenDestUTF8InUTF16OrderSingle extends UTF8InUTF16Order implements Comparator<Transition> {
- public int compare(Transition t1, Transition t2) {
- int minComp = compareCodePoint(t1.min, t2.min);
- if (minComp != 0) return minComp;
- int maxComp = compareCodePoint(t1.max, t2.max);
- if (maxComp != 0) return maxComp;
- if (t1.to != t2.to) {
- if (t1.to == null) return -1;
- else if (t2.to == null) return 1;
- else if (t1.to.number < t2.to.number) return -1;
- else if (t1.to.number > t2.to.number) return 1;
- }
- return 0;
- }
- }
-
- public static final Comparator<Transition> CompareByMinMaxThenDestUTF8InUTF16Order = new CompareByMinMaxThenDestUTF8InUTF16OrderSingle();
-
-
}
Index: lucene/src/java/org/apache/lucene/util/ArrayUtil.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/ArrayUtil.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/util/ArrayUtil.java (working copy)
@@ -327,6 +327,29 @@
return array;
}
+ public static boolean[] grow(boolean[] array, int minSize) {
+ if (array.length < minSize) {
+ boolean[] newArray = new boolean[oversize(minSize, 1)];
+ System.arraycopy(array, 0, newArray, 0, array.length);
+ return newArray;
+ } else
+ return array;
+ }
+
+ public static boolean[] grow(boolean[] array) {
+ return grow(array, 1 + array.length);
+ }
+
+ public static boolean[] shrink(boolean[] array, int targetSize) {
+ final int newSize = getShrinkSize(array.length, targetSize, 1);
+ if (newSize != array.length) {
+ boolean[] newArray = new boolean[newSize];
+ System.arraycopy(array, 0, newArray, 0, newSize);
+ return newArray;
+ } else
+ return array;
+ }
+
public static char[] grow(char[] array, int minSize) {
if (array.length < minSize) {
char[] newArray = new char[oversize(minSize, RamUsageEstimator.NUM_BYTES_CHAR)];
Index: lucene/src/java/org/apache/lucene/util/BytesRef.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/BytesRef.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/util/BytesRef.java (working copy)
@@ -217,14 +217,7 @@
bytes = ArrayUtil.grow(bytes, newLength);
}
- private final static Comparator<BytesRef> utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator();
-
- public static Comparator<BytesRef> getUTF8SortedAsUTF16Comparator() {
- return utf8SortedAsUTF16SortOrder;
- }
-
/** Unsigned byte order comparison */
- /*
public int compareTo(BytesRef other) {
if (this == other) return 0;
@@ -245,52 +238,18 @@
// One is a prefix of the other, or, they are equal:
return this.length - other.length;
}
- */
- /** Lucene default index order. Currently the same as String.compareTo() (UTF16) but will change
- * in the future to unsigned byte comparison. */
- public int compareTo(BytesRef other) {
- if (this == other) return 0;
+ private final static Comparator<BytesRef> utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator();
- final byte[] aBytes = this.bytes;
- int aUpto = this.offset;
- final byte[] bBytes = other.bytes;
- int bUpto = other.offset;
-
- final int aStop = aUpto + Math.min(this.length, other.length);
-
- while(aUpto < aStop) {
- int aByte = aBytes[aUpto++] & 0xff;
- int bByte = bBytes[bUpto++] & 0xff;
- if (aByte != bByte) {
-
- // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
- // We know the terms are not equal, but, we may
- // have to carefully fixup the bytes at the
- // difference to match UTF16's sort order:
- if (aByte >= 0xee && bByte >= 0xee) {
- if ((aByte & 0xfe) == 0xee) {
- aByte += 0x10;
- }
- if ((bByte&0xfe) == 0xee) {
- bByte += 0x10;
- }
- }
- return aByte - bByte;
- }
- }
-
- // One is a prefix of the other, or, they are equal:
- return this.length - other.length;
+ public static Comparator<BytesRef> getUTF8SortedAsUnicodeComparator() {
+ return utf8SortedAsUnicodeSortOrder;
}
- private static class UTF8SortedAsUTF16Comparator implements Comparator<BytesRef> {
+ private static class UTF8SortedAsUnicodeComparator implements Comparator<BytesRef> {
// Only singleton
- private UTF8SortedAsUTF16Comparator() {};
+ private UTF8SortedAsUnicodeComparator() {};
public int compare(BytesRef a, BytesRef b) {
-
final byte[] aBytes = a.bytes;
int aUpto = a.offset;
final byte[] bBytes = b.bytes;
@@ -307,32 +266,15 @@
int aByte = aBytes[aUpto++] & 0xff;
int bByte = bBytes[bUpto++] & 0xff;
- if (aByte != bByte) {
-
- // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
-
- // We know the terms are not equal, but, we may
- // have to carefully fixup the bytes at the
- // difference to match UTF16's sort order:
- if (aByte >= 0xee && bByte >= 0xee) {
- if ((aByte & 0xfe) == 0xee) {
- aByte += 0x10;
- }
- if ((bByte&0xfe) == 0xee) {
- bByte += 0x10;
- }
- }
- return aByte - bByte;
+ int diff = aByte - bByte;
+ if (diff != 0) {
+ return diff;
}
}
// One is a prefix of the other, or, they are equal:
return a.length - b.length;
- }
-
- public boolean equals(Object other) {
- return this == other;
- }
+ }
}
public void writeExternal(ObjectOutput out)
Index: lucene/src/java/org/apache/lucene/util/UnicodeUtil.java
===================================================================
--- lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (revision 956375)
+++ lucene/src/java/org/apache/lucene/util/UnicodeUtil.java (working copy)
@@ -358,7 +358,6 @@
out[outUpto++] = (char) ((chHalf & HALF_MASK) + UNI_SUR_LOW_START);
}
}
-
offsets[upto] = outUpto;
result.length = outUpto;
}
@@ -483,7 +482,7 @@
}
}
*/
- public static final boolean validUTF16String(CharSequence s) {
+ public static boolean validUTF16String(CharSequence s) {
final int size = s.length();
for(int i=0;i<size;i++) {
char ch = s.charAt(i);
@@ -507,7 +506,7 @@
return true;
}
- public static final boolean validUTF16String(char[] s, int size) {
+ public static boolean validUTF16String(char[] s, int size) {
for(int i=0;i<size;i++) {
char ch = s[i];
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
@@ -559,7 +558,7 @@
/** Returns the number of code points in this utf8
* sequence. Behavior is undefined if the utf8 sequence
* is invalid.*/
- public static final int codePointCount(BytesRef utf8) {
+ public static int codePointCount(BytesRef utf8) {
int upto = utf8.offset;
final int limit = utf8.offset + utf8.length;
final byte[] bytes = utf8.bytes;
@@ -673,4 +672,33 @@
}
return new String(chars, 0, w);
}
+
+ // for debugging
+ public static String toHexString(String s) {
+ StringBuilder sb = new StringBuilder();
+ for(int i=0;i<s.length();i++) {
+ char ch = s.charAt(i);
+ if (i > 0) {
+ sb.append(' ');
+ }
+ if (ch < 128) {
+ sb.append(ch);
+ } else {
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
+ sb.append("H:");
+ } else if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
+ sb.append("L:");
+ } else if (ch > UNI_SUR_LOW_END) {
+ if (ch == 0xffff) {
+ sb.append("F:");
+ } else {
+ sb.append("E:");
+ }
+ }
+
+ sb.append("0x" + Integer.toHexString(ch));
+ }
+ }
+ return sb.toString();
+ }
}
Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java
===================================================================
--- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (revision 956375)
+++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedIndexReader.java (working copy)
@@ -426,7 +426,7 @@
@Override
public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
};
}
Index: lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java
===================================================================
--- lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (revision 956375)
+++ lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermsEnum.java (working copy)
@@ -123,7 +123,7 @@
@Override
public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}
Index: lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
===================================================================
--- lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (revision 956375)
+++ lucene/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java (working copy)
@@ -808,7 +808,7 @@
@Override
public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
@@ -903,7 +903,7 @@
@Override
public Comparator<BytesRef> getComparator() {
- return BytesRef.getUTF8SortedAsUTF16Comparator();
+ return BytesRef.getUTF8SortedAsUnicodeComparator();
}
}