lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java - lucene-solr - Git at Google

 package org.apache.lucene.index.codecs.preflex;

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;
 import java.util.Collection;
 import java.util.Iterator;
 import java.util.TreeMap;
 import java.util.Comparator;

 import org.apache.lucene.index.DocsEnum;
 import org.apache.lucene.index.DocsAndPositionsEnum;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.FieldsEnum;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentInfo;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.index.CompoundFileReader;
 import org.apache.lucene.index.codecs.FieldsProducer;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.UnicodeUtil;

 /** Exposes flex API on a pre-flex index, as a codec.
  * @lucene.experimental */
 public class PreFlexFields extends FieldsProducer {

   // Compile-time switch: when true, the surrogate-handling code in
   // PreTermsEnum prints a trace of its seeks to System.out.
   private static final boolean DEBUG_SURROGATES = false;

   // Terms dictionary reader; assigned by the constructor, or later by
   // loadTermsIndex() if it is still null at that point.
   public TermInfosReader tis;
   // Fallback terms dictionary returned by getTermsDict() when tis is null.
   public final TermInfosReader tisNoIndex;

   // Postings inputs opened from "<segment>.frq" and "<segment>.prx";
   // proxStream is null when every indexed field omits term freq and
   // positions.
   public final IndexInput freqStream;
   public final IndexInput proxStream;
   final private FieldInfos fieldInfos;
   private final SegmentInfo si;
   // Indexed fields keyed by field name; the TreeMap keeps them sorted,
   // which is the order PreFlexFieldsEnum iterates them in.
   final TreeMap<String,FieldInfo> fields = new TreeMap<String,FieldInfo>();
   private final Directory dir;
   private final int readBufferSize;
   // Compound-file reader opened lazily by loadTermsIndex(); closed in close().
   private Directory cfsReader;

   /**
    * Opens the terms dictionary and the postings streams of one pre-flex
    * segment.
    *
    * @param dir directory the segment files are read from
    * @param fieldInfos field metadata of the segment; every indexed field is
    *        registered in {@link #fields}
    * @param info the segment to open
    * @param readBufferSize buffer size passed to every opened input
    * @param indexDivisor terms index divisor; a negative value is replaced
    *        by its absolute value, so the terms index is always loaded
    * @throws IOException if any file cannot be opened; inputs that were
    *         already opened by this constructor are closed before the
    *         exception propagates
    */
   public PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor)
     throws IOException {

     si = info;
     this.dir = dir;
     this.readBufferSize = readBufferSize;
     this.fieldInfos = fieldInfos;

     // NOTE: we must always load terms index, even for
     // "sequential" scan during merging, because what is
     // sequential to merger may not be to TermInfosReader
     // since we do the surrogates dance:
     if (indexDivisor < 0) {
       indexDivisor = -indexDivisor;
     }

     TermInfosReader termsReader = null;
     IndexInput freqIn = null;
     IndexInput proxIn = null;
     boolean success = false;
     try {
       termsReader = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize, indexDivisor);

       // make sure that all index files have been read or are kept open
       // so that if an index update removes them we'll still have them
       freqIn = dir.openInput(info.name + ".frq", readBufferSize);
       boolean anyProx = false;
       final int numFields = fieldInfos.size();
       for(int i=0;i<numFields;i++) {
         final FieldInfo fieldInfo = fieldInfos.fieldInfo(i);
         if (fieldInfo.isIndexed) {
           fields.put(fieldInfo.name, fieldInfo);
           if (!fieldInfo.omitTermFreqAndPositions) {
             anyProx = true;
           }
         }
       }

       if (anyProx) {
         proxIn = dir.openInput(info.name + ".prx", readBufferSize);
       }
       success = true;
     } finally {
       if (!success) {
         // Release whatever was opened before the failure.  Secondary
         // close failures are deliberately dropped so that the
         // exception that caused the failure is the one that propagates.
         if (freqIn != null) {
           try {
             freqIn.close();
           } catch (IOException ignored) {
             // keep the original exception
           }
         }
         if (termsReader != null) {
           try {
             termsReader.close();
           } catch (IOException ignored) {
             // keep the original exception
           }
         }
       }
     }

     // The divisor was made non-negative above, so the reader always
     // carries a terms index; there is never an index-less reader here.
     tis = termsReader;
     tisNoIndex = null;
     freqStream = freqIn;
     proxStream = proxIn;
   }

   // If this returns true, we do the surrogates dance so that
   // the terms are sorted by unicode sort order.  This should be
   // true when segments are used for "normal" searching;
   // it's only false during testing, to create a pre-flex
   // index, using the test-only PreFlexRW.
   protected boolean sortTermsByUnicode() {
     return true;
   }

   /**
    * Appends the names of this segment's terms, terms index and freq files
    * to {@code files}, plus the prox file when the segment claims to have
    * prox data and the file actually exists in {@code dir}.
    */
   static void files(Directory dir, SegmentInfo info, Collection<String> files) throws IOException {
     final String segment = info.name;

     files.add(IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_EXTENSION));
     files.add(IndexFileNames.segmentFileName(segment, "", PreFlexCodec.TERMS_INDEX_EXTENSION));
     files.add(IndexFileNames.segmentFileName(segment, "", PreFlexCodec.FREQ_EXTENSION));

     if (!info.getHasProx()) {
       return;
     }

     // LUCENE-1739: certain 2.9-dev versions wrongly recorded
     // hasProx=true in the segments file, so the flag alone
     // cannot be trusted; only list the prox file if it is
     // really there:
     final String proxFile = IndexFileNames.segmentFileName(segment, "", PreFlexCodec.PROX_EXTENSION);
     if (dir.fileExists(proxFile)) {
       files.add(proxFile);
     }
   }

   /** Returns a new enumerator over the indexed fields, in field-name order. */
   @Override
   public FieldsEnum iterator() throws IOException {
     return new PreFlexFieldsEnum();
   }

   /**
    * Returns a {@link Terms} view for {@code field}, or null when the
    * segment's field infos have no entry for that name.
    */
   @Override
   public Terms terms(String field) {
     final FieldInfo info = fieldInfos.fieldInfo(field);
     return info == null ? null : new PreTerms(info);
   }

   /** Returns {@link #tis} when it is set, otherwise {@link #tisNoIndex}. */
   synchronized private TermInfosReader getTermsDict() {
     return tis != null ? tis : tisNoIndex;
   }

   /**
    * Opens the terms dictionary reader if it has not been opened yet; does
    * nothing when {@link #tis} is already set.
    *
    * <p>When the segment uses a compound file and {@code dir} is not itself
    * a {@link CompoundFileReader}, a compound-file reader is opened and
    * kept in {@code cfsReader} so that {@link #close()} can release it.
    * When {@code dir} already is a compound-file reader it is used directly.
    *
    * @param indexDivisor terms index divisor handed to the new reader
    * @throws IOException if the compound file or the terms dictionary
    *         cannot be opened
    */
   @Override
   synchronized public void loadTermsIndex(int indexDivisor) throws IOException {
     if (tis == null) {
       final Directory dir0;
       if (si.getUseCompoundFile()) {
         // In some cases, we were originally opened when CFS
         // was not used, but then we are asked to open the
         // terms reader with index, the segment has switched
         // to CFS

         if (!(dir instanceof CompoundFileReader)) {
           // Reuse a reader left over from an earlier attempt that
           // failed after opening it, rather than opening (and
           // leaking) a second one.
           if (cfsReader == null) {
             cfsReader = new CompoundFileReader(dir, IndexFileNames.segmentFileName(si.name, "", IndexFileNames.COMPOUND_FILE_EXTENSION), readBufferSize);
           }
           dir0 = cfsReader;
         } else {
           // dir already reads from the compound file; cfsReader
           // stays null in this case and must not be used.
           dir0 = dir;
         }
       } else {
         dir0 = dir;
       }

       tis = new TermInfosReader(dir0, si.name, fieldInfos, readBufferSize, indexDivisor);
     }
   }

   /**
    * Closes the terms dictionary readers, the lazily opened compound-file
    * reader and the freq/prox streams.
    *
    * <p>Every resource is closed even if closing an earlier one throws; in
    * that case the exception thrown last is the one that propagates.
    *
    * @throws IOException if closing any of the resources fails
    */
   @Override
   public void close() throws IOException {
     try {
       if (tis != null) {
         tis.close();
       }
     } finally {
       try {
         if (tisNoIndex != null) {
           tisNoIndex.close();
         }
       } finally {
         try {
           if (cfsReader != null) {
             cfsReader.close();
           }
         } finally {
           try {
             if (freqStream != null) {
               freqStream.close();
             }
           } finally {
             if (proxStream != null) {
               proxStream.close();
             }
           }
         }
       }
     }
   }

   /**
    * Walks the indexed fields in name order.  A single PreTermsEnum is
    * created up front and re-targeted at the current field on every call
    * to {@link #terms()}.
    */
   private class PreFlexFieldsEnum extends FieldsEnum {
     final Iterator<FieldInfo> it;
     private final PreTermsEnum termsEnum;
     FieldInfo current;

     public PreFlexFieldsEnum() throws IOException {
       termsEnum = new PreTermsEnum();
       it = fields.values().iterator();
     }

     /** Moves to the next field and returns its name, or null when done. */
     @Override
     public String next() {
       if (!it.hasNext()) {
         return null;
       }
       current = it.next();
       return current.name;
     }

     /** Returns the shared terms enum, reset onto the current field. */
     @Override
     public TermsEnum terms() throws IOException {
       final PreTermsEnum shared = termsEnum;
       shared.reset(current);
       return shared;
     }
   }

   /** {@link Terms} implementation bound to one field of this segment. */
   private class PreTerms extends Terms {
     final FieldInfo fieldInfo;

     PreTerms(FieldInfo fieldInfo) {
       this.fieldInfo = fieldInfo;
     }

     /** Creates a fresh terms enum positioned on this field. */
     @Override
     public TermsEnum iterator() throws IOException {
       final PreTermsEnum fresh = new PreTermsEnum();
       fresh.reset(fieldInfo);
       return fresh;
     }

     /**
      * The on-disk order of a pre-flex index is UTF16 order; terms are
      * remapped on the fly to unicode order unless
      * {@link #sortTermsByUnicode()} says otherwise.
      */
     @Override
     public Comparator<BytesRef> getComparator() {
       return sortTermsByUnicode()
         ? BytesRef.getUTF8SortedAsUnicodeComparator()
         : BytesRef.getUTF8SortedAsUTF16Comparator();
     }
   }

   private class PreTermsEnum extends TermsEnum {
     // The enum that defines our current position; docs() and
     // docsAndPositions() are seeked from it.
     private SegmentTermEnum termEnum;
     // Field this enum is currently restricted to (set by reset()).
     private FieldInfo fieldInfo;
     // True right after reset(): the first next() returns the term
     // termEnum is already positioned on instead of advancing.
     private boolean skipNext;
     // Bytes of the term last returned by next()/seek(); exposed via term().
     private BytesRef current;

     // Second enum used for trial seeks during the surrogate dance, so
     // that termEnum is only moved once a seek is known to be useful.
     private SegmentTermEnum seekTermEnum;
     // Term carrying only the current field name; createTerm() is called
     // on it to build seek targets within that field.
     private Term protoTerm;

     // Bit patterns tested against a byte by isNonBMPChar / isHighBMPChar.
     private static final byte UTF8_NON_BMP_LEAD = (byte) 0xf0;
     private static final byte UTF8_HIGH_BMP_LEAD = (byte) 0xee;

     // Tests whether the byte at idx has every bit of the 0xee
     // lead pattern set, ie it starts a unicode char that sorts
     // "after" the surrogates in UTF16 (>= U+E000 and <= U+FFFF):
     private final boolean isHighBMPChar(byte[] b, int idx) {
       final int lead = b[idx];
       return (lead & UTF8_HIGH_BMP_LEAD) == UTF8_HIGH_BMP_LEAD;
     }

     // Tests whether the byte at idx has every bit of the 0xf0
     // lead pattern set, ie the UTF8 sequence starting there
     // encodes a char outside of the BMP (what would be a
     // surrogate pair in UTF16):
     private final boolean isNonBMPChar(byte[] b, int idx) {
       final int lead = b[idx];
       return (lead & UTF8_NON_BMP_LEAD) == UTF8_NON_BMP_LEAD;
     }

     // Holds the bytes temporarily overwritten while building a seek
     // target (only the first 3 slots are written by the visible code).
     private final byte[] scratch = new byte[4];
     // Copy of the term we were on before the last advance/seek.
     private final BytesRef prevTerm = new BytesRef();
     // Working copy of the term the enum is positioned on; edited in
     // place to build seek targets.
     private final BytesRef scratchTerm = new BytesRef();
     // Byte index where the current term starts to differ from prevTerm.
     private int newSuffixStart;

     // Swap in S, in place of E: temporarily replaces the high-BMP
     // char starting at term.bytes[pos] with the 4 byte sequence
     // F0 90 80 80, seeks te to that target, and reports whether
     // te landed on a term of the same field that shares the
     // prefix [0, pos) and has a non-BMP lead byte at pos.
     //
     // te is left wherever the seek put it.  On the normal exit
     // path term's length and the 3 bytes at pos are restored.
     // NOTE(review): the byte at pos+3 is overwritten but not
     // restored, and the early "return false" below skips the
     // restore entirely -- callers visible here only read bytes
     // below pos afterwards, but confirm before reusing term.
     private boolean seekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) throws IOException {
       final int savLength = term.length;

       assert term.offset == 0;

       // The 3 bytes starting at pos make up 1
       // unicode character:
       assert isHighBMPChar(term.bytes, pos);

       // NOTE: we cannot make this assert, because
       // AutomatonQuery legitimately sends us malformed UTF8
       // (eg the UTF8 bytes with just 0xee)
       // assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();

       // Make room for the 4 byte replacement sequence:
       if (term.bytes.length < 4+pos) {
         term.grow(4+pos);
       }

       // Save the bytes && length, since we need to
       // restore this if seek "back" finds no matching
       // terms
       scratch[0] = term.bytes[pos];
       scratch[1] = term.bytes[pos+1];
       scratch[2] = term.bytes[pos+2];

       term.bytes[pos] = (byte) 0xf0;
       term.bytes[pos+1] = (byte) 0x90;
       term.bytes[pos+2] = (byte) 0x80;
       term.bytes[pos+3] = (byte) 0x80;
       term.length = 4+pos;

       if (DEBUG_SURROGATES) {
         System.out.println("      try seek term=" + UnicodeUtil.toHexString(term.utf8ToString()));
       }

       // Seek "back":
       getTermsDict().seekEnum(te, protoTerm.createTerm(term));

       // Test if the term we seek'd to in fact found a
       // surrogate pair at the same position as the E:
       Term t2 = te.term();

       // Cannot be null (or move to next field) because at
       // "worst" it'd seek to the same term we are on now,
       // unless we are being called from seek
       // (field names are compared by identity here)
       if (t2 == null || t2.field() != fieldInfo.name) {
         return false;
       }

       if (DEBUG_SURROGATES) {
         System.out.println("      got term=" + UnicodeUtil.toHexString(t2.text()));
       }

       // Now test if prefix is identical and we found
       // a non-BMP char at the same position:
       BytesRef b2 = t2.bytes();
       assert b2.offset == 0;

       boolean matches;
       if (b2.length >= term.length && isNonBMPChar(b2.bytes, pos)) {
         matches = true;
         for(int i=0;i<pos;i++) {
           if (term.bytes[i] != b2.bytes[i]) {
             matches = false;
             break;
           }
         }
       } else {
         matches = false;
       }

       // Restore term:
       term.length = savLength;
       term.bytes[pos] = scratch[0];
       term.bytes[pos+1] = scratch[1];
       term.bytes[pos+2] = scratch[2];

       return matches;
     }

     // Seek type 2 "continue" (back to the start of the
     // surrogates): scan the stripped suffix from the
     // prior term, backwards. If there was an E in that
     // part, then we try to seek back to S.  If that
     // seek finds a matching term, we go there.
     //
     // Returns true if termEnum was repositioned; in that
     // case newSuffixStart and scratchTerm describe the new
     // position.  prevTerm is shortened as a side effect.
     private boolean doContinue() throws IOException {

       if (DEBUG_SURROGATES) {
         System.out.println("  try cont");
       }

       int downTo = prevTerm.length-1;

       boolean didSeek = false;

       // Only bytes of prevTerm strictly above this index are
       // examined:
       final int limit = Math.min(newSuffixStart, scratchTerm.length-1);

       while(downTo > limit) {

         if (isHighBMPChar(prevTerm.bytes, downTo)) {

           if (DEBUG_SURROGATES) {
             System.out.println("    found E pos=" + downTo + " vs len=" + prevTerm.length);
           }

           // Trial seek with the spare enum; termEnum only
           // moves if a matching S term exists:
           if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) {
             // TODO: more efficient seek?
             getTermsDict().seekEnum(termEnum, seekTermEnum.term());
             //newSuffixStart = downTo+4;
             newSuffixStart = downTo;
             scratchTerm.copy(termEnum.term().bytes());
             didSeek = true;
             if (DEBUG_SURROGATES) {
               System.out.println("      seek!");
             }
             break;
           } else {
             if (DEBUG_SURROGATES) {
               System.out.println("      no seek");
             }
           }
         }

         // Shorten prevTerm in place so that we don't redo
         // this loop if we come back here (only truncate when
         // the byte is not a UTF8 continuation byte 10xxxxxx):
         if ((prevTerm.bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.bytes[downTo] & 0x80) == 0) {
           prevTerm.length = downTo;
         }

         downTo--;
       }

       return didSeek;
     }

     // Look for seek type 3 ("pop"): if the delta from
     // prev -> current was replacing an S with an E,
     // we must now seek to beyond that E.  This seek
     // "finishes" the dance at this character
     // position.
     //
     // Returns true if termEnum was moved (or the position
     // state was reset because the seek ran off the field), so
     // that the caller should loop again; false if no pop
     // applied.
     private boolean doPop() throws IOException {

       if (DEBUG_SURROGATES) {
         System.out.println("  try pop");
       }

       assert newSuffixStart <= prevTerm.length;
       assert newSuffixStart < scratchTerm.length || newSuffixStart == 0;

       if (prevTerm.length > newSuffixStart &&
           isNonBMPChar(prevTerm.bytes, newSuffixStart) &&
           isHighBMPChar(scratchTerm.bytes, newSuffixStart)) {

         // Seek type 3 ("pop") -- put 0xFF at this position,
         // which is larger than any lead byte tested for above:
         scratchTerm.bytes[newSuffixStart] = (byte) 0xff;
         scratchTerm.length = newSuffixStart+1;

         if (DEBUG_SURROGATES) {
           System.out.println("    seek to term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString());
         }

         // TODO: more efficient seek?  can we simply swap
         // the enums?
         getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm));

         final Term t2 = termEnum.term();

         // We could hit EOF or different field since this
         // was a seek "forward":
         if (t2 != null && t2.field() == fieldInfo.name) {

           if (DEBUG_SURROGATES) {
             System.out.println("      got term=" + UnicodeUtil.toHexString(t2.text()) + " " + t2.bytes());
           }

           final BytesRef b2 = t2.bytes();
           assert b2.offset == 0;


           // Set newSuffixStart -- we can't use
           // termEnum's since the above seek may have
           // done no scanning (eg, term was precisely
           // an index term, or, was in the term seek
           // cache):
           scratchTerm.copy(b2);
           setNewSuffixStart(prevTerm, scratchTerm);

           return true;
         } else if (newSuffixStart != 0 || scratchTerm.length != 0) {
           // Ran off the end of the field: clear the position
           // state so the caller's next doContinue() scans all
           // of prevTerm.
           if (DEBUG_SURROGATES) {
             System.out.println("      got term=null (or next field)");
           }
           newSuffixStart = 0;
           scratchTerm.length = 0;
           return true;
         }
       }

       return false;
     }

     // Pre-flex indices store terms in UTF16 sort order, but
     // certain queries require Unicode codepoint order; this
     // method carefully seeks around surrogates to handle
     // this impedance mismatch.
     //
     // Inputs are prevTerm, newSuffixStart and the current
     // position of termEnum; on return termEnum is positioned
     // on the term that comes next in unicode order (or on
     // null / another field if there is none).  Does nothing
     // when unicodeSortOrder is false.

     private void surrogateDance() throws IOException {

       if (!unicodeSortOrder) {
         return;
       }

       // We are invoked after TIS.next() (by UTF16 order) to
       // possibly seek to a different "next" (by unicode
       // order) term.

       // We scan only the "delta" from the last term to the
       // current term, in UTF8 bytes.  We look at 1) the bytes
       // stripped from the prior term, and then 2) the bytes
       // appended to that prior term's prefix.

       // We don't care about specific UTF8 sequences, just
       // the "category" of the UTF16 character.  Category S
       // is a high/low surrogate pair (ie non-BMP).
       // Category E is any BMP char > UNI_SUR_LOW_END (and <
       // U+FFFF). Category A is the rest (any unicode char
       // <= UNI_SUR_HIGH_START).

       // The core issue is that pre-flex indices sort the
       // characters as ASE, while flex must sort as AES.  So
       // when scanning, when we hit S, we must 1) seek
       // forward to E and enum the terms there, then 2) seek
       // back to S and enum all terms there, then 3) seek to
       // after E.  Three different seek points (1, 2, 3).

       // We can easily detect S in UTF8: if a byte has
       // prefix 11110 (0xf0), then that byte and the
       // following 3 bytes encode a single unicode codepoint
       // in S.  Similarly, we can detect E: if a byte has
       // prefix 1110111 (0xee), then that byte and the
       // following 2 bytes encode a single unicode codepoint
       // in E.

       // Note that this is really a recursive process --
       // maybe the char at pos 2 needs to dance, but at any
       // point in its dance, suddenly pos 4 needs to dance
       // so you must finish pos 4 before returning to pos
       // 2.  But then during pos 4's dance maybe pos 7 needs
       // to dance, etc.  However, despite being recursive,
       // we don't need to hold any state because the state
       // can always be derived by looking at prior term &
       // current term.

       // TODO: can we avoid this copy?
       // An empty scratchTerm stands for "no current term in
       // this field":
       if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
         scratchTerm.length = 0;
       } else {
         scratchTerm.copy(termEnum.term().bytes());
       }

       if (DEBUG_SURROGATES) {
         System.out.println("  dance");
         System.out.println("    prev=" + UnicodeUtil.toHexString(prevTerm.utf8ToString()));
         System.out.println("         " + prevTerm.toString());
         System.out.println("    term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()));
         System.out.println("         " + scratchTerm.toString());
       }

       // This code assumes TermInfosReader/SegmentTermEnum
       // always use BytesRef.offset == 0
       assert prevTerm.offset == 0;
       assert scratchTerm.offset == 0;

       // Need to loop here because we may need to do multiple
       // pops, and possibly a continue in the end, ie:
       //
       //  cont
       //  pop, cont
       //  pop, pop, cont
       //  <nothing>
       //

       while(true) {
         if (doContinue()) {
           break;
         } else {
           if (!doPop()) {
             break;
           }
         }
       }

       if (DEBUG_SURROGATES) {
         System.out.println("  finish bmp ends");
       }

       // Finally look for S chars in the new suffix that must
       // first be visited as E (seek type 1):
       doPushes();
     }


     // Look for seek type 1 ("push"): if the newly added
     // suffix contains any S, we must try to seek to the
     // corresponding E.  If we find a match, we go there;
     // else we keep looking for additional S's in the new
     // suffix.  This "starts" the dance, at this character
     // position.
     //
     // Scans scratchTerm from newSuffixStart; on a match both
     // termEnum and scratchTerm are moved to the found term and
     // the scan continues inside it.
     private void doPushes() throws IOException {

       int upTo = newSuffixStart;
       if (DEBUG_SURROGATES) {
         System.out.println("  try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.length);
       }

       while(upTo < scratchTerm.length) {
         // Push only for an S that is new at this position: either
         // it lies beyond the first differing byte, or prevTerm had
         // neither an S nor an E here (or was shorter).
         if (isNonBMPChar(scratchTerm.bytes, upTo) &&
             (upTo > newSuffixStart ||
              (upTo >= prevTerm.length ||
               (!isNonBMPChar(prevTerm.bytes, upTo) &&
                !isHighBMPChar(prevTerm.bytes, upTo))))) {

           // A non-BMP char (4 bytes UTF8) starts here:
           assert scratchTerm.length >= upTo + 4;

           // Temporarily replace the S with the 3 byte sequence
           // EE 80 80 to build the seek target:
           final int savLength = scratchTerm.length;
           scratch[0] = scratchTerm.bytes[upTo];
           scratch[1] = scratchTerm.bytes[upTo+1];
           scratch[2] = scratchTerm.bytes[upTo+2];

           scratchTerm.bytes[upTo] = UTF8_HIGH_BMP_LEAD;
           scratchTerm.bytes[upTo+1] = (byte) 0x80;
           scratchTerm.bytes[upTo+2] = (byte) 0x80;
           scratchTerm.length = upTo+3;

           if (DEBUG_SURROGATES) {
             System.out.println("    try seek 1 pos=" + upTo + " term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString() + " len=" + scratchTerm.length);
           }

           // Seek "forward":
           // TODO: more efficient seek?
           getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm));

           // Undo the temporary edit:
           scratchTerm.bytes[upTo] = scratch[0];
           scratchTerm.bytes[upTo+1] = scratch[1];
           scratchTerm.bytes[upTo+2] = scratch[2];
           scratchTerm.length = savLength;

           // Did we find a match?
           final Term t2 = seekTermEnum.term();

           if (DEBUG_SURROGATES) {
             if (t2 == null) {
               System.out.println("      hit term=null");
             } else {
               System.out.println("      hit term=" + UnicodeUtil.toHexString(t2.text()) + " " + (t2==null? null:t2.bytes()));
             }
           }

           // Since this was a seek "forward", we could hit
           // EOF or a different field:
           boolean matches;

           if (t2 != null && t2.field() == fieldInfo.name) {
             final BytesRef b2 = t2.bytes();
             assert b2.offset == 0;
             // Match = an E at the same position with an
             // identical prefix before it:
             if (b2.length >= upTo+3 && isHighBMPChar(b2.bytes, upTo)) {
               matches = true;
               for(int i=0;i<upTo;i++) {
                 if (scratchTerm.bytes[i] != b2.bytes[i]) {
                   matches = false;
                   break;
                 }
               }

             } else {
               matches = false;
             }
           } else {
             matches = false;
           }

           if (matches) {

             if (DEBUG_SURROGATES) {
               System.out.println("      matches!");
             }

             // OK seek "back"
             // TODO: more efficient seek?
             getTermsDict().seekEnum(termEnum, seekTermEnum.term());

             scratchTerm.copy(seekTermEnum.term().bytes());

             // +3 because we don't need to check the char
             // at upTo: we know it's > BMP
             upTo += 3;

             // NOTE: we keep iterating, now, since this
             // can easily "recurse".  Ie, after seeking
             // forward at a certain char position, we may
             // find another surrogate in our [new] suffix
             // and must then do another seek (recurse)
           } else {
             upTo++;
           }
         } else {
           upTo++;
         }
       }
     }

     // Snapshot of sortTermsByUnicode() taken in reset(); when false
     // surrogateDance() is a no-op and terms stay in UTF16 order.
     private boolean unicodeSortOrder;

     // Re-targets this enum at the first term of the given
     // field.  The two underlying enums are created on first
     // use; afterwards termEnum is simply re-seeked.  The
     // first next() after a reset returns the term we are
     // already positioned on (skipNext).
     void reset(FieldInfo fieldInfo) throws IOException {
       this.fieldInfo = fieldInfo;
       protoTerm = new Term(fieldInfo.name);

       if (termEnum != null) {
         getTermsDict().seekEnum(termEnum, protoTerm);
       } else {
         termEnum = getTermsDict().terms(protoTerm);
         seekTermEnum = getTermsDict().terms(protoTerm);
       }

       skipNext = true;
       unicodeSortOrder = sortTermsByUnicode();

       // Field names are compared by identity, as everywhere
       // else in this class.
       final Term first = termEnum.term();
       final boolean inField = first != null && first.field() == fieldInfo.name;
       if (inField) {
         // Dance from an empty previous term so the very
         // first term is already in unicode order.
         newSuffixStart = 0;
         prevTerm.length = 0;
         surrogateDance();
       }
     }

     // The index itself is always in UTF16 order; when
     // unicodeSortOrder is set we remap on-the-fly and
     // therefore report the unicode comparator.
     @Override
     public Comparator<BytesRef> getComparator() {
       return unicodeSortOrder
         ? BytesRef.getUTF8SortedAsUnicodeComparator()
         : BytesRef.getUTF8SortedAsUTF16Comparator();
     }

     // Seeking by ordinal is not supported by this enum; always throws.
     @Override
     public SeekStatus seek(long ord) throws IOException {
       throw new UnsupportedOperationException();
     }

     // Term ordinals are not supported by this enum; always throws.
     @Override
     public long ord() throws IOException {
       throw new UnsupportedOperationException();
     }

     // Positions the enum on the given term of the current field.
     //
     // Returns FOUND when the dictionary contains exactly that
     // term, NOT_FOUND when we end up on some other term of this
     // field, and END when no term of this field is left.  The
     // useCache argument is not used by this implementation.
     @Override
     public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
       if (DEBUG_SURROGATES) {
         System.out.println("TE.seek target=" + UnicodeUtil.toHexString(term.utf8ToString()));
       }
       skipNext = false;
       final TermInfosReader tis = getTermsDict();
       final Term t0 = protoTerm.createTerm(term);

       assert termEnum != null;

       tis.seekEnum(termEnum, t0);

       final Term t = termEnum.term();

       if (t != null && t.field() == fieldInfo.name && term.bytesEquals(t.bytes())) {
         // If we found an exact match, no need to do the
         // surrogate dance
         if (DEBUG_SURROGATES) {
           System.out.println("  seek exact match");
         }
         current = t.bytes();
         return SeekStatus.FOUND;
       } else if (t == null || t.field() != fieldInfo.name) {

         // TODO: maybe we can handle this like the next()
         // into null?  set term as prevTerm then dance?

         if (DEBUG_SURROGATES) {
           System.out.println("  seek hit EOF");
         }

         // We hit EOF; try end-case surrogate dance: if we
         // find an E, try swapping in S, backwards:
         scratchTerm.copy(term);

         assert scratchTerm.offset == 0;

         for(int i=scratchTerm.length-1;i>=0;i--) {
           if (isHighBMPChar(scratchTerm.bytes, i)) {
             if (DEBUG_SURROGATES) {
               System.out.println("    found E pos=" + i + "; try seek");
             }

             if (seekToNonBMP(seekTermEnum, scratchTerm, i)) {

               // Move the real enum to the S term that the
               // trial seek found:
               scratchTerm.copy(seekTermEnum.term().bytes());
               getTermsDict().seekEnum(termEnum, seekTermEnum.term());

               newSuffixStart = 1+i;

               doPushes();

               // Found a match
               // TODO: faster seek?
               current = termEnum.term().bytes();
               return SeekStatus.NOT_FOUND;
             }
           }
         }

         if (DEBUG_SURROGATES) {
           System.out.println("  seek END");
         }

         current = null;
         return SeekStatus.END;
       } else {

         // We found a non-exact but non-null term; this one
         // is fun -- just treat it like next, by pretending
         // requested term was prev:
         prevTerm.copy(term);

         if (DEBUG_SURROGATES) {
           System.out.println("  seek hit non-exact term=" + UnicodeUtil.toHexString(t.text()));
         }

         final BytesRef br = t.bytes();
         assert br.offset == 0;

         setNewSuffixStart(term, br);

         surrogateDance();

         // The dance may have moved termEnum, so re-read its
         // position:
         final Term t2 = termEnum.term();
         if (t2 == null || t2.field() != fieldInfo.name) {
           assert t2 == null || !t2.field().equals(fieldInfo.name); // make sure fields are in fact interned
           current = null;
           return SeekStatus.END;
         } else {
           current = t2.bytes();
           assert !unicodeSortOrder || term.compareTo(current) < 0 : "term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " vs current=" + UnicodeUtil.toHexString(current.utf8ToString());
           return SeekStatus.NOT_FOUND;
         }
       }
     }

     // Sets newSuffixStart to the index (relative to the
     // offsets) of the first byte of the first character at
     // which br1 and br2 differ; character starts are taken
     // from br1.  If one is a prefix of the other, the result
     // is the length of the shorter one.
     private void setNewSuffixStart(BytesRef br1, BytesRef br2) {
       final int common = Math.min(br1.length, br2.length);
       int charStart = 0;
       int result = common;

       for (int i = 0; i < common; i++) {
         final byte b1 = br1.bytes[br1.offset + i];
         // Anything that is not a continuation byte (10xxxxxx)
         // begins a new character:
         final boolean beginsChar = (b1 & 0x80) == 0 || (b1 & 0xc0) == 0xc0;
         if (beginsChar) {
           charStart = i;
         }
         if (b1 != br2.bytes[br2.offset + i]) {
           result = charStart;
           break;
         }
       }

       newSuffixStart = result;
       if (DEBUG_SURROGATES) {
         System.out.println("    set newSuffixStart=" + newSuffixStart);
       }
     }

     // Advances to the next term of the current field (in
     // unicode order when unicodeSortOrder is set) and returns
     // its bytes, or null when the field has no more terms.
     //
     // Right after reset() the enum is already positioned on
     // the first term, so the first call only reports it.
     //
     // Whenever null is returned, current is cleared as well,
     // so term() never hands out a stale term.  Calling next()
     // again after the field is exhausted returns null instead
     // of failing on the missing previous term.
     @Override
     public BytesRef next() throws IOException {
       if (DEBUG_SURROGATES) {
         System.out.println("TE.next()");
       }
       if (skipNext) {
         if (DEBUG_SURROGATES) {
           System.out.println("  skipNext=true");
         }
         skipNext = false;
         final Term first = termEnum.term();
         // Field names are compared by identity, as
         // everywhere else in this class.
         if (first == null || first.field() != fieldInfo.name) {
           current = null;
         } else {
           current = first.bytes();
         }
         return current;
       }

       final Term prev = termEnum.term();
       if (prev == null || prev.field() != fieldInfo.name) {
         // Not positioned on a term of this field (already
         // exhausted, or a seek ended in END): there is no
         // previous term to dance from.
         current = null;
         return null;
       }

       // TODO: can we use STE's prevBuffer here?
       prevTerm.copy(prev.bytes());

       if (termEnum.next() && termEnum.term().field() == fieldInfo.name) {
         newSuffixStart = termEnum.newSuffixStart;
         if (DEBUG_SURROGATES) {
           System.out.println("  newSuffixStart=" + newSuffixStart);
         }
       } else {
         // This field is exhausted, but we have to give
         // surrogateDance a chance to seek back:
         if (DEBUG_SURROGATES) {
           System.out.println("  force cont");
         }
         //newSuffixStart = prevTerm.length;
         newSuffixStart = 0;
       }

       surrogateDance();

       // The dance may have moved termEnum, so re-read its
       // position:
       final Term t = termEnum.term();
       if (t == null || t.field() != fieldInfo.name) {
         assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
         current = null;
       } else {
         current = t.bytes();
       }
       return current;
     }

     // Returns the bytes last stored by next() or seek(); this is the
     // field value as-is, no copy is made.
     @Override
     public BytesRef term() {
       return current;
     }

     // Delegates to the underlying SegmentTermEnum's docFreq().
     @Override
     public int docFreq() {
       return termEnum.docFreq();
     }

     // Returns a docs enum for the current term.  The reuse
     // instance is recycled only if it is a PreDocsEnum that
     // reads from this segment's freq stream; otherwise a new
     // one is created.
     @Override
     public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
       PreDocsEnum docsEnum = null;
       if (reuse instanceof PreDocsEnum) {
         final PreDocsEnum candidate = (PreDocsEnum) reuse;
         if (candidate.getFreqStream() == freqStream) {
           docsEnum = candidate;
         }
       }
       if (docsEnum == null) {
         docsEnum = new PreDocsEnum();
       }
       return docsEnum.reset(termEnum, skipDocs);
     }

     // Returns a positions enum for the current term, or null
     // when the field omits term freqs and positions.  The
     // reuse instance is recycled only if it is a
     // PreDocsAndPositionsEnum that reads from this segment's
     // freq stream.
     @Override
     public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
       if (fieldInfo.omitTermFreqAndPositions) {
         return null;
       }

       PreDocsAndPositionsEnum docsPosEnum = null;
       if (reuse instanceof PreDocsAndPositionsEnum) {
         final PreDocsAndPositionsEnum candidate = (PreDocsAndPositionsEnum) reuse;
         if (candidate.getFreqStream() == freqStream) {
           docsPosEnum = candidate;
         }
       }
       if (docsPosEnum == null) {
         docsPosEnum = new PreDocsAndPositionsEnum();
       }
       return docsPosEnum.reset(termEnum, skipDocs);
     }
   }

   /** {@link DocsEnum} that delegates to a pre-flex {@code SegmentTermDocs}. */
   private final class PreDocsEnum extends DocsEnum {
     final private SegmentTermDocs docs;

     PreDocsEnum() throws IOException {
       docs = new SegmentTermDocs(freqStream, getTermsDict(), fieldInfos);
     }

     /** The freq stream of the enclosing instance; used to validate reuse. */
     IndexInput getFreqStream() {
       return freqStream;
     }

     /** Points this enum at the term {@code termEnum} is positioned on. */
     public PreDocsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException {
       docs.setSkipDocs(skipDocs);
       docs.seek(termEnum);
       return this;
     }

     @Override
     public int nextDoc() throws IOException {
       return docs.next() ? docs.doc() : NO_MORE_DOCS;
     }

     @Override
     public int advance(int target) throws IOException {
       return docs.skipTo(target) ? docs.doc() : NO_MORE_DOCS;
     }

     @Override
     public int freq() {
       return docs.freq();
     }

     @Override
     public int docID() {
       return docs.doc();
     }

     /** Bulk-reads docs and freqs into the (lazily created) 32-slot buffers. */
     @Override
     public int read() throws IOException {
       if (bulkResult == null) {
         initBulkResult();
         bulkResult.docs.ints = new int[32];
         bulkResult.freqs.ints = new int[32];
       }
       final int[] docBuffer = bulkResult.docs.ints;
       final int[] freqBuffer = bulkResult.freqs.ints;
       return this.docs.read(docBuffer, freqBuffer);
     }
   }

   /**
    * {@link DocsAndPositionsEnum} that delegates to a pre-flex
    * {@code SegmentTermPositions}.
    */
   private final class PreDocsAndPositionsEnum extends DocsAndPositionsEnum {
     final private SegmentTermPositions pos;

     // Reused across getPayload() calls; created on first use.
     private BytesRef payload;

     PreDocsAndPositionsEnum() throws IOException {
       pos = new SegmentTermPositions(freqStream, proxStream, getTermsDict(), fieldInfos);
     }

     /** The freq stream of the enclosing instance; used to validate reuse. */
     IndexInput getFreqStream() {
       return freqStream;
     }

     /** Points this enum at the term {@code termEnum} is positioned on. */
     public DocsAndPositionsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException {
       pos.setSkipDocs(skipDocs);
       pos.seek(termEnum);
       return this;
     }

     @Override
     public int nextDoc() throws IOException {
       return pos.next() ? pos.doc() : NO_MORE_DOCS;
     }

     @Override
     public int advance(int target) throws IOException {
       return pos.skipTo(target) ? pos.doc() : NO_MORE_DOCS;
     }

     @Override
     public int freq() {
       return pos.freq();
     }

     @Override
     public int docID() {
       return pos.doc();
     }

     @Override
     public int nextPosition() throws IOException {
       return pos.nextPosition();
     }

     @Override
     public boolean hasPayload() {
       return pos.isPayloadAvailable();
     }

     /**
      * Reads the current payload into the shared {@code payload} instance,
      * making sure its buffer holds at least the payload length first.
      */
     @Override
     public BytesRef getPayload() throws IOException {
       final int len = pos.getPayloadLength();
       if (payload == null) {
         payload = new BytesRef();
         payload.bytes = new byte[len];
       } else if (payload.bytes.length < len) {
         payload.grow(len);
       }

       payload.bytes = pos.getPayload(payload.bytes, 0);
       payload.length = len;
       return payload;
     }
   }
 }