package org.apache.lucene.index.codecs.preflex;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Comparator;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.CompoundFileReader;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;
/** Exposes flex API on a pre-flex index, as a codec.
* @lucene.experimental */
public class PreFlexFields extends FieldsProducer {
private static final boolean DEBUG_SURROGATES = false;
public TermInfosReader tis;
public final TermInfosReader tisNoIndex;
public final IndexInput freqStream;
public final IndexInput proxStream;
final private FieldInfos fieldInfos;
private final SegmentInfo si;
final TreeMap<String,FieldInfo> fields = new TreeMap<String,FieldInfo>();
private final Directory dir;
private final int readBufferSize;
private Directory cfsReader;
public PreFlexFields(Directory dir, FieldInfos fieldInfos, SegmentInfo info, int readBufferSize, int indexDivisor)
throws IOException {
si = info;
// NOTE: we must always load terms index, even for
// "sequential" scan during merging, because what is
// sequential to merger may not be to TermInfosReader
// since we do the surrogates dance:
if (indexDivisor < 0) {
indexDivisor = -indexDivisor;
}
TermInfosReader r = new TermInfosReader(dir, info.name, fieldInfos, readBufferSize, indexDivisor);
if (indexDivisor == -1) {
tisNoIndex = r;
} else {
tisNoIndex = null;
tis = r;
}
this.readBufferSize = readBufferSize;
this.fieldInfos = fieldInfos;
// make sure that all index files have been read or are kept open
// so that if an index update removes them we'll still have them
freqStream = dir.openInput(info.name + ".frq", readBufferSize);
boolean anyProx = false;
final int numFields = fieldInfos.size();
for(int i=0;i<numFields;i++) {
final FieldInfo fieldInfo = fieldInfos.fieldInfo(i);
if (fieldInfo.isIndexed) {
fields.put(fieldInfo.name, fieldInfo);
if (!fieldInfo.omitTermFreqAndPositions) {
anyProx = true;
}
}
}
if (anyProx) {
proxStream = dir.openInput(info.name + ".prx", readBufferSize);
} else {
proxStream = null;
}
this.dir = dir;
}
// If this returns true, we do the surrogates dance so that the
// terms are sorted by unicode sort order. This should be
// true when segments are used for "normal" searching;
// it's only false during testing, to create a pre-flex
// index, using the test-only PreFlexRW.
protected boolean sortTermsByUnicode() {
return true;
}
static void files(Directory dir, SegmentInfo info, Collection<String> files) throws IOException {
files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_EXTENSION));
files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.TERMS_INDEX_EXTENSION));
files.add(IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.FREQ_EXTENSION));
if (info.getHasProx()) {
// LUCENE-1739: for certain versions of 2.9-dev,
// hasProx would be incorrectly computed during
// indexing as true, and then stored into the segments
// file, when it should have been false. So we do the
// extra check, here:
final String prx = IndexFileNames.segmentFileName(info.name, "", PreFlexCodec.PROX_EXTENSION);
if (dir.fileExists(prx)) {
files.add(prx);
}
}
}
@Override
public FieldsEnum iterator() throws IOException {
return new PreFlexFieldsEnum();
}
@Override
public Terms terms(String field) {
FieldInfo fi = fieldInfos.fieldInfo(field);
if (fi != null) {
return new PreTerms(fi);
} else {
return null;
}
}
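// A minimal usage sketch (illustrative only, not part of this class):
// enumerate every field and its terms through the flex API exposed
// above, assuming a hypothetical PreFlexFields variable "preFlexFields":
//
//   FieldsEnum fieldsEnum = preFlexFields.iterator();
//   String field;
//   while ((field = fieldsEnum.next()) != null) {
//     TermsEnum termsEnum = fieldsEnum.terms();
//     BytesRef term;
//     while ((term = termsEnum.next()) != null) {
//       // terms arrive in the order reported by getComparator()
//     }
//   }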
synchronized private TermInfosReader getTermsDict() {
if (tis != null) {
return tis;
} else {
return tisNoIndex;
}
}
@Override
synchronized public void loadTermsIndex(int indexDivisor) throws IOException {
if (tis == null) {
Directory dir0;
if (si.getUseCompoundFile()) {
// In some cases, we were originally opened when CFS
// was not used, but then, by the time we are asked to
// open the terms reader with index, the segment has
// switched to CFS
if (!(dir instanceof CompoundFileReader)) {
dir0 = cfsReader = new CompoundFileReader(dir, IndexFileNames.segmentFileName(si.name, "", IndexFileNames.COMPOUND_FILE_EXTENSION), readBufferSize);
} else {
dir0 = dir;
}
} else {
dir0 = dir;
}
tis = new TermInfosReader(dir0, si.name, fieldInfos, readBufferSize, indexDivisor);
}
}
@Override
public void close() throws IOException {
if (tis != null) {
tis.close();
}
if (tisNoIndex != null) {
tisNoIndex.close();
}
if (cfsReader != null) {
cfsReader.close();
}
if (freqStream != null) {
freqStream.close();
}
if (proxStream != null) {
proxStream.close();
}
}
private class PreFlexFieldsEnum extends FieldsEnum {
final Iterator<FieldInfo> it;
private final PreTermsEnum termsEnum;
FieldInfo current;
public PreFlexFieldsEnum() throws IOException {
it = fields.values().iterator();
termsEnum = new PreTermsEnum();
}
@Override
public String next() {
if (it.hasNext()) {
current = it.next();
return current.name;
} else {
return null;
}
}
@Override
public TermsEnum terms() throws IOException {
termsEnum.reset(current);
return termsEnum;
}
}
private class PreTerms extends Terms {
final FieldInfo fieldInfo;
PreTerms(FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
}
@Override
public TermsEnum iterator() throws IOException {
PreTermsEnum termsEnum = new PreTermsEnum();
termsEnum.reset(fieldInfo);
return termsEnum;
}
@Override
public Comparator<BytesRef> getComparator() {
// Pre-flex indexes are always sorted in UTF16 order, but
// we remap on-the-fly to unicode order
if (sortTermsByUnicode()) {
return BytesRef.getUTF8SortedAsUnicodeComparator();
} else {
return BytesRef.getUTF8SortedAsUTF16Comparator();
}
}
}
private class PreTermsEnum extends TermsEnum {
private SegmentTermEnum termEnum;
private FieldInfo fieldInfo;
private boolean skipNext;
private BytesRef current;
private SegmentTermEnum seekTermEnum;
private Term protoTerm;
private static final byte UTF8_NON_BMP_LEAD = (byte) 0xf0;
private static final byte UTF8_HIGH_BMP_LEAD = (byte) 0xee;
// Returns true if the unicode char is "after" the
// surrogates in UTF16, ie >= U+E000 and <= U+FFFF:
private final boolean isHighBMPChar(byte[] b, int idx) {
return (b[idx] & UTF8_HIGH_BMP_LEAD) == UTF8_HIGH_BMP_LEAD;
}
// Returns true if the unicode char in the UTF8 byte
// sequence starting at idx encodes a char outside of
// BMP (ie what would be a surrogate pair in UTF16):
private final boolean isNonBMPChar(byte[] b, int idx) {
return (b[idx] & UTF8_NON_BMP_LEAD) == UTF8_NON_BMP_LEAD;
}
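// Concrete lead-byte examples for the two tests above (UTF-8 facts,
// shown for illustration): U+E000 encodes as EE 80 80 and U+FFFF as
// EF BF BF; both lead bytes (0xee, 0xef) satisfy (b & 0xee) == 0xee,
// so isHighBMPChar is true at their lead byte. U+10400 encodes as
// F0 90 90 80, whose lead byte satisfies (b & 0xf0) == 0xf0, so
// isNonBMPChar is true there. Continuation bytes (10xxxxxx) and lead
// bytes below 0xee match neither mask.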
private final byte[] scratch = new byte[4];
private final BytesRef prevTerm = new BytesRef();
private final BytesRef scratchTerm = new BytesRef();
private int newSuffixStart;
// Swap in S, in place of E:
private boolean seekToNonBMP(SegmentTermEnum te, BytesRef term, int pos) throws IOException {
final int savLength = term.length;
assert term.offset == 0;
// The 3 bytes starting at pos make up 1
// unicode character:
assert isHighBMPChar(term.bytes, pos);
// NOTE: we cannot make this assert, because
// AutomatonQuery legitimately sends us malformed UTF8
// (eg the UTF8 bytes with just 0xee)
// assert term.length >= pos + 3: "term.length=" + term.length + " pos+3=" + (pos+3) + " byte=" + Integer.toHexString(term.bytes[pos]) + " term=" + term.toString();
// Save the bytes and length, since we need to
// restore them if the seek "back" finds no matching
// terms
if (term.bytes.length < 4+pos) {
term.grow(4+pos);
}
scratch[0] = term.bytes[pos];
scratch[1] = term.bytes[pos+1];
scratch[2] = term.bytes[pos+2];
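// Overwrite the E char with F0 90 80 80, the UTF-8 encoding of
// U+10000 (the smallest non-BMP codepoint), so the seek targets the
// first term that shares this prefix and has an S char at this
// position (the checks below verify we actually landed on one):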
term.bytes[pos] = (byte) 0xf0;
term.bytes[pos+1] = (byte) 0x90;
term.bytes[pos+2] = (byte) 0x80;
term.bytes[pos+3] = (byte) 0x80;
term.length = 4+pos;
if (DEBUG_SURROGATES) {
System.out.println(" try seek term=" + UnicodeUtil.toHexString(term.utf8ToString()));
}
// Seek "back":
getTermsDict().seekEnum(te, protoTerm.createTerm(term));
// Test whether the term we seeked to in fact has a
// surrogate pair (non-BMP char) at the same position as the E:
Term t2 = te.term();
// Cannot be null (or move to next field) because at
// "worst" it'd seek to the same term we are on now,
// unless we are being called from seek
if (t2 == null || t2.field() != fieldInfo.name) {
return false;
}
if (DEBUG_SURROGATES) {
System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()));
}
// Now test if prefix is identical and we found
// a non-BMP char at the same position:
BytesRef b2 = t2.bytes();
assert b2.offset == 0;
boolean matches;
if (b2.length >= term.length && isNonBMPChar(b2.bytes, pos)) {
matches = true;
for(int i=0;i<pos;i++) {
if (term.bytes[i] != b2.bytes[i]) {
matches = false;
break;
}
}
} else {
matches = false;
}
// Restore term:
term.length = savLength;
term.bytes[pos] = scratch[0];
term.bytes[pos+1] = scratch[1];
term.bytes[pos+2] = scratch[2];
return matches;
}
// Seek type 2 "continue" (back to the start of the
// surrogates): scan the stripped suffix from the
// prior term, backwards. If there was an E in that
// part, then we try to seek back to S. If that
// seek finds a matching term, we go there.
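// (Here, as in surrogateDance below, E means a BMP char >= U+E000
// and S means a non-BMP char, ie a UTF16 surrogate pair.)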
private boolean doContinue() throws IOException {
if (DEBUG_SURROGATES) {
System.out.println(" try cont");
}
int downTo = prevTerm.length-1;
boolean didSeek = false;
final int limit = Math.min(newSuffixStart, scratchTerm.length-1);
while(downTo > limit) {
if (isHighBMPChar(prevTerm.bytes, downTo)) {
if (DEBUG_SURROGATES) {
System.out.println(" found E pos=" + downTo + " vs len=" + prevTerm.length);
}
if (seekToNonBMP(seekTermEnum, prevTerm, downTo)) {
// TODO: more efficient seek?
getTermsDict().seekEnum(termEnum, seekTermEnum.term());
//newSuffixStart = downTo+4;
newSuffixStart = downTo;
scratchTerm.copy(termEnum.term().bytes());
didSeek = true;
if (DEBUG_SURROGATES) {
System.out.println(" seek!");
}
break;
} else {
if (DEBUG_SURROGATES) {
System.out.println(" no seek");
}
}
}
// Shorten prevTerm in place so that we don't redo
// this loop if we come back here:
if ((prevTerm.bytes[downTo] & 0xc0) == 0xc0 || (prevTerm.bytes[downTo] & 0x80) == 0) {
prevTerm.length = downTo;
}
downTo--;
}
return didSeek;
}
// Look for seek type 3 ("pop"): if the delta from
// prev -> current was replacing an S with an E,
// we must now seek to beyond that E. This seek
// "finishes" the dance at this character
// position.
private boolean doPop() throws IOException {
if (DEBUG_SURROGATES) {
System.out.println(" try pop");
}
assert newSuffixStart <= prevTerm.length;
assert newSuffixStart < scratchTerm.length || newSuffixStart == 0;
if (prevTerm.length > newSuffixStart &&
isNonBMPChar(prevTerm.bytes, newSuffixStart) &&
isHighBMPChar(scratchTerm.bytes, newSuffixStart)) {
// Seek type 3 -- put 0xFF at this position: since 0xFF is greater
// than any valid UTF8 byte, this seeks to just past every term
// sharing the prefix before this position:
scratchTerm.bytes[newSuffixStart] = (byte) 0xff;
scratchTerm.length = newSuffixStart+1;
if (DEBUG_SURROGATES) {
System.out.println(" seek to term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString());
}
// TODO: more efficient seek? can we simply swap
// the enums?
getTermsDict().seekEnum(termEnum, protoTerm.createTerm(scratchTerm));
final Term t2 = termEnum.term();
// We could hit EOF or different field since this
// was a seek "forward":
if (t2 != null && t2.field() == fieldInfo.name) {
if (DEBUG_SURROGATES) {
System.out.println(" got term=" + UnicodeUtil.toHexString(t2.text()) + " " + t2.bytes());
}
final BytesRef b2 = t2.bytes();
assert b2.offset == 0;
// Set newSuffixStart -- we can't use
// termEnum's since the above seek may have
// done no scanning (eg, term was precisely
// an index term, or was in the term seek
// cache):
scratchTerm.copy(b2);
setNewSuffixStart(prevTerm, scratchTerm);
return true;
} else if (newSuffixStart != 0 || scratchTerm.length != 0) {
if (DEBUG_SURROGATES) {
System.out.println(" got term=null (or next field)");
}
newSuffixStart = 0;
scratchTerm.length = 0;
return true;
}
}
return false;
}
// Pre-flex indices store terms in UTF16 sort order, but
// certain queries require Unicode codepoint order; this
// method carefully seeks around surrogates to handle
// this impedance mismatch
private void surrogateDance() throws IOException {
if (!unicodeSortOrder) {
return;
}
// We are invoked after TIS.next() (by UTF16 order) to
// possibly seek to a different "next" (by unicode
// order) term.
// We scan only the "delta" from the last term to the
// current term, in UTF8 bytes. We look at 1) the bytes
// stripped from the prior term, and then 2) the bytes
// appended to that prior term's prefix.
// We don't care about specific UTF8 sequences, just
// the "category" of the UTF16 character. Category S
// is a high/low surrogate pair (ie non-BMP).
// Category E is any BMP char > UNI_SUR_LOW_END (and <=
// U+FFFF). Category A is the rest (any unicode char
// <= UNI_SUR_HIGH_START).
// The core issue is that pre-flex indices sort the
// characters as ASE, while flex must sort as AES. So
// when scanning, when we hit S, we must 1) seek
// forward to E and enum the terms there, then 2) seek
// back to S and enum all terms there, then 3) seek to
// after E. Three different seek points (1, 2, 3).
// We can easily detect S in UTF8: if a byte has
// prefix 11110 (0xf0), then that byte and the
// following 3 bytes encode a single unicode codepoint
// in S. Similarly, we can detect E: if a byte has
// prefix 1110111 (0xee or 0xef), then that byte and the
// following 2 bytes encode a single unicode codepoint
// in E.
// Note that this is really a recursive process --
// maybe the char at pos 2 needs to dance, but at any
// point in its dance, suddenly pos 4 needs to dance,
// so you must finish pos 4 before returning to pos
// 2. But then during pos 4's dance maybe pos 7 needs
// to dance, etc. However, despite being recursive,
// we don't need to hold any state because the state
// can always be derived by looking at prior term &
// current term.
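// A concrete example of the ordering difference (illustrative): the
// E char U+FFFD is UTF16 FFFD but UTF8 EF BF BD, while the S char
// U+1D11E is the UTF16 surrogate pair D834 DD1E but UTF8 F0 9D 84 9E.
// In UTF16 (pre-flex) order D834 < FFFD, so the S term sorts first;
// in UTF8/codepoint (flex) order EF < F0, so the E term sorts first
// -- hence the ASE vs AES mismatch described above.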
// TODO: can we avoid this copy?
if (termEnum.term() == null || termEnum.term().field() != fieldInfo.name) {
scratchTerm.length = 0;
} else {
scratchTerm.copy(termEnum.term().bytes());
}
if (DEBUG_SURROGATES) {
System.out.println(" dance");
System.out.println(" prev=" + UnicodeUtil.toHexString(prevTerm.utf8ToString()));
System.out.println(" " + prevTerm.toString());
System.out.println(" term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()));
System.out.println(" " + scratchTerm.toString());
}
// This code assumes TermInfosReader/SegmentTermEnum
// always use BytesRef.offset == 0
assert prevTerm.offset == 0;
assert scratchTerm.offset == 0;
// Need to loop here because we may need to do multiple
// pops, and possibly a continue in the end, ie:
//
// cont
// pop, cont
// pop, pop, cont
// <nothing>
//
while(true) {
if (doContinue()) {
break;
} else {
if (!doPop()) {
break;
}
}
}
if (DEBUG_SURROGATES) {
System.out.println(" finish bmp ends");
}
doPushes();
}
// Look for seek type 1 ("push"): if the newly added
// suffix contains any S, we must try to seek to the
// corresponding E. If we find a match, we go there;
// else we keep looking for additional S's in the new
// suffix. This "starts" the dance, at this character
// position:
private void doPushes() throws IOException {
int upTo = newSuffixStart;
if (DEBUG_SURROGATES) {
System.out.println(" try push newSuffixStart=" + newSuffixStart + " scratchLen=" + scratchTerm.length);
}
while(upTo < scratchTerm.length) {
if (isNonBMPChar(scratchTerm.bytes, upTo) &&
(upTo > newSuffixStart ||
(upTo >= prevTerm.length ||
(!isNonBMPChar(prevTerm.bytes, upTo) &&
!isHighBMPChar(prevTerm.bytes, upTo))))) {
// A non-BMP char (4 bytes UTF8) starts here:
assert scratchTerm.length >= upTo + 4;
final int savLength = scratchTerm.length;
scratch[0] = scratchTerm.bytes[upTo];
scratch[1] = scratchTerm.bytes[upTo+1];
scratch[2] = scratchTerm.bytes[upTo+2];
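// Overwrite the S char's first 3 bytes with EE 80 80 (the UTF-8
// encoding of U+E000, the smallest E char) and truncate, so the
// seek "forward" below targets the first term with an E char at
// this position: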
scratchTerm.bytes[upTo] = UTF8_HIGH_BMP_LEAD;
scratchTerm.bytes[upTo+1] = (byte) 0x80;
scratchTerm.bytes[upTo+2] = (byte) 0x80;
scratchTerm.length = upTo+3;
if (DEBUG_SURROGATES) {
System.out.println(" try seek 1 pos=" + upTo + " term=" + UnicodeUtil.toHexString(scratchTerm.utf8ToString()) + " " + scratchTerm.toString() + " len=" + scratchTerm.length);
}
// Seek "forward":
// TODO: more efficient seek?
getTermsDict().seekEnum(seekTermEnum, protoTerm.createTerm(scratchTerm));
scratchTerm.bytes[upTo] = scratch[0];
scratchTerm.bytes[upTo+1] = scratch[1];
scratchTerm.bytes[upTo+2] = scratch[2];
scratchTerm.length = savLength;
// Did we find a match?
final Term t2 = seekTermEnum.term();
if (DEBUG_SURROGATES) {
if (t2 == null) {
System.out.println(" hit term=null");
} else {
System.out.println(" hit term=" + UnicodeUtil.toHexString(t2.text()) + " " + (t2==null? null:t2.bytes()));
}
}
// Since this was a seek "forward", we could hit
// EOF or a different field:
boolean matches;
if (t2 != null && t2.field() == fieldInfo.name) {
final BytesRef b2 = t2.bytes();
assert b2.offset == 0;
if (b2.length >= upTo+3 && isHighBMPChar(b2.bytes, upTo)) {
matches = true;
for(int i=0;i<upTo;i++) {
if (scratchTerm.bytes[i] != b2.bytes[i]) {
matches = false;
break;
}
}
} else {
matches = false;
}
} else {
matches = false;
}
if (matches) {
if (DEBUG_SURROGATES) {
System.out.println(" matches!");
}
// OK seek "back"
// TODO: more efficient seek?
getTermsDict().seekEnum(termEnum, seekTermEnum.term());
scratchTerm.copy(seekTermEnum.term().bytes());
// +3 because we don't need to check the char
// at upTo: we just seeked to an E (3 byte) char there
upTo += 3;
// NOTE: we keep iterating, now, since this
// can easily "recurse". Ie, after seeking
// forward at a certain char position, we may
// find another surrogate in our [new] suffix
// and must then do another seek (recurse)
} else {
upTo++;
}
} else {
upTo++;
}
}
}
private boolean unicodeSortOrder;
void reset(FieldInfo fieldInfo) throws IOException {
//System.out.println("pff.reset te=" + termEnum);
this.fieldInfo = fieldInfo;
protoTerm = new Term(fieldInfo.name);
if (termEnum == null) {
termEnum = getTermsDict().terms(protoTerm);
seekTermEnum = getTermsDict().terms(protoTerm);
//System.out.println(" term=" + termEnum.term());
} else {
getTermsDict().seekEnum(termEnum, protoTerm);
}
skipNext = true;
unicodeSortOrder = sortTermsByUnicode();
final Term t = termEnum.term();
if (t != null && t.field() == fieldInfo.name) {
newSuffixStart = 0;
prevTerm.length = 0;
surrogateDance();
}
}
@Override
public Comparator<BytesRef> getComparator() {
// Pre-flex indexes are always sorted in UTF16 order, but
// we remap on-the-fly to unicode order
if (unicodeSortOrder) {
return BytesRef.getUTF8SortedAsUnicodeComparator();
} else {
return BytesRef.getUTF8SortedAsUTF16Comparator();
}
}
@Override
public SeekStatus seek(long ord) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public long ord() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public SeekStatus seek(BytesRef term, boolean useCache) throws IOException {
if (DEBUG_SURROGATES) {
System.out.println("TE.seek target=" + UnicodeUtil.toHexString(term.utf8ToString()));
}
skipNext = false;
final TermInfosReader tis = getTermsDict();
final Term t0 = protoTerm.createTerm(term);
assert termEnum != null;
tis.seekEnum(termEnum, t0);
final Term t = termEnum.term();
if (t != null && t.field() == fieldInfo.name && term.bytesEquals(t.bytes())) {
// If we found an exact match, no need to do the
// surrogate dance
if (DEBUG_SURROGATES) {
System.out.println(" seek exact match");
}
current = t.bytes();
return SeekStatus.FOUND;
} else if (t == null || t.field() != fieldInfo.name) {
// TODO: maybe we can handle this like the next()
// into null? set term as prevTerm then dance?
if (DEBUG_SURROGATES) {
System.out.println(" seek hit EOF");
}
// We hit EOF; try end-case surrogate dance: if we
// find an E, try swapping in S, backwards:
scratchTerm.copy(term);
assert scratchTerm.offset == 0;
for(int i=scratchTerm.length-1;i>=0;i--) {
if (isHighBMPChar(scratchTerm.bytes, i)) {
if (DEBUG_SURROGATES) {
System.out.println(" found E pos=" + i + "; try seek");
}
if (seekToNonBMP(seekTermEnum, scratchTerm, i)) {
scratchTerm.copy(seekTermEnum.term().bytes());
getTermsDict().seekEnum(termEnum, seekTermEnum.term());
newSuffixStart = 1+i;
doPushes();
// Found a match
// TODO: faster seek?
current = termEnum.term().bytes();
return SeekStatus.NOT_FOUND;
}
}
}
if (DEBUG_SURROGATES) {
System.out.println(" seek END");
}
current = null;
return SeekStatus.END;
} else {
// We found a non-exact but non-null term; this one
// is fun -- just treat it like next, by pretending
// requested term was prev:
prevTerm.copy(term);
if (DEBUG_SURROGATES) {
System.out.println(" seek hit non-exact term=" + UnicodeUtil.toHexString(t.text()));
}
final BytesRef br = t.bytes();
assert br.offset == 0;
setNewSuffixStart(term, br);
surrogateDance();
final Term t2 = termEnum.term();
if (t2 == null || t2.field() != fieldInfo.name) {
assert t2 == null || !t2.field().equals(fieldInfo.name); // make sure fields are in fact interned
current = null;
return SeekStatus.END;
} else {
current = t2.bytes();
assert !unicodeSortOrder || term.compareTo(current) < 0 : "term=" + UnicodeUtil.toHexString(term.utf8ToString()) + " vs current=" + UnicodeUtil.toHexString(current.utf8ToString());
return SeekStatus.NOT_FOUND;
}
}
}
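// Sets newSuffixStart to the index of the byte that starts the char
// (UTF-8 lead byte or ASCII byte) containing the first difference
// between br1 and br2. Illustrative example: br1 = "a" + U+00E9 =
// 61 C3 A9 and br2 = "a" + U+00EA = 61 C3 AA first differ at byte 2,
// but that char starts at byte 1, so newSuffixStart becomes 1.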
private void setNewSuffixStart(BytesRef br1, BytesRef br2) {
final int limit = Math.min(br1.length, br2.length);
int lastStart = 0;
for(int i=0;i<limit;i++) {
if ((br1.bytes[br1.offset+i] & 0xc0) == 0xc0 || (br1.bytes[br1.offset+i] & 0x80) == 0) {
lastStart = i;
}
if (br1.bytes[br1.offset+i] != br2.bytes[br2.offset+i]) {
newSuffixStart = lastStart;
if (DEBUG_SURROGATES) {
System.out.println(" set newSuffixStart=" + newSuffixStart);
}
return;
}
}
newSuffixStart = limit;
if (DEBUG_SURROGATES) {
System.out.println(" set newSuffixStart=" + newSuffixStart);
}
}
@Override
public BytesRef next() throws IOException {
if (DEBUG_SURROGATES) {
System.out.println("TE.next()");
}
if (skipNext) {
if (DEBUG_SURROGATES) {
System.out.println(" skipNext=true");
}
skipNext = false;
if (termEnum.term() == null) {
return null;
} else if (termEnum.term().field() != fieldInfo.name) {
return null;
} else {
return current = termEnum.term().bytes();
}
}
// TODO: can we use STE's prevBuffer here?
prevTerm.copy(termEnum.term().bytes());
if (termEnum.next() && termEnum.term().field() == fieldInfo.name) {
newSuffixStart = termEnum.newSuffixStart;
if (DEBUG_SURROGATES) {
System.out.println(" newSuffixStart=" + newSuffixStart);
}
surrogateDance();
final Term t = termEnum.term();
if (t == null || t.field() != fieldInfo.name) {
assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
current = null;
} else {
current = t.bytes();
}
return current;
} else {
// This field is exhausted, but we have to give
// surrogateDance a chance to seek back:
if (DEBUG_SURROGATES) {
System.out.println(" force cont");
}
//newSuffixStart = prevTerm.length;
newSuffixStart = 0;
surrogateDance();
final Term t = termEnum.term();
if (t == null || t.field() != fieldInfo.name) {
assert t == null || !t.field().equals(fieldInfo.name); // make sure fields are in fact interned
return null;
} else {
current = t.bytes();
return current;
}
}
}
@Override
public BytesRef term() {
return current;
}
@Override
public int docFreq() {
return termEnum.docFreq();
}
@Override
public DocsEnum docs(Bits skipDocs, DocsEnum reuse) throws IOException {
PreDocsEnum docsEnum;
if (reuse == null || !(reuse instanceof PreDocsEnum)) {
docsEnum = new PreDocsEnum();
} else {
docsEnum = (PreDocsEnum) reuse;
if (docsEnum.getFreqStream() != freqStream) {
docsEnum = new PreDocsEnum();
}
}
return docsEnum.reset(termEnum, skipDocs);
}
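// Illustrative caller-side sketch (not part of this class): iterate
// the postings returned above, assuming a TermsEnum positioned on a
// term and a (possibly null) skipDocs Bits:
//
//   DocsEnum docsEnum = termsEnum.docs(skipDocs, null);
//   int doc;
//   while ((doc = docsEnum.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
//     // docsEnum.freq() returns the term's frequency in doc
//   }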
@Override
public DocsAndPositionsEnum docsAndPositions(Bits skipDocs, DocsAndPositionsEnum reuse) throws IOException {
PreDocsAndPositionsEnum docsPosEnum;
if (fieldInfo.omitTermFreqAndPositions) {
return null;
} else if (reuse == null || !(reuse instanceof PreDocsAndPositionsEnum)) {
docsPosEnum = new PreDocsAndPositionsEnum();
} else {
docsPosEnum = (PreDocsAndPositionsEnum) reuse;
if (docsPosEnum.getFreqStream() != freqStream) {
docsPosEnum = new PreDocsAndPositionsEnum();
}
}
return docsPosEnum.reset(termEnum, skipDocs);
}
}
private final class PreDocsEnum extends DocsEnum {
final private SegmentTermDocs docs;
PreDocsEnum() throws IOException {
docs = new SegmentTermDocs(freqStream, getTermsDict(), fieldInfos);
}
IndexInput getFreqStream() {
return freqStream;
}
public PreDocsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException {
docs.setSkipDocs(skipDocs);
docs.seek(termEnum);
return this;
}
@Override
public int nextDoc() throws IOException {
if (docs.next()) {
return docs.doc();
} else {
return NO_MORE_DOCS;
}
}
@Override
public int advance(int target) throws IOException {
if (docs.skipTo(target)) {
return docs.doc();
} else {
return NO_MORE_DOCS;
}
}
@Override
public int freq() {
return docs.freq();
}
@Override
public int docID() {
return docs.doc();
}
@Override
public int read() throws IOException {
if (bulkResult == null) {
initBulkResult();
bulkResult.docs.ints = new int[32];
bulkResult.freqs.ints = new int[32];
}
return this.docs.read(bulkResult.docs.ints, bulkResult.freqs.ints);
}
}
private final class PreDocsAndPositionsEnum extends DocsAndPositionsEnum {
final private SegmentTermPositions pos;
PreDocsAndPositionsEnum() throws IOException {
pos = new SegmentTermPositions(freqStream, proxStream, getTermsDict(), fieldInfos);
}
IndexInput getFreqStream() {
return freqStream;
}
public DocsAndPositionsEnum reset(SegmentTermEnum termEnum, Bits skipDocs) throws IOException {
pos.setSkipDocs(skipDocs);
pos.seek(termEnum);
return this;
}
@Override
public int nextDoc() throws IOException {
if (pos.next()) {
return pos.doc();
} else {
return NO_MORE_DOCS;
}
}
@Override
public int advance(int target) throws IOException {
if (pos.skipTo(target)) {
return pos.doc();
} else {
return NO_MORE_DOCS;
}
}
@Override
public int freq() {
return pos.freq();
}
@Override
public int docID() {
return pos.doc();
}
@Override
public int nextPosition() throws IOException {
return pos.nextPosition();
}
@Override
public boolean hasPayload() {
return pos.isPayloadAvailable();
}
private BytesRef payload;
@Override
public BytesRef getPayload() throws IOException {
final int len = pos.getPayloadLength();
if (payload == null) {
payload = new BytesRef();
payload.bytes = new byte[len];
} else {
if (payload.bytes.length < len) {
payload.grow(len);
}
}
payload.bytes = pos.getPayload(payload.bytes, 0);
payload.length = len;
return payload;
}
}
}