using Lucene.Net.Diagnostics;
using System;
using System.Collections.Generic;
using System.IO;
namespace Lucene.Net.Index
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using BytesRef = Lucene.Net.Util.BytesRef;
using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator;
using IBits = Lucene.Net.Util.IBits;
using PagedBytes = Lucene.Net.Util.PagedBytes;
using PostingsFormat = Lucene.Net.Codecs.PostingsFormat; // javadocs
using SeekStatus = Lucene.Net.Index.TermsEnum.SeekStatus;
using StringHelper = Lucene.Net.Util.StringHelper;
/// <summary>
/// This class enables fast access to multiple term ords for
/// a specified field across all docIDs.
/// <para/>
/// Like <see cref="Search.IFieldCache"/>, it uninverts the index and holds a
/// packed data structure in RAM to enable fast access.
/// Unlike <see cref="Search.IFieldCache"/>, it can handle multi-valued fields,
/// and it does not hold the term bytes in RAM. Rather, you
/// must obtain a <see cref="TermsEnum"/> from the <see cref="GetOrdTermsEnum"/>
/// method, and then seek-by-ord to get the term's bytes.
/// <para/>
/// While normally term ords are type <see cref="long"/>, in this API they are
/// <see cref="int"/> as the internal representation here cannot address
/// more than <see cref="BufferedUpdates.MAX_INT32"/> unique terms. Also, typically this
/// class is used on fields with relatively few unique terms
/// vs the number of documents. In addition, there is an
/// internal limit (16 MB) on how many bytes each chunk of
/// documents may consume. If you trip this limit you'll hit
/// an <see cref="InvalidOperationException"/>.
/// <para/>
/// Deleted documents are skipped during uninversion, and if
/// you look them up you'll get 0 ords.
/// <para/>
/// The returned per-document ords do not retain their
/// original order in the document. Instead they are returned
/// in sorted (by ord, ie term's <see cref="BytesRef"/> comparer) order. They
/// are also de-dup'd (ie if doc has same term more than once
/// in this field, you'll only get that ord back once).
/// <para/>
/// This class tests whether the provided reader is able to
/// retrieve terms by ord (ie, it's single segment, and it
/// uses an ord-capable terms index). If not, this class
/// will create its own term index internally, allowing it to
/// create a wrapped <see cref="TermsEnum"/> that can handle ords. The
/// <see cref="GetOrdTermsEnum"/> method then provides this
/// wrapped enum, if necessary.
/// <para/>
/// The RAM consumption of this class can be high!
/// <para/>
/// @lucene.experimental
/// </summary>
/// <remarks>
/// Final form of the un-inverted field:
/// <list type="bullet">
/// <item><description>Each document points to a list of term numbers that are contained in that document.</description></item>
/// <item><description>
/// Term numbers are in sorted order, and are encoded as variable-length deltas from the
/// previous term number. Real term numbers start at 2 since 0 and 1 are reserved. A
/// term number of 0 signals the end of the termNumber list.
/// </description></item>
/// <item><description>
/// There is a single int[maxDoc()] which either contains a pointer into a byte[] for
/// the termNumber lists, or directly contains the termNumber list if it fits in the 4
/// bytes of an integer. If the lowest byte of the integer is 1, the upper 3 bytes
/// are a pointer into a byte[] where the termNumber list starts.
/// </description></item>
/// <item><description>
/// There are actually 256 byte arrays, to compensate for the fact that the pointers
/// into the byte arrays are only 3 bytes long. The correct byte array for a document
/// is a function of its id.
/// </description></item>
/// <item><description>
/// To save space and speed up faceting, any term that matches enough documents will
/// not be un-inverted... it will be skipped while building the un-inverted field structure,
/// and will use a set intersection method during faceting.
/// </description></item>
/// <item><description>
/// To further save memory, the terms (the actual string values) are not all stored in
/// memory, but a TermIndex is used to convert term numbers to term values only
/// for the terms needed after faceting has completed. Only every 128th term value
/// is stored, along with its corresponding term number, and this is used as an
/// index to find the closest term and iterate until the desired number is hit (very
/// much like Lucene's own internal term index).
/// </description></item>
/// </list>
/// </remarks>
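/// <example>
/// A minimal usage sketch; the field name <c>"category"</c> and the
/// already-open <c>reader</c> are placeholders for illustration:
/// <code>
/// DocTermOrds dto = new DocTermOrds(reader, reader.LiveDocs, "category");
/// int uniqueTerms = dto.NumTerms; // number of uninverted terms in the field
/// TermsEnum te = dto.GetOrdTermsEnum(reader);
/// if (te != null) // null when the field has no terms
/// {
///     BytesRef firstTerm = dto.LookupTerm(te, 0); // seek-by-ord to ord 0
/// }
/// </code>
/// </example>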
public class DocTermOrds
{
/// <summary>
/// Term ords are shifted by this, internally, to reserve
/// values 0 (end term) and 1 (index is a pointer into byte array)
/// </summary>
private static readonly int TNUM_OFFSET = 2;
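// For example, a document containing term ords 3, 5 and 8 is stored as the
// shifted deltas 5 (3-0+2), 4 (5-3+2) and 5 (8-5+2), followed by a 0 terminator.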
/// <summary>
/// Every 128th term is indexed, by default. </summary>
public static readonly int DEFAULT_INDEX_INTERVAL_BITS = 7; // decrease to a low number like 2 for testing
private int indexIntervalBits;
private int indexIntervalMask;
private int indexInterval;
/// <summary>
/// Don't uninvert terms that exceed this count. </summary>
protected readonly int m_maxTermDocFreq;
/// <summary>
/// Field we are uninverting. </summary>
protected readonly string m_field;
/// <summary>
/// Number of terms in the field. </summary>
protected int m_numTermsInField;
/// <summary>
/// Total number of references to term numbers. </summary>
protected long m_termInstances;
private long memsz;
/// <summary>
/// Total time to uninvert the field. </summary>
protected int m_total_time;
/// <summary>
/// Time for phase1 of the uninvert process. </summary>
protected int m_phase1_time;
/// <summary>
/// Holds the per-document ords or a pointer to the ords. </summary>
protected int[] m_index;
/// <summary>
/// Holds term ords for documents. </summary>
[CLSCompliant(false)] // LUCENENET NOTE: Since jagged arrays are not CLS compliant anyway, this would need to be byte[,] to be CLS compliant - not sure it is worth the effort
protected sbyte[][] m_tnums = new sbyte[256][];
/// <summary>
/// Total bytes (sum of term lengths) for all indexed terms. </summary>
protected long m_sizeOfIndexedStrings;
/// <summary>
/// Holds the indexed (by default every 128th) terms. </summary>
protected BytesRef[] m_indexedTermsArray;
/// <summary>
/// If non-null, only terms matching this prefix were
/// indexed.
/// </summary>
protected BytesRef m_prefix;
/// <summary>
/// Ordinal of the first term in the field, or 0 if the
/// <see cref="PostingsFormat"/> does not implement
/// <see cref="TermsEnum.Ord"/>.
/// </summary>
protected int m_ordBase;
/// <summary>
/// Used while uninverting. </summary>
protected DocsEnum m_docsEnum;
/// <summary>
/// Returns total bytes used. </summary>
public virtual long RamUsedInBytes()
{
// can cache the mem size since it shouldn't change
if (memsz != 0)
{
return memsz;
}
long sz = 8 * 8 + 32; // local fields
if (m_index != null)
{
sz += m_index.Length * 4;
}
if (m_tnums != null)
{
for (int i = 0; i < m_tnums.Length; i++)
{
var arr = m_tnums[i];
if (arr != null)
sz += arr.Length;
}
}
memsz = sz;
return sz;
}
/// <summary>
/// Inverts all terms </summary>
public DocTermOrds(AtomicReader reader, IBits liveDocs, string field)
: this(reader, liveDocs, field, null, int.MaxValue)
{
}
/// <summary>
/// Inverts only terms starting w/ prefix </summary>
public DocTermOrds(AtomicReader reader, IBits liveDocs, string field, BytesRef termPrefix)
: this(reader, liveDocs, field, termPrefix, int.MaxValue)
{
}
/// <summary>
/// Inverts only terms starting w/ prefix, and only terms
/// whose docFreq (not taking deletions into account) is
/// &lt;= <paramref name="maxTermDocFreq"/>
/// </summary>
public DocTermOrds(AtomicReader reader, IBits liveDocs, string field, BytesRef termPrefix, int maxTermDocFreq)
: this(reader, liveDocs, field, termPrefix, maxTermDocFreq, DEFAULT_INDEX_INTERVAL_BITS)
{
}
/// <summary>
/// Inverts only terms starting w/ prefix, and only terms
/// whose docFreq (not taking deletions into account) is
/// &lt;= <paramref name="maxTermDocFreq"/>, with a custom indexing interval
/// (default is every 128th term).
/// </summary>
public DocTermOrds(AtomicReader reader, IBits liveDocs, string field, BytesRef termPrefix, int maxTermDocFreq, int indexIntervalBits)
: this(field, maxTermDocFreq, indexIntervalBits)
{
Uninvert(reader, liveDocs, termPrefix);
}
/// <summary>
/// Subclass inits with this, but be sure you then call
/// <see cref="Uninvert(AtomicReader, IBits, BytesRef)"/>, only once
/// </summary>
protected DocTermOrds(string field, int maxTermDocFreq, int indexIntervalBits)
{
//System.out.println("DTO init field=" + field + " maxTDFreq=" + maxTermDocFreq);
this.m_field = field;
this.m_maxTermDocFreq = maxTermDocFreq;
this.indexIntervalBits = indexIntervalBits;
indexIntervalMask = (int)((uint)0xffffffff >> (32 - indexIntervalBits)); // e.g. 0x7f for the default 7 bits
indexInterval = 1 << indexIntervalBits; // e.g. 128 for the default 7 bits
}
/// <summary>
/// Returns a <see cref="TermsEnum"/> that implements <see cref="TermsEnum.Ord"/>. If the
/// provided <paramref name="reader"/> supports <see cref="TermsEnum.Ord"/>, we just return its
/// <see cref="TermsEnum"/>; if it does not, we build a "private" terms
/// index internally (WARNING: consumes RAM) and use that
/// index to implement <see cref="TermsEnum.Ord"/>. This also enables <see cref="TermsEnum.Ord"/> on top
/// of a composite reader. The returned <see cref="TermsEnum"/> is
/// unpositioned. This returns <c>null</c> if there are no terms.
///
/// <para/><b>NOTE</b>: you must pass the same reader that was
/// used when creating this class
/// </summary>
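/// <example>
/// A sketch of ord-based lookup with the returned enum; <c>dto</c>,
/// <c>reader</c> and <c>ord</c> are placeholders:
/// <code>
/// TermsEnum te = dto.GetOrdTermsEnum(reader);
/// if (te != null)
/// {
///     te.SeekExact(ord);       // position the enum by ordinal
///     BytesRef term = te.Term; // term bytes for that ordinal
/// }
/// </code>
/// </example>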
public virtual TermsEnum GetOrdTermsEnum(AtomicReader reader)
{
if (m_indexedTermsArray == null)
{
//System.out.println("GET normal enum");
Fields fields = reader.Fields;
if (fields == null)
{
return null;
}
Terms terms = fields.GetTerms(m_field);
if (terms == null)
{
return null;
}
else
{
return terms.GetEnumerator();
}
}
else
{
//System.out.println("GET wrapped enum ordBase=" + ordBase);
return new OrdWrappedTermsEnum(this, reader);
}
}
/// <summary>
/// Returns the number of terms in this field
/// </summary>
public virtual int NumTerms => m_numTermsInField;
/// <summary>
/// Returns <c>true</c> if no terms were indexed.
/// </summary>
public virtual bool IsEmpty => m_index == null;
/// <summary>
/// Subclass can override this </summary>
protected virtual void VisitTerm(TermsEnum te, int termNum)
{
}
/// <summary>
/// Invoked during <see cref="Uninvert(AtomicReader, IBits, BytesRef)"/>
/// to record the document frequency for each uninverted
/// term.
/// </summary>
protected virtual void SetActualDocFreq(int termNum, int df)
{
}
/// <summary>
/// Call this only once (if you subclass!) </summary>
protected virtual void Uninvert(AtomicReader reader, IBits liveDocs, BytesRef termPrefix)
{
FieldInfo info = reader.FieldInfos.FieldInfo(m_field);
if (info != null && info.HasDocValues)
{
throw new InvalidOperationException("Type mismatch: " + m_field + " was indexed as " + info.DocValuesType);
}
//System.out.println("DTO uninvert field=" + field + " prefix=" + termPrefix);
long startTime = Environment.TickCount;
m_prefix = termPrefix == null ? null : BytesRef.DeepCopyOf(termPrefix);
int maxDoc = reader.MaxDoc;
int[] index = new int[maxDoc]; // immediate term numbers, or the index into the byte[] representing the last number
int[] lastTerm = new int[maxDoc]; // last term we saw for this document
var bytes = new sbyte[maxDoc][]; // list of term numbers for the doc (delta encoded vInts)
Fields fields = reader.Fields;
if (fields == null)
{
// No terms
return;
}
Terms terms = fields.GetTerms(m_field);
if (terms == null)
{
// No terms
return;
}
TermsEnum te = terms.GetEnumerator();
BytesRef seekStart = termPrefix ?? new BytesRef();
//System.out.println("seekStart=" + seekStart.utf8ToString());
if (te.SeekCeil(seekStart) == TermsEnum.SeekStatus.END)
{
// No terms match
return;
}
// If we need our "term index wrapper", these will be
// init'd below:
IList<BytesRef> indexedTerms = null;
PagedBytes indexedTermsBytes = null;
bool testedOrd = false;
// we need a minimum of 9 bytes, but round up to 12 since the space would
// be wasted with most allocators anyway.
var tempArr = new sbyte[12];
//
// enumerate all terms, and build an intermediate form of the un-inverted field.
//
// During this intermediate form, every document has a (potential) byte[]
// and the int[maxDoc()] array either contains the termNumber list directly
// or the *end* offset of the termNumber list in its byte array (for faster
// appending and faster creation of the final form).
//
// idea... if things are too large while building, we could do a range of docs
// at a time (but it would be a fair amount slower to build)
// could also do ranges in parallel to take advantage of multiple CPUs
// OPTIONAL: remap the largest df terms to the lowest 128 (single byte)
// values. this requires going over the field first to find the most
// frequent terms ahead of time.
int termNum = 0;
m_docsEnum = null;
// Loop begins with te positioned to first term (we call
// seek above):
for (; ; )
{
BytesRef t = te.Term;
if (t == null || (termPrefix != null && !StringHelper.StartsWith(t, termPrefix)))
{
break;
}
//System.out.println("visit term=" + t.utf8ToString() + " " + t + " termNum=" + termNum);
if (!testedOrd)
{
try
{
m_ordBase = (int)te.Ord;
//System.out.println("got ordBase=" + ordBase);
}
#pragma warning disable 168
catch (NotSupportedException uoe)
#pragma warning restore 168
{
// Reader cannot provide ord support, so we wrap
// our own support by creating our own terms index:
indexedTerms = new List<BytesRef>();
indexedTermsBytes = new PagedBytes(15);
//System.out.println("NO ORDS");
}
testedOrd = true;
}
VisitTerm(te, termNum);
if (indexedTerms != null && (termNum & indexIntervalMask) == 0)
{
// Index this term
m_sizeOfIndexedStrings += t.Length;
BytesRef indexedTerm = new BytesRef();
indexedTermsBytes.Copy(t, indexedTerm);
// TODO: really should 1) strip off useless suffix,
// and 2) use FST not array/PagedBytes
indexedTerms.Add(indexedTerm);
}
int df = te.DocFreq;
if (df <= m_maxTermDocFreq)
{
m_docsEnum = te.Docs(liveDocs, m_docsEnum, DocsFlags.NONE);
// dF, but takes deletions into account
int actualDF = 0;
for (; ; )
{
int doc = m_docsEnum.NextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS)
{
break;
}
//System.out.println(" chunk=" + chunk + " docs");
actualDF++;
m_termInstances++;
//System.out.println(" docID=" + doc);
// add TNUM_OFFSET to the term number to make room for special reserved values:
// 0 (end term) and 1 (index into byte array follows)
int delta = termNum - lastTerm[doc] + TNUM_OFFSET;
lastTerm[doc] = termNum;
int val = index[doc];
if ((val & 0xff) == 1)
{
// index into byte array (actually the end of
// the doc-specific byte[] when building)
int pos = (int)((uint)val >> 8);
int ilen = VInt32Size(delta);
var arr = bytes[doc];
int newend = pos + ilen;
if (newend > arr.Length)
{
// We avoid a doubling strategy to lower memory usage.
// this faceting method isn't for docs with many terms.
// In hotspot, objects have 2 words of overhead, then fields, rounded up to a 64-bit boundary.
// TODO: figure out what array lengths we can round up to w/o actually using more memory
// (how much space does a byte[] take up? Is data preceded by a 32 bit length only?
// It should be safe to round up to the nearest 32 bits in any case.
int newLen = (newend + 3) & unchecked((int)0xfffffffc); // 4 byte alignment
var newarr = new sbyte[newLen];
Array.Copy(arr, 0, newarr, 0, pos);
arr = newarr;
bytes[doc] = newarr;
}
pos = WriteInt32(delta, arr, pos);
index[doc] = (pos << 8) | 1; // update pointer to end index in byte[]
}
else
{
// OK, this int has data in it... find the end (a zero starting byte - not
// part of another number, hence not following a byte with the high bit set).
int ipos;
if (val == 0)
{
ipos = 0;
}
else if ((val & 0x0000ff80) == 0)
{
ipos = 1;
}
else if ((val & 0x00ff8000) == 0)
{
ipos = 2;
}
else if ((val & 0xff800000) == 0)
{
ipos = 3;
}
else
{
ipos = 4;
}
//System.out.println(" ipos=" + ipos);
int endPos = WriteInt32(delta, tempArr, ipos);
//System.out.println(" endpos=" + endPos);
if (endPos <= 4)
{
//System.out.println(" fits!");
// value will fit in the integer... move bytes back
for (int j = ipos; j < endPos; j++)
{
val |= (tempArr[j] & 0xff) << (j << 3);
}
index[doc] = val;
}
else
{
// value won't fit... move integer into byte[]
for (int j = 0; j < ipos; j++)
{
tempArr[j] = (sbyte)val;
val = (int)((uint)val >> 8);
}
// point at the end index in the byte[]
index[doc] = (endPos << 8) | 1;
bytes[doc] = tempArr;
tempArr = new sbyte[12];
}
}
}
SetActualDocFreq(termNum, actualDF);
}
termNum++;
if (!te.MoveNext())
{
break;
}
}
m_numTermsInField = termNum;
long midPoint = Environment.TickCount;
if (m_termInstances == 0)
{
// we didn't invert anything
// lower memory consumption.
m_tnums = null;
}
else
{
this.m_index = index;
//
// transform intermediate form into the final form, building a single byte[]
// at a time, and releasing the intermediate byte[]s as we go to avoid
// increasing the memory footprint.
//
for (int pass = 0; pass < 256; pass++)
{
var target = m_tnums[pass];
var pos = 0; // end position in target
if (target != null)
{
pos = target.Length;
}
else
{
target = new sbyte[4096];
}
// loop over documents, 0x00ppxxxx, 0x01ppxxxx, 0x02ppxxxx
// where pp is the pass (which array we are building), and xx is all values.
// each pass shares the same byte[] for termNumber lists.
for (int docbase = pass << 16; docbase < maxDoc; docbase += (1 << 24))
{
int lim = Math.Min(docbase + (1 << 16), maxDoc);
for (int doc = docbase; doc < lim; doc++)
{
//System.out.println(" pass=" + pass + " process docID=" + doc);
int val = index[doc];
if ((val & 0xff) == 1)
{
int len = (int)((uint)val >> 8);
//System.out.println(" ptr pos=" + pos);
index[doc] = (pos << 8) | 1; // change index to point to start of array
if ((pos & 0xff000000) != 0)
{
// we only have 24 bits for the array index
throw new InvalidOperationException("Too many values for UnInvertedField faceting on field " + m_field);
}
var arr = bytes[doc];
/*
for(byte b : arr) {
//System.out.println(" b=" + Integer.toHexString((int) b));
}
*/
bytes[doc] = null; // IMPORTANT: allow GC to avoid OOM
if (target.Length <= pos + len)
{
int newlen = target.Length;
// we don't have to worry about the array getting too large
// since the "pos" param will overflow first (only 24 bits available)
// if ((newlen<<1) <= 0) {
// // overflow...
// newlen = Integer.MAX_VALUE;
// if (newlen <= pos + len) {
// throw new SolrException(400,"Too many terms to uninvert field!");
// }
// } else {
// while (newlen <= pos + len) newlen<<=1; // doubling strategy
// }
//
while (newlen <= pos + len) // doubling strategy
{
newlen <<= 1;
}
var newtarget = new sbyte[newlen];
Array.Copy(target, 0, newtarget, 0, pos);
target = newtarget;
}
Array.Copy(arr, 0, target, pos, len);
pos += len + 1; // skip single byte at end and leave it 0 for terminator
}
}
}
// shrink array
if (pos < target.Length)
{
var newtarget = new sbyte[pos];
Array.Copy(target, 0, newtarget, 0, pos);
target = newtarget;
}
m_tnums[pass] = target;
if ((pass << 16) > maxDoc)
{
break;
}
}
}
if (indexedTerms != null)
{
m_indexedTermsArray = new BytesRef[indexedTerms.Count];
indexedTerms.CopyTo(m_indexedTermsArray, 0);
}
long endTime = Environment.TickCount;
m_total_time = (int)(endTime - startTime);
m_phase1_time = (int)(midPoint - startTime);
}
/// <summary>
/// Number of bytes to represent an unsigned int as a vint.
/// <para/>
/// NOTE: This was vIntSize() in Lucene
/// </summary>
private static int VInt32Size(int x)
{
if ((x & (0xffffffff << (7 * 1))) == 0)
{
return 1;
}
if ((x & (0xffffffff << (7 * 2))) == 0)
{
return 2;
}
if ((x & (0xffffffff << (7 * 3))) == 0)
{
return 3;
}
if ((x & (0xffffffff << (7 * 4))) == 0)
{
return 4;
}
return 5;
}
// todo: if we know the size of the vInt already, we could do
// a single switch on the size
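// Example encodings (matching WriteInt32 below): x == 1 emits the single
// byte 0x01, while x == 300 emits 0x82 0x2C: the high 7-bit group
// (300 >> 7 == 2) with the continuation bit set, then the low 7 bits
// (300 & 0x7f == 44). VInt32Size(300) is correspondingly 2.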
/// <summary>
/// NOTE: This was writeInt() in Lucene
/// </summary>
private static int WriteInt32(int x, sbyte[] arr, int pos)
{
var a = ((int)((uint)x >> (7 * 4)));
if (a != 0)
{
arr[pos++] = (sbyte)(a | 0x80);
}
a = ((int)((uint)x >> (7 * 3)));
if (a != 0)
{
arr[pos++] = (sbyte)(a | 0x80);
}
a = ((int)((uint)x >> (7 * 2)));
if (a != 0)
{
arr[pos++] = (sbyte)(a | 0x80);
}
a = ((int)((uint)x >> (7 * 1)));
if (a != 0)
{
arr[pos++] = (sbyte)(a | 0x80);
}
arr[pos++] = (sbyte)(x & 0x7f);
return pos;
}
/// <summary>
/// Only used if original <see cref="IndexReader"/> doesn't implement
/// <see cref="TermsEnum.Ord"/>; in this case we "wrap" our own terms index
/// around it.
/// </summary>
private sealed class OrdWrappedTermsEnum : TermsEnum
{
internal void InitializeInstanceFields()
{
ord = -outerInstance.indexInterval - 1;
}
private readonly DocTermOrds outerInstance;
internal readonly TermsEnum termsEnum;
internal BytesRef term;
internal long ord; // force "real" seek
public OrdWrappedTermsEnum(DocTermOrds outerInstance, AtomicReader reader)
{
this.outerInstance = outerInstance;
InitializeInstanceFields();
if (Debugging.AssertsEnabled) Debugging.Assert(outerInstance.m_indexedTermsArray != null);
termsEnum = reader.Fields.GetTerms(outerInstance.m_field).GetEnumerator();
}
public override IComparer<BytesRef> Comparer => termsEnum.Comparer;
public override DocsEnum Docs(IBits liveDocs, DocsEnum reuse, DocsFlags flags)
{
return termsEnum.Docs(liveDocs, reuse, flags);
}
public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags)
{
return termsEnum.DocsAndPositions(liveDocs, reuse, flags);
}
public override BytesRef Term => term;
public override bool MoveNext()
{
if (++ord < 0)
{
ord = 0;
}
if (!termsEnum.MoveNext())
{
term = null;
return false;
}
SetTerm(); // this is extra work if we know we are in bounds...
return true;
}
[Obsolete("Use MoveNext() and Term instead. This method will be removed in 4.8.0 release candidate."), System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
public override BytesRef Next()
{
if (MoveNext())
return term;
return null;
}
public override int DocFreq => termsEnum.DocFreq;
public override long TotalTermFreq => termsEnum.TotalTermFreq;
public override long Ord => outerInstance.m_ordBase + ord;
public override SeekStatus SeekCeil(BytesRef target)
{
// already here
if (term != null && term.Equals(target))
{
return SeekStatus.FOUND;
}
int startIdx = Array.BinarySearch(outerInstance.m_indexedTermsArray, target);
if (startIdx >= 0)
{
// we hit the term exactly... lucky us!
TermsEnum.SeekStatus seekStatus = termsEnum.SeekCeil(target);
if (Debugging.AssertsEnabled) Debugging.Assert(seekStatus == TermsEnum.SeekStatus.FOUND);
ord = startIdx << outerInstance.indexIntervalBits;
SetTerm();
if (Debugging.AssertsEnabled) Debugging.Assert(term != null);
return SeekStatus.FOUND;
}
// we didn't hit the term exactly
startIdx = -startIdx - 1;
if (startIdx == 0)
{
// our target occurs *before* the first term
TermsEnum.SeekStatus seekStatus = termsEnum.SeekCeil(target);
if (Debugging.AssertsEnabled) Debugging.Assert(seekStatus == TermsEnum.SeekStatus.NOT_FOUND);
ord = 0;
SetTerm();
if (Debugging.AssertsEnabled) Debugging.Assert(term != null);
return SeekStatus.NOT_FOUND;
}
// back up to the start of the block
startIdx--;
if ((ord >> outerInstance.indexIntervalBits) == startIdx && term != null && term.CompareTo(target) <= 0)
{
// we are already in the right block and the current term is before the term we want,
// so we don't need to seek.
}
else
{
// seek to the right block
TermsEnum.SeekStatus seekStatus = termsEnum.SeekCeil(outerInstance.m_indexedTermsArray[startIdx]);
if (Debugging.AssertsEnabled) Debugging.Assert(seekStatus == TermsEnum.SeekStatus.FOUND);
ord = startIdx << outerInstance.indexIntervalBits;
SetTerm();
if (Debugging.AssertsEnabled) Debugging.Assert(term != null); // should be non-null since it's in the index
}
while (term != null && term.CompareTo(target) < 0)
{
MoveNext();
}
if (term == null)
{
return SeekStatus.END;
}
else if (term.CompareTo(target) == 0)
{
return SeekStatus.FOUND;
}
else
{
return SeekStatus.NOT_FOUND;
}
}
public override void SeekExact(long targetOrd)
{
int delta = (int)(targetOrd - outerInstance.m_ordBase - ord);
//System.out.println(" seek(ord) targetOrd=" + targetOrd + " delta=" + delta + " ord=" + ord + " ii=" + indexInterval);
if (delta < 0 || delta > outerInstance.indexInterval)
{
int idx = (int)((long)((ulong)targetOrd >> outerInstance.indexIntervalBits));
BytesRef @base = outerInstance.m_indexedTermsArray[idx];
//System.out.println(" do seek term=" + base.utf8ToString());
ord = idx << outerInstance.indexIntervalBits;
delta = (int)(targetOrd - ord);
TermsEnum.SeekStatus seekStatus = termsEnum.SeekCeil(@base);
if (Debugging.AssertsEnabled) Debugging.Assert(seekStatus == TermsEnum.SeekStatus.FOUND);
}
else
{
//System.out.println("seek w/in block");
}
while (--delta >= 0)
{
if (!termsEnum.MoveNext())
{
if (Debugging.AssertsEnabled) Debugging.Assert(false);
return;
}
ord++;
}
SetTerm();
if (Debugging.AssertsEnabled) Debugging.Assert(term != null);
}
private BytesRef SetTerm()
{
term = termsEnum.Term;
//System.out.println(" setTerm() term=" + term.utf8ToString() + " vs prefix=" + (prefix == null ? "null" : prefix.utf8ToString()));
if (outerInstance.m_prefix != null && !StringHelper.StartsWith(term, outerInstance.m_prefix))
{
term = null;
}
return term;
}
}
/// <summary>
/// Returns the term (<see cref="BytesRef"/>) corresponding to
/// the provided ordinal.
/// </summary>
public virtual BytesRef LookupTerm(TermsEnum termsEnum, int ord)
{
termsEnum.SeekExact(ord);
return termsEnum.Term;
}
/// <summary>
/// Returns a <see cref="SortedSetDocValues"/> view of this instance </summary>
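/// <example>
/// A sketch of per-document consumption; <c>dto</c>, <c>reader</c> and
/// <c>docID</c> are placeholders:
/// <code>
/// SortedSetDocValues dv = dto.GetIterator(reader);
/// dv.SetDocument(docID);
/// var scratch = new BytesRef();
/// long ord;
/// while ((ord = dv.NextOrd()) != SortedSetDocValues.NO_MORE_ORDS)
/// {
///     dv.LookupOrd(ord, scratch); // scratch now holds the term bytes
/// }
/// </code>
/// </example>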
public virtual SortedSetDocValues GetIterator(AtomicReader reader)
{
if (IsEmpty)
{
return DocValues.EMPTY_SORTED_SET;
}
else
{
return new Iterator(this, reader);
}
}
private class Iterator : SortedSetDocValues
{
private readonly DocTermOrds outerInstance;
private readonly AtomicReader reader;
private readonly TermsEnum te; // used internally for lookupOrd() and lookupTerm()
// currently we read 5 at a time (using the logic of the old iterator)
private readonly int[] buffer = new int[5];
private int bufferUpto;
private int bufferLength;
private int tnum;
private int upto;
private sbyte[] arr;
internal Iterator(DocTermOrds outerInstance, AtomicReader reader)
{
this.outerInstance = outerInstance;
this.reader = reader;
this.te = GetTermsEnum();
}
public override long NextOrd()
{
while (bufferUpto == bufferLength)
{
if (bufferLength < buffer.Length)
{
return NO_MORE_ORDS;
}
else
{
bufferLength = Read(buffer);
bufferUpto = 0;
}
}
return buffer[bufferUpto++];
}
/// <summary>
/// Buffer must be at least 5 <see cref="int"/>s long. Returns number
/// of term ords placed into buffer; if this count is
/// less than buffer.Length then that is the end.
/// </summary>
internal virtual int Read(int[] buffer)
{
int bufferUpto = 0;
if (arr == null)
{
// code is inlined into upto
//System.out.println("inlined");
int code = upto;
int delta = 0;
for (; ; )
{
delta = (delta << 7) | (code & 0x7f);
if ((code & 0x80) == 0)
{
if (delta == 0)
{
break;
}
tnum += delta - TNUM_OFFSET;
buffer[bufferUpto++] = outerInstance.m_ordBase + tnum;
//System.out.println(" tnum=" + tnum);
delta = 0;
}
code = (int)((uint)code >> 8);
}
}
else
{
// code is a pointer
for (; ; )
{
int delta = 0;
for (; ; )
{
sbyte b = arr[upto++];
delta = (delta << 7) | (b & 0x7f);
//System.out.println(" cycle: upto=" + upto + " delta=" + delta + " b=" + b);
if ((b & 0x80) == 0)
{
break;
}
}
//System.out.println(" delta=" + delta);
if (delta == 0)
{
break;
}
tnum += delta - TNUM_OFFSET;
//System.out.println(" tnum=" + tnum);
buffer[bufferUpto++] = outerInstance.m_ordBase + tnum;
if (bufferUpto == buffer.Length)
{
break;
}
}
}
return bufferUpto;
}
public override void SetDocument(int docID)
{
tnum = 0;
int code = outerInstance.m_index[docID];
if ((code & 0xff) == 1)
{
// a pointer
upto = (int)((uint)code >> 8);
//System.out.println(" pointer! upto=" + upto);
int whichArray = ((int)((uint)docID >> 16)) & 0xff;
arr = outerInstance.m_tnums[whichArray];
}
else
{
//System.out.println(" inline!");
arr = null;
upto = code;
}
bufferUpto = 0;
bufferLength = Read(buffer);
}
public override void LookupOrd(long ord, BytesRef result)
{
BytesRef @ref = null;
try
{
@ref = outerInstance.LookupTerm(te, (int)ord);
}
catch (IOException e)
{
throw new Exception(e.ToString(), e);
}
result.Bytes = @ref.Bytes;
result.Offset = @ref.Offset;
result.Length = @ref.Length;
}
public override long ValueCount => outerInstance.NumTerms;
public override long LookupTerm(BytesRef key)
{
try
{
if (te.SeekCeil(key) == SeekStatus.FOUND)
{
return te.Ord;
}
else
{
return -te.Ord - 1;
}
}
catch (IOException e)
{
throw new Exception(e.ToString(), e);
}
}
public override TermsEnum GetTermsEnum()
{
try
{
return outerInstance.GetOrdTermsEnum(reader);
}
#pragma warning disable 168
catch (IOException e)
#pragma warning restore 168
{
throw new Exception(e.ToString(), e);
}
}
}
}
}