using Lucene.Net.Diagnostics;
using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
namespace Lucene.Net.Util
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using ByteArrayDataInput = Lucene.Net.Store.ByteArrayDataInput;
using DocIdSet = Lucene.Net.Search.DocIdSet;
using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator;
using MonotonicAppendingInt64Buffer = Lucene.Net.Util.Packed.MonotonicAppendingInt64Buffer;
using PackedInt32s = Lucene.Net.Util.Packed.PackedInt32s;
/// <summary>
/// <see cref="DocIdSet"/> implementation based on word-aligned hybrid encoding on
/// words of 8 bits.
/// <para>This implementation doesn't support random-access but has a fast
/// <see cref="DocIdSetIterator"/> which can advance in logarithmic time thanks to
/// an index.</para>
/// <para>The compression scheme is simplistic and should work well with sparse and
/// very dense doc id sets while being only slightly larger than a
/// <see cref="FixedBitSet"/> for incompressible sets (overhead&lt;2% in the worst
/// case) in spite of the index.</para>
/// <para><b>Format</b>: The format is byte-aligned. An 8-bits word is either clean,
/// meaning composed only of zeros or ones, or dirty, meaning that it contains
/// between 1 and 7 bits set. The idea is to encode sequences of clean words
/// using run-length encoding and to leave sequences of dirty words as-is.</para>
/// <list type="table">
/// <listheader><term>Token</term><term>Clean length+</term><term>Dirty length+</term><term>Dirty words</term></listheader>
/// <item><term>1 byte</term><term>0-n bytes</term><term>0-n bytes</term><term>0-n bytes</term></item>
/// </list>
/// <list type="bullet">
/// <item><term><b>Token</b></term><description> encodes whether clean means full of zeros or ones in the
/// first bit, the number of clean words minus 2 on the next 3 bits and the
/// number of dirty words on the last 4 bits. The higher-order bit is a
/// continuation bit, meaning that the number is incomplete and needs additional
/// bytes to be read.</description></item>
/// <item><term><b>Clean length+</b>:</term><description> If clean length has its higher-order bit set,
/// you need to read a vint (<see cref="Store.DataInput.ReadVInt32()"/>), shift it by 3 bits on
/// the left side and add it to the 3 bits which have been read in the token.</description></item>
/// <item><term><b>Dirty length+</b></term><description> works the same way as <b>Clean length+</b> but
/// on 4 bits and for the length of dirty words.</description></item>
/// <item><term><b>Dirty words</b></term><description>are the dirty words, there are <b>Dirty length</b>
/// of them.</description></item>
/// </list>
/// <para>This format cannot encode sequences of less than 2 clean words and 0 dirty
/// word. The reason is that if you find a single clean word, you should rather
/// encode it as a dirty word. This takes the same space as starting a new
/// sequence (since you need one byte for the token) but will be lighter to
/// decode. There is however an exception for the first sequence. Since the first
/// sequence may start directly with a dirty word, the clean length is encoded
/// directly, without subtracting 2.</para>
/// <para>There is an additional restriction on the format: the sequence of dirty
/// words is not allowed to contain two consecutive clean words. This restriction
/// exists to make sure no space is wasted and to make sure iterators can read
/// the next doc ID by reading at most 2 dirty words.</para>
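/// <para><b>Usage</b> (a minimal sketch; the doc IDs and variable names below are purely illustrative):</para>
/// <code>
/// var builder = new WAH8DocIdSet.Builder();
/// builder.Add(3).Add(10).Add(17);                  // doc IDs must be added in increasing order
/// WAH8DocIdSet set = builder.Build();
/// DocIdSetIterator it = set.GetIterator();
/// for (int doc = it.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.NextDoc())
/// {
///     // consume doc
/// }
/// </code>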
/// @lucene.experimental
/// </summary>
public sealed class WAH8DocIdSet : DocIdSet
{
// Minimum index interval, intervals below this value can't guarantee anymore
// that this set implementation won't be significantly larger than a FixedBitSet
// The reason is that a single sequence saves at least one byte and an index
// entry requires at most 8 bytes (2 ints) so there shouldn't be more than one
// index entry every 8 sequences
private const int MIN_INDEX_INTERVAL = 8;
/// <summary>
/// Default index interval. </summary>
public const int DEFAULT_INDEX_INTERVAL = 24;
private static readonly MonotonicAppendingInt64Buffer SINGLE_ZERO_BUFFER = LoadSingleZeroBuffer();
// LUCENENET specific - optimized empty array creation
private static readonly WAH8DocIdSet EMPTY = new WAH8DocIdSet(Arrays.Empty<byte>(), 0, 1, SINGLE_ZERO_BUFFER, SINGLE_ZERO_BUFFER);
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static MonotonicAppendingInt64Buffer LoadSingleZeroBuffer() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
{
var buffer = new MonotonicAppendingInt64Buffer(1, 64, PackedInt32s.COMPACT);
buffer.Add(0L);
buffer.Freeze();
return buffer;
}
private static readonly IComparer<Iterator> SERIALIZED_LENGTH_COMPARER = Comparer<Iterator>.Create((wi1, wi2) => wi1.@in.Length - wi2.@in.Length);
/// <summary>
/// Same as <see cref="Intersect(ICollection{WAH8DocIdSet}, int)"/> with the default index interval. </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WAH8DocIdSet Intersect(ICollection<WAH8DocIdSet> docIdSets)
{
return Intersect(docIdSets, DEFAULT_INDEX_INTERVAL);
}
/// <summary>
/// Compute the intersection of the provided sets. This method is much faster than
/// computing the intersection manually since it operates directly at the byte level.
/// </summary>
public static WAH8DocIdSet Intersect(ICollection<WAH8DocIdSet> docIdSets, int indexInterval)
{
switch (docIdSets.Count)
{
case 0:
throw new ArgumentException("There must be at least one set to intersect");
case 1:
return docIdSets.First();
}
// The logic below is similar to ConjunctionScorer
int numSets = docIdSets.Count;
var iterators = new Iterator[numSets];
int i = 0;
foreach (WAH8DocIdSet set in docIdSets)
{
var it = (Iterator)set.GetIterator();
iterators[i++] = it;
}
Array.Sort(iterators, SERIALIZED_LENGTH_COMPARER);
WordBuilder builder = (WordBuilder)(new WordBuilder()).SetIndexInterval(indexInterval);
int wordNum = 0;
while (true)
{
// Advance the least costly iterator first
iterators[0].AdvanceWord(wordNum);
wordNum = iterators[0].wordNum;
if (wordNum == DocIdSetIterator.NO_MORE_DOCS)
{
break;
}
byte word = iterators[0].word;
for (i = 1; i < numSets; ++i)
{
if (iterators[i].wordNum < wordNum)
{
iterators[i].AdvanceWord(wordNum);
}
if (iterators[i].wordNum > wordNum)
{
wordNum = iterators[i].wordNum;
goto mainContinue;
}
if (Debugging.AssertsEnabled) Debugging.Assert(iterators[i].wordNum == wordNum);
word &= iterators[i].word;
if (word == 0)
{
// There are common words, but they don't share any bit
++wordNum;
goto mainContinue;
}
}
// Found a common word
if (Debugging.AssertsEnabled) Debugging.Assert(word != 0);
builder.AddWord(wordNum, word);
++wordNum;
mainContinue:;
}
//mainBreak:
return builder.Build();
}
/// <summary>
/// Same as <see cref="Union(ICollection{WAH8DocIdSet}, int)"/> with the default index interval. </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static WAH8DocIdSet Union(ICollection<WAH8DocIdSet> docIdSets)
{
return Union(docIdSets, DEFAULT_INDEX_INTERVAL);
}
/// <summary>
/// Compute the union of the provided sets. This method is much faster than
/// computing the union manually since it operates directly at the byte level.
/// </summary>
public static WAH8DocIdSet Union(ICollection<WAH8DocIdSet> docIdSets, int indexInterval)
{
switch (docIdSets.Count)
{
case 0:
return EMPTY;
case 1:
return docIdSets.First();
}
// The logic below is very similar to DisjunctionScorer
int numSets = docIdSets.Count;
PriorityQueue<Iterator> iterators = new PriorityQueueAnonymousInnerClassHelper(numSets);
foreach (WAH8DocIdSet set in docIdSets)
{
Iterator iterator = (Iterator)set.GetIterator();
iterator.NextWord();
iterators.Add(iterator);
}
Iterator top = iterators.Top;
if (top.wordNum == int.MaxValue)
{
return EMPTY;
}
int wordNum = top.wordNum;
byte word = top.word;
WordBuilder builder = (WordBuilder)(new WordBuilder()).SetIndexInterval(indexInterval);
while (true)
{
top.NextWord();
iterators.UpdateTop();
top = iterators.Top;
if (top.wordNum == wordNum)
{
word |= top.word;
}
else
{
builder.AddWord(wordNum, word);
if (top.wordNum == int.MaxValue)
{
break;
}
wordNum = top.wordNum;
word = top.word;
}
}
return builder.Build();
}
private class PriorityQueueAnonymousInnerClassHelper : PriorityQueue<WAH8DocIdSet.Iterator>
{
public PriorityQueueAnonymousInnerClassHelper(int numSets)
: base(numSets)
{
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
protected internal override bool LessThan(Iterator a, Iterator b)
{
return a.wordNum < b.wordNum;
}
}
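// Each 8-bit word covers 8 consecutive doc IDs, so the word number of a doc ID is docID / 8
// (computed with an unsigned right shift by 3).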
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int WordNum(int docID)
{
if (Debugging.AssertsEnabled) Debugging.Assert(docID >= 0);
return (int)((uint)docID >> 3);
}
/// <summary>
/// Word-based builder. </summary>
public class WordBuilder
{
internal readonly GrowableByteArrayDataOutput @out;
internal readonly GrowableByteArrayDataOutput dirtyWords;
internal int clean;
internal int lastWordNum;
internal int numSequences;
internal int indexInterval;
internal int cardinality;
internal bool reverse;
internal WordBuilder()
{
@out = new GrowableByteArrayDataOutput(1024);
dirtyWords = new GrowableByteArrayDataOutput(128);
clean = 0;
lastWordNum = -1;
numSequences = 0;
indexInterval = DEFAULT_INDEX_INTERVAL;
cardinality = 0;
}
/// <summary>
/// Set the index interval. Smaller index intervals improve performance of
/// <see cref="DocIdSetIterator.Advance(int)"/> but make the <see cref="DocIdSet"/>
/// larger. An index interval <c>i</c> makes the index add an overhead
/// which is at most <c>4/i</c>, but likely much less. The default index
/// interval is <c>8</c>, meaning the index has an overhead of at most
/// 50%. To disable indexing, you can pass <see cref="int.MaxValue"/> as an
/// index interval.
/// </summary>
public virtual object SetIndexInterval(int indexInterval)
{
if (indexInterval < MIN_INDEX_INTERVAL)
{
throw new ArgumentException("indexInterval must be >= " + MIN_INDEX_INTERVAL);
}
this.indexInterval = indexInterval;
return this;
}
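// Token layout (see the class documentation): bit 7 = reverse flag (clean words are 0xFF
// instead of 0x00), bit 6 = continuation bit of the clean length, bits 5-4 = low 2 bits of
// (cleanLength - 2), bit 3 = continuation bit of the dirty length, bits 2-0 = low 3 bits of
// dirtyLength. For illustration, cleanLength = 5 and dirtyLength = 2 with reverse = false
// give cleanLengthMinus2 = 3, so token = (3 << 4) | 2 = 0x32 and no extra vint bytes are written.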
internal virtual void WriteHeader(bool reverse, int cleanLength, int dirtyLength)
{
int cleanLengthMinus2 = cleanLength - 2;
if (Debugging.AssertsEnabled)
{
Debugging.Assert(cleanLengthMinus2 >= 0);
Debugging.Assert(dirtyLength >= 0);
}
int token = ((cleanLengthMinus2 & 0x03) << 4) | (dirtyLength & 0x07);
if (reverse)
{
token |= 1 << 7;
}
if (cleanLengthMinus2 > 0x03)
{
token |= 1 << 6;
}
if (dirtyLength > 0x07)
{
token |= 1 << 3;
}
@out.WriteByte((byte)token);
if (cleanLengthMinus2 > 0x03)
{
@out.WriteVInt32((int)((uint)cleanLengthMinus2 >> 2));
}
if (dirtyLength > 0x07)
{
@out.WriteVInt32((int)((uint)dirtyLength >> 3));
}
}
private bool SequenceIsConsistent() // Called only from assert
{
for (int i = 1; i < dirtyWords.Length; ++i)
{
Debugging.Assert(dirtyWords.Bytes[i - 1] != 0 || dirtyWords.Bytes[i] != 0);
Debugging.Assert((byte)dirtyWords.Bytes[i - 1] != 0xFF || (byte)dirtyWords.Bytes[i] != 0xFF);
}
return true;
}
internal virtual void WriteSequence()
{
if (Debugging.AssertsEnabled) Debugging.Assert(SequenceIsConsistent());
try
{
WriteHeader(reverse, clean, dirtyWords.Length);
}
catch (IOException cannotHappen)
{
throw new InvalidOperationException(cannotHappen.ToString(), cannotHappen); // LUCENENET NOTE: This was AssertionError in Lucene
}
@out.WriteBytes(dirtyWords.Bytes, 0, dirtyWords.Length);
dirtyWords.Length = 0;
++numSequences;
}
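// The gap to the previously added word determines the encoding: a gap of 1 appends the new
// word to the current dirty words (or, when both it and the previous dirty byte are 0xFF,
// flushes the sequence and starts a run of clean 0xFF words); a gap of 2 stores the single
// clean word in between as a 0x00 dirty byte, which is cheaper than starting a new sequence
// (see the format description); a larger gap flushes the current sequence and starts a new
// one with (gap - 1) clean words.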
internal virtual void AddWord(int wordNum, byte word)
{
if (Debugging.AssertsEnabled)
{
Debugging.Assert(wordNum > lastWordNum);
Debugging.Assert(word != 0);
}
if (!reverse)
{
if (lastWordNum == -1)
{
clean = 2 + wordNum; // special case for the 1st sequence
dirtyWords.WriteByte(word);
}
else
{
switch (wordNum - lastWordNum)
{
case 1:
if (word == 0xFF && (byte)dirtyWords.Bytes[dirtyWords.Length - 1] == 0xFF)
{
--dirtyWords.Length;
WriteSequence();
reverse = true;
clean = 2;
}
else
{
dirtyWords.WriteByte(word);
}
break;
case 2:
dirtyWords.WriteByte(0);
dirtyWords.WriteByte(word);
break;
default:
WriteSequence();
clean = wordNum - lastWordNum - 1;
dirtyWords.WriteByte(word);
break;
}
}
}
else
{
if (Debugging.AssertsEnabled) Debugging.Assert(lastWordNum >= 0);
switch (wordNum - lastWordNum)
{
case 1:
if (word == 0xFF)
{
if (dirtyWords.Length == 0)
{
++clean;
}
else if ((byte)dirtyWords.Bytes[dirtyWords.Length - 1] == 0xFF)
{
--dirtyWords.Length;
WriteSequence();
clean = 2;
}
else
{
dirtyWords.WriteByte(word);
}
}
else
{
dirtyWords.WriteByte(word);
}
break;
case 2:
dirtyWords.WriteByte(0);
dirtyWords.WriteByte(word);
break;
default:
WriteSequence();
reverse = false;
clean = wordNum - lastWordNum - 1;
dirtyWords.WriteByte(word);
break;
}
}
lastWordNum = wordNum;
cardinality += BitUtil.BitCount(word);
}
/// <summary>
/// Build a new <see cref="WAH8DocIdSet"/>. </summary>
public virtual WAH8DocIdSet Build()
{
if (cardinality == 0)
{
if (Debugging.AssertsEnabled) Debugging.Assert(lastWordNum == -1);
return EMPTY;
}
WriteSequence();
byte[] data = Arrays.CopyOf(@out.Bytes, @out.Length);
// Now build the index
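// One index entry is recorded every indexInterval sequences: the byte position at which the
// sequence starts in 'data' and the word number it starts from, so that Advance(int) can jump
// close to a target instead of decoding every sequence.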
int valueCount = (numSequences - 1) / indexInterval + 1;
MonotonicAppendingInt64Buffer indexPositions, indexWordNums;
if (valueCount <= 1)
{
indexPositions = indexWordNums = SINGLE_ZERO_BUFFER;
}
else
{
const int pageSize = 128;
int initialPageCount = (valueCount + pageSize - 1) / pageSize;
MonotonicAppendingInt64Buffer positions = new MonotonicAppendingInt64Buffer(initialPageCount, pageSize, PackedInt32s.COMPACT);
MonotonicAppendingInt64Buffer wordNums = new MonotonicAppendingInt64Buffer(initialPageCount, pageSize, PackedInt32s.COMPACT);
positions.Add(0L);
wordNums.Add(0L);
Iterator it = new Iterator(data, cardinality, int.MaxValue, SINGLE_ZERO_BUFFER, SINGLE_ZERO_BUFFER);
if (Debugging.AssertsEnabled)
{
Debugging.Assert(it.@in.Position == 0);
Debugging.Assert(it.wordNum == -1);
}
for (int i = 1; i < valueCount; ++i)
{
// skip indexInterval sequences
for (int j = 0; j < indexInterval; ++j)
{
bool readSequence = it.ReadSequence();
if (Debugging.AssertsEnabled) Debugging.Assert(readSequence);
it.SkipDirtyBytes();
}
int position = it.@in.Position;
int wordNum = it.wordNum;
positions.Add(position);
wordNums.Add(wordNum + 1);
}
positions.Freeze();
wordNums.Freeze();
indexPositions = positions;
indexWordNums = wordNums;
}
return new WAH8DocIdSet(data, cardinality, indexInterval, indexPositions, indexWordNums);
}
}
/// <summary>
/// A builder for <see cref="WAH8DocIdSet"/>s. </summary>
public sealed class Builder : WordBuilder
{
private int lastDocID;
private int wordNum, word;
/// <summary>
/// Sole constructor </summary>
public Builder()
: base()
{
lastDocID = -1;
wordNum = -1;
word = 0;
}
/// <summary>
/// Add a document to this builder. Documents must be added in order. </summary>
public Builder Add(int docID)
{
if (docID <= lastDocID)
{
throw new ArgumentException("Doc ids must be added in-order, got " + docID + " which is <= lastDocID=" + lastDocID);
}
int wordNum = WordNum(docID);
if (this.wordNum == -1)
{
this.wordNum = wordNum;
word = 1 << (docID & 0x07);
}
else if (wordNum == this.wordNum)
{
word |= 1 << (docID & 0x07);
}
else
{
AddWord(this.wordNum, (byte)word);
this.wordNum = wordNum;
word = 1 << (docID & 0x07);
}
lastDocID = docID;
return this;
}
/// <summary>
/// Add the content of the provided <see cref="DocIdSetIterator"/>. </summary>
public Builder Add(DocIdSetIterator disi)
{
for (int doc = disi.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.NextDoc())
{
Add(doc);
}
return this;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override object SetIndexInterval(int indexInterval)
{
return (Builder)base.SetIndexInterval(indexInterval);
}
public override WAH8DocIdSet Build()
{
if (this.wordNum != -1)
{
AddWord(wordNum, (byte)word);
}
return base.Build();
}
}
// where the doc IDs are stored
private readonly byte[] data;
private readonly int cardinality;
private readonly int indexInterval;
// index for advance(int)
private readonly MonotonicAppendingInt64Buffer positions, wordNums; // wordNums[i] starts at the sequence at positions[i]
internal WAH8DocIdSet(byte[] data, int cardinality, int indexInterval, MonotonicAppendingInt64Buffer positions, MonotonicAppendingInt64Buffer wordNums)
{
this.data = data;
this.cardinality = cardinality;
this.indexInterval = indexInterval;
this.positions = positions;
this.wordNums = wordNums;
}
public override bool IsCacheable => true;
public override DocIdSetIterator GetIterator()
{
return new Iterator(data, cardinality, indexInterval, positions, wordNums);
}
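// Decodes the clean length from a token: the 2 low bits come from the token and, when the
// continuation bit (0x04) is set, a vint supplies the remaining high bits (shifted left by 2).
// The first sequence is special: its clean length is stored without the usual "minus 2"
// (see the format description), and it is detected here because its token is read at offset 0,
// so the stream position is 1 right after reading it.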
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int ReadCleanLength(ByteArrayDataInput @in, int token)
{
int len = ((int)((uint)token >> 4)) & 0x07;
int startPosition = @in.Position;
if ((len & 0x04) != 0)
{
len = (len & 0x03) | (@in.ReadVInt32() << 2);
}
if (startPosition != 1)
{
len += 2;
}
return len;
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static int ReadDirtyLength(ByteArrayDataInput @in, int token)
{
int len = token & 0x0F;
if ((len & 0x08) != 0)
{
len = (len & 0x07) | (@in.ReadVInt32() << 3);
}
return len;
}
internal class Iterator : DocIdSetIterator
{
/* Using the index can be costly for close targets. */
internal static int IndexThreshold(/*int cardinality, */int indexInterval) // LUCENENET specific - removed unused parameter
{
// Short sequences encode for 3 words (2 clean words and 1 dirty byte),
// don't advance if we are going to read less than 3 x indexInterval
// sequences
long indexThreshold = 3L * 3 * indexInterval;
return (int)Math.Min(int.MaxValue, indexThreshold);
}
internal readonly ByteArrayDataInput @in;
internal readonly int cardinality;
internal readonly int indexInterval;
internal readonly MonotonicAppendingInt64Buffer positions, wordNums;
internal readonly int indexThreshold;
internal int allOnesLength;
internal int dirtyLength;
internal int wordNum; // byte offset
internal byte word; // current word
internal int bitList; // list of bits set in the current word
internal int sequenceNum; // in which sequence are we?
internal int docID;
internal Iterator(byte[] data, int cardinality, int indexInterval, MonotonicAppendingInt64Buffer positions, MonotonicAppendingInt64Buffer wordNums)
{
this.@in = new ByteArrayDataInput(data);
this.cardinality = cardinality;
this.indexInterval = indexInterval;
this.positions = positions;
this.wordNums = wordNums;
wordNum = -1;
word = 0;
bitList = 0;
sequenceNum = -1;
docID = -1;
indexThreshold = IndexThreshold(/*cardinality,*/ indexInterval); // LUCENENET specific - removed unused parameter
}
internal virtual bool ReadSequence()
{
if (@in.Eof)
{
wordNum = int.MaxValue;
return false;
}
int token = @in.ReadByte() & 0xFF;
if ((token & (1 << 7)) == 0)
{
int cleanLength = ReadCleanLength(@in, token);
wordNum += cleanLength;
}
else
{
allOnesLength = ReadCleanLength(@in, token);
}
dirtyLength = ReadDirtyLength(@in, token);
if (Debugging.AssertsEnabled) Debugging.Assert(@in.Length - @in.Position >= dirtyLength, "{0} {1} {2}", @in.Position, @in.Length, dirtyLength);
++sequenceNum;
return true;
}
internal virtual void SkipDirtyBytes(int count)
{
if (Debugging.AssertsEnabled)
{
Debugging.Assert(count >= 0);
Debugging.Assert(count <= allOnesLength + dirtyLength);
}
wordNum += count;
if (count <= allOnesLength)
{
allOnesLength -= count;
}
else
{
count -= allOnesLength;
allOnesLength = 0;
@in.SkipBytes(count);
dirtyLength -= count;
}
}
internal virtual void SkipDirtyBytes()
{
wordNum += allOnesLength + dirtyLength;
@in.SkipBytes(dirtyLength);
allOnesLength = 0;
dirtyLength = 0;
}
internal virtual void NextWord()
{
if (allOnesLength > 0)
{
word = 0xFF;
++wordNum;
--allOnesLength;
return;
}
if (dirtyLength > 0)
{
word = @in.ReadByte();
++wordNum;
--dirtyLength;
if (word != 0)
{
return;
}
if (dirtyLength > 0)
{
word = @in.ReadByte();
++wordNum;
--dirtyLength;
if (Debugging.AssertsEnabled) Debugging.Assert(word != 0); // never more than one consecutive 0
return;
}
}
if (ReadSequence())
{
NextWord();
}
}
internal virtual int ForwardBinarySearch(int targetWordNum)
{
// advance forward and double the window at each step
int indexSize = (int)wordNums.Count;
int lo = sequenceNum / indexInterval, hi = lo + 1;
if (Debugging.AssertsEnabled)
{
Debugging.Assert(sequenceNum == -1 || wordNums.Get(lo) <= wordNum);
Debugging.Assert(lo + 1 == wordNums.Count || wordNums.Get(lo + 1) > wordNum);
}
while (true)
{
if (hi >= indexSize)
{
hi = indexSize - 1;
break;
}
else if (wordNums.Get(hi) >= targetWordNum)
{
break;
}
int newLo = hi;
hi += (hi - lo) << 1;
lo = newLo;
}
// we found a window containing our target, let's binary search now
while (lo <= hi)
{
int mid = (int)((uint)(lo + hi) >> 1);
int midWordNum = (int)wordNums.Get(mid);
if (midWordNum <= targetWordNum)
{
lo = mid + 1;
}
else
{
hi = mid - 1;
}
}
if (Debugging.AssertsEnabled)
{
Debugging.Assert(wordNums.Get(hi) <= targetWordNum);
Debugging.Assert(hi + 1 == wordNums.Count || wordNums.Get(hi + 1) > targetWordNum);
}
return hi;
}
internal virtual void AdvanceWord(int targetWordNum)
{
if (Debugging.AssertsEnabled) Debugging.Assert(targetWordNum > wordNum);
int delta = targetWordNum - wordNum;
if (delta <= allOnesLength + dirtyLength + 1)
{
SkipDirtyBytes(delta - 1);
}
else
{
SkipDirtyBytes();
if (Debugging.AssertsEnabled) Debugging.Assert(dirtyLength == 0);
if (delta > indexThreshold)
{
// use the index
int i = ForwardBinarySearch(targetWordNum);
int position = (int)positions.Get(i);
if (position > @in.Position) // if the binary search returned a backward offset, don't move
{
wordNum = (int)wordNums.Get(i) - 1;
@in.Position = position;
sequenceNum = i * indexInterval - 1;
}
}
while (true)
{
if (!ReadSequence())
{
return;
}
delta = targetWordNum - wordNum;
if (delta <= allOnesLength + dirtyLength + 1)
{
if (delta > 1)
{
SkipDirtyBytes(delta - 1);
}
break;
}
SkipDirtyBytes();
}
}
NextWord();
}
public override int DocID => docID;
public override int NextDoc()
{
if (bitList != 0) // there are remaining bits in the current word
{
docID = (wordNum << 3) | ((bitList & 0x0F) - 1);
bitList = (int)((uint)bitList >> 4);
return docID;
}
NextWord();
if (wordNum == int.MaxValue)
{
return docID = NO_MORE_DOCS;
}
bitList = BitUtil.BitList(word);
if (Debugging.AssertsEnabled) Debugging.Assert(bitList != 0);
docID = (wordNum << 3) | ((bitList & 0x0F) - 1);
bitList = (int)((uint)bitList >> 4);
return docID;
}
public override int Advance(int target)
{
if (Debugging.AssertsEnabled) Debugging.Assert(target > docID);
int targetWordNum = WordNum(target);
if (targetWordNum > this.wordNum)
{
AdvanceWord(targetWordNum);
bitList = BitUtil.BitList(word);
}
return SlowAdvance(target);
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public override long GetCost()
{
return cardinality;
}
}
/// <summary>
/// Return the number of documents in this <see cref="DocIdSet"/> in constant time. </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int Cardinality()
{
return cardinality;
}
/// <summary>
/// Return the memory usage of this class in bytes. </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public long RamBytesUsed()
{
return RamUsageEstimator.AlignObjectSize(3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + 2 * RamUsageEstimator.NUM_BYTES_INT32)
+ RamUsageEstimator.SizeOf(data)
+ positions.RamBytesUsed()
+ wordNums.RamBytesUsed();
}
}
}