| using Lucene.Net.Diagnostics; |
| using Lucene.Net.Support; |
| using System; |
| using System.Collections.Generic; |
| using System.IO; |
| using System.Linq; |
| using System.Runtime.CompilerServices; |
| |
| namespace Lucene.Net.Util |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using ByteArrayDataInput = Lucene.Net.Store.ByteArrayDataInput; |
| using DocIdSet = Lucene.Net.Search.DocIdSet; |
| using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator; |
| using MonotonicAppendingInt64Buffer = Lucene.Net.Util.Packed.MonotonicAppendingInt64Buffer; |
| using PackedInt32s = Lucene.Net.Util.Packed.PackedInt32s; |
| |
| /// <summary> |
| /// <see cref="DocIdSet"/> implementation based on word-aligned hybrid encoding on |
| /// words of 8 bits. |
| /// <para>This implementation doesn't support random-access but has a fast |
| /// <see cref="DocIdSetIterator"/> which can advance in logarithmic time thanks to |
| /// an index.</para> |
| /// <para>The compression scheme is simplistic and should work well with sparse and |
| /// very dense doc id sets while being only slightly larger than a |
| /// <see cref="FixedBitSet"/> for incompressible sets (overhead<2% in the worst |
| /// case) in spite of the index.</para> |
| /// <para><b>Format</b>: The format is byte-aligned. An 8-bits word is either clean, |
| /// meaning composed only of zeros or ones, or dirty, meaning that it contains |
| /// between 1 and 7 bits set. The idea is to encode sequences of clean words |
| /// using run-length encoding and to leave sequences of dirty words as-is.</para> |
| /// <list type="table"> |
| /// <listheader><term>Token</term><term>Clean length+</term><term>Dirty length+</term><term>Dirty words</term></listheader> |
| /// <item><term>1 byte</term><term>0-n bytes</term><term>0-n bytes</term><term>0-n bytes</term></item> |
| /// </list> |
| /// <list type="bullet"> |
| /// <item><term><b>Token</b></term><description> encodes whether clean means full of zeros or ones in the |
| /// first bit, the number of clean words minus 2 on the next 3 bits and the |
| /// number of dirty words on the last 4 bits. The higher-order bit is a |
| /// continuation bit, meaning that the number is incomplete and needs additional |
| /// bytes to be read.</description></item> |
| /// <item><term><b>Clean length+</b>:</term><description> If clean length has its higher-order bit set, |
| /// you need to read a vint (<see cref="Store.DataInput.ReadVInt32()"/>), shift it by 3 bits on |
| /// the left side and add it to the 3 bits which have been read in the token.</description></item> |
| /// <item><term><b>Dirty length+</b></term><description> works the same way as <b>Clean length+</b> but |
| /// on 4 bits and for the length of dirty words.</description></item> |
| /// <item><term><b>Dirty words</b></term><description>are the dirty words, there are <b>Dirty length</b> |
| /// of them.</description></item> |
| /// </list> |
| /// <para>This format cannot encode sequences of less than 2 clean words and 0 dirty |
| /// word. The reason is that if you find a single clean word, you should rather |
| /// encode it as a dirty word. This takes the same space as starting a new |
| /// sequence (since you need one byte for the token) but will be lighter to |
| /// decode. There is however an exception for the first sequence. Since the first |
| /// sequence may start directly with a dirty word, the clean length is encoded |
| /// directly, without subtracting 2.</para> |
| /// <para>There is an additional restriction on the format: the sequence of dirty |
| /// words is not allowed to contain two consecutive clean words. This restriction |
| /// exists to make sure no space is wasted and to make sure iterators can read |
| /// the next doc ID by reading at most 2 dirty words.</para> |
| /// @lucene.experimental |
| /// </summary> |
| public sealed class WAH8DocIdSet : DocIdSet |
| { |
| // Minimum index interval, intervals below this value can't guarantee anymore |
| // that this set implementation won't be significantly larger than a FixedBitSet |
| // The reason is that a single sequence saves at least one byte and an index |
| // entry requires at most 8 bytes (2 ints) so there shouldn't be more than one |
| // index entry every 8 sequences |
| private const int MIN_INDEX_INTERVAL = 8; |
| |
| /// <summary> |
| /// Default index interval. </summary> |
| public const int DEFAULT_INDEX_INTERVAL = 24; |
| |
        // Shared single-entry index buffer used by sets that need no real index.
        // NOTE: declaration order matters - EMPTY's initializer below reads this field.
        private static readonly MonotonicAppendingInt64Buffer SINGLE_ZERO_BUFFER = LoadSingleZeroBuffer();
        // LUCENENET specific - optimized empty array creation
        // Shared immutable empty set returned by Union()/Build() when there are no documents.
        private static readonly WAH8DocIdSet EMPTY = new WAH8DocIdSet(Arrays.Empty<byte>(), 0, 1, SINGLE_ZERO_BUFFER, SINGLE_ZERO_BUFFER);
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static MonotonicAppendingInt64Buffer LoadSingleZeroBuffer() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
        {
            // Build a frozen buffer containing the single value 0.
            var buffer = new MonotonicAppendingInt64Buffer(1, 64, PackedInt32s.COMPACT);
            buffer.Add(0L);
            buffer.Freeze();
            return buffer;
        }

        // Orders iterators by the length of their underlying encoded data; used by
        // Intersect() so that the smallest (likely sparsest) set leads the advance.
        private static readonly IComparer<Iterator> SERIALIZED_LENGTH_COMPARER = Comparer<Iterator>.Create((wi1, wi2) => wi1.@in.Length - wi2.@in.Length);
| |
| /// <summary> |
| /// Same as <see cref="Intersect(ICollection{WAH8DocIdSet}, int)"/> with the default index interval. </summary> |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| public static WAH8DocIdSet Intersect(ICollection<WAH8DocIdSet> docIdSets) |
| { |
| return Intersect(docIdSets, DEFAULT_INDEX_INTERVAL); |
| } |
| |
        /// <summary>
        /// Compute the intersection of the provided sets. This method is much faster than
        /// computing the intersection manually since it operates directly at the byte level.
        /// </summary>
        /// <param name="docIdSets">The sets to intersect; must contain at least one set.</param>
        /// <param name="indexInterval">Index interval of the resulting set, see <see cref="WordBuilder.SetIndexInterval(int)"/>.</param>
        /// <exception cref="ArgumentException">If <paramref name="docIdSets"/> is empty.</exception>
        public static WAH8DocIdSet Intersect(ICollection<WAH8DocIdSet> docIdSets, int indexInterval)
        {
            switch (docIdSets.Count)
            {
                case 0:
                    throw new ArgumentException("There must be at least one set to intersect");
                case 1:
                    // Intersection of a single set is the set itself.
                    return docIdSets.First();
            }
            // The logic below is similar to ConjunctionScorer
            int numSets = docIdSets.Count;
            var iterators = new Iterator[numSets];
            int i = 0;
            foreach (WAH8DocIdSet set in docIdSets)
            {
                var it = (Iterator)set.GetIterator();
                iterators[i++] = it;
            }
            // Sort by serialized length so the smallest (likely sparsest) set leads.
            Array.Sort(iterators, SERIALIZED_LENGTH_COMPARER);
            WordBuilder builder = (WordBuilder)(new WordBuilder()).SetIndexInterval(indexInterval);
            int wordNum = 0;
            while (true)
            {
                // Advance the least costly iterator first
                iterators[0].AdvanceWord(wordNum);
                wordNum = iterators[0].wordNum;
                if (wordNum == DocIdSetIterator.NO_MORE_DOCS)
                {
                    break;
                }
                byte word = iterators[0].word;
                for (i = 1; i < numSets; ++i)
                {
                    // Bring every other iterator up to (at least) wordNum.
                    if (iterators[i].wordNum < wordNum)
                    {
                        iterators[i].AdvanceWord(wordNum);
                    }
                    if (iterators[i].wordNum > wordNum)
                    {
                        // This iterator skipped past wordNum: restart the scan from its position.
                        wordNum = iterators[i].wordNum;
                        goto mainContinue;
                    }
                    if (Debugging.AssertsEnabled) Debugging.Assert(iterators[i].wordNum == wordNum);
                    word &= iterators[i].word;
                    if (word == 0)
                    {
                        // There are common words, but they don't share any bit
                        ++wordNum;
                        goto mainContinue;
                    }
                }
                // Found a common word
                if (Debugging.AssertsEnabled) Debugging.Assert(word != 0);
                builder.AddWord(wordNum, word);
                ++wordNum;
            mainContinue:;
            }
            //mainBreak:
            return builder.Build();
        }
| |
| /// <summary> |
| /// Same as <see cref="Union(ICollection{WAH8DocIdSet}, int)"/> with the default index interval. </summary> |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| public static WAH8DocIdSet Union(ICollection<WAH8DocIdSet> docIdSets) |
| { |
| return Union(docIdSets, DEFAULT_INDEX_INTERVAL); |
| } |
| |
        /// <summary>
        /// Compute the union of the provided sets. This method is much faster than
        /// computing the union manually since it operates directly at the byte level.
        /// </summary>
        /// <param name="docIdSets">The sets to union; an empty collection yields the empty set.</param>
        /// <param name="indexInterval">Index interval of the resulting set, see <see cref="WordBuilder.SetIndexInterval(int)"/>.</param>
        public static WAH8DocIdSet Union(ICollection<WAH8DocIdSet> docIdSets, int indexInterval)
        {
            switch (docIdSets.Count)
            {
                case 0:
                    return EMPTY;

                case 1:
                    // Union of a single set is the set itself.
                    return docIdSets.First();
            }
            // The logic below is very similar to DisjunctionScorer
            int numSets = docIdSets.Count;
            // Min-heap ordered by each iterator's current word number.
            PriorityQueue<Iterator> iterators = new PriorityQueueAnonymousInnerClassHelper(numSets);
            foreach (WAH8DocIdSet set in docIdSets)
            {
                Iterator iterator = (Iterator)set.GetIterator();
                iterator.NextWord();
                iterators.Add(iterator);
            }

            Iterator top = iterators.Top;
            if (top.wordNum == int.MaxValue)
            {
                // Even the least advanced iterator is exhausted: all inputs are empty.
                return EMPTY;
            }
            int wordNum = top.wordNum;
            byte word = top.word;
            WordBuilder builder = (WordBuilder)(new WordBuilder()).SetIndexInterval(indexInterval);
            while (true)
            {
                // Advance the iterator with the smallest word number and restore heap order.
                top.NextWord();
                iterators.UpdateTop();
                top = iterators.Top;
                if (top.wordNum == wordNum)
                {
                    // Same word as the one being accumulated: merge the bits.
                    word |= top.word;
                }
                else
                {
                    // A new word starts: flush the accumulated one.
                    builder.AddWord(wordNum, word);
                    if (top.wordNum == int.MaxValue)
                    {
                        break;
                    }
                    wordNum = top.wordNum;
                    word = top.word;
                }
            }
            return builder.Build();
        }
| |
| private class PriorityQueueAnonymousInnerClassHelper : PriorityQueue<WAH8DocIdSet.Iterator> |
| { |
| public PriorityQueueAnonymousInnerClassHelper(int numSets) |
| : base(numSets) |
| { |
| } |
| |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| protected internal override bool LessThan(Iterator a, Iterator b) |
| { |
| return a.wordNum < b.wordNum; |
| } |
| } |
| |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| internal static int WordNum(int docID) |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(docID >= 0); |
| return (int)((uint)docID >> 3); |
| } |
| |
        /// <summary>
        /// Word-based builder: content is added one non-zero 8-bit word at a time via
        /// <see cref="AddWord(int, byte)"/>. See <see cref="Builder"/> for a doc-ID-based API. </summary>
        public class WordBuilder
        {
            internal readonly GrowableByteArrayDataOutput @out; // receives the encoded sequences
            internal readonly GrowableByteArrayDataOutput dirtyWords; // dirty words of the sequence currently being built
            internal int clean; // number of clean words at the start of the current sequence
            internal int lastWordNum; // word number of the last word added, -1 if none yet
            internal int numSequences; // number of sequences flushed to @out so far
            internal int indexInterval; // one index entry is recorded every indexInterval sequences
            internal int cardinality; // total number of bits set so far
            internal bool reverse; // true when the current clean run is all ones rather than all zeros

            internal WordBuilder()
            {
                @out = new GrowableByteArrayDataOutput(1024);
                dirtyWords = new GrowableByteArrayDataOutput(128);
                clean = 0;
                lastWordNum = -1;
                numSequences = 0;
                indexInterval = DEFAULT_INDEX_INTERVAL;
                cardinality = 0;
            }

            /// <summary>
            /// Set the index interval. Smaller index intervals improve performance of
            /// <see cref="DocIdSetIterator.Advance(int)"/> but make the <see cref="DocIdSet"/>
            /// larger. An index interval <c>i</c> makes the index add an overhead
            /// which is at most <c>4/i</c>, but likely much less. The default index
            /// interval is <c>24</c> (<see cref="DEFAULT_INDEX_INTERVAL"/>), meaning the
            /// index has an overhead of at most ~17%. To disable indexing, you can pass
            /// <see cref="int.MaxValue"/> as an index interval.
            /// </summary>
            /// <exception cref="ArgumentException">If the interval is below the minimum of 8.</exception>
            public virtual object SetIndexInterval(int indexInterval)
            {
                if (indexInterval < MIN_INDEX_INTERVAL)
                {
                    throw new ArgumentException("indexInterval must be >= " + MIN_INDEX_INTERVAL);
                }
                this.indexInterval = indexInterval;
                return this;
            }

            /// <summary>
            /// Write the 1-byte sequence token (reverse flag, clean length, dirty length)
            /// followed by the optional vint extensions of the two lengths. See the format
            /// description on <see cref="WAH8DocIdSet"/>.
            /// </summary>
            internal virtual void WriteHeader(bool reverse, int cleanLength, int dirtyLength)
            {
                // Clean lengths are stored minus 2 (except for the very first sequence,
                // which AddWord compensates for by pre-adding 2).
                int cleanLengthMinus2 = cleanLength - 2;
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(cleanLengthMinus2 >= 0);
                    Debugging.Assert(dirtyLength >= 0);
                }
                int token = ((cleanLengthMinus2 & 0x03) << 4) | (dirtyLength & 0x07);
                if (reverse)
                {
                    token |= 1 << 7; // clean words are all ones instead of all zeros
                }
                if (cleanLengthMinus2 > 0x03)
                {
                    token |= 1 << 6; // clean-length continuation bit
                }
                if (dirtyLength > 0x07)
                {
                    token |= 1 << 3; // dirty-length continuation bit
                }
                @out.WriteByte((byte)(sbyte)token);
                if (cleanLengthMinus2 > 0x03)
                {
                    @out.WriteVInt32((int)((uint)cleanLengthMinus2 >> 2)); // high-order bits of the clean length
                }
                if (dirtyLength > 0x07)
                {
                    @out.WriteVInt32((int)((uint)dirtyLength >> 3)); // high-order bits of the dirty length
                }
            }

            private bool SequenceIsConsistent() // Called only from assert
            {
                for (int i = 1; i < dirtyWords.Length; ++i)
                {
                    // the format forbids two consecutive all-zeros dirty words...
                    Debugging.Assert(dirtyWords.Bytes[i - 1] != 0 || dirtyWords.Bytes[i] != 0);
                    // ...as well as two consecutive all-ones dirty words
                    Debugging.Assert((byte)dirtyWords.Bytes[i - 1] != 0xFF || (byte)dirtyWords.Bytes[i] != 0xFF);
                }
                return true;
            }

            /// <summary>
            /// Flush the current sequence (header followed by its dirty words) to the
            /// output and reset the dirty-word buffer. </summary>
            internal virtual void WriteSequence()
            {
                if (Debugging.AssertsEnabled) Debugging.Assert(SequenceIsConsistent());
                try
                {
                    WriteHeader(reverse, clean, dirtyWords.Length);
                }
                catch (IOException cannotHappen) // in-memory output does not actually throw
                {
                    throw new InvalidOperationException(cannotHappen.ToString(), cannotHappen); // LUCENENET NOTE: This was AssertionError in Lucene
                }
                @out.WriteBytes(dirtyWords.Bytes, 0, dirtyWords.Length);
                dirtyWords.Length = 0;
                ++numSequences;
            }

            /// <summary>
            /// Add a non-zero 8-bit word at word number <paramref name="wordNum"/>.
            /// Words must be added in strictly increasing word-number order.
            /// </summary>
            internal virtual void AddWord(int wordNum, byte word)
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(wordNum > lastWordNum);
                    Debugging.Assert(word != 0);
                }

                if (!reverse)
                {
                    if (lastWordNum == -1)
                    {
                        clean = 2 + wordNum; // special case for the 1st sequence
                        dirtyWords.WriteByte(word);
                    }
                    else
                    {
                        switch (wordNum - lastWordNum)
                        {
                            case 1:
                                // Adjacent word: two consecutive 0xFF dirty words switch
                                // the encoder to an all-ones (reverse) clean run.
                                if (word == 0xFF && (byte)dirtyWords.Bytes[dirtyWords.Length - 1] == 0xFF)
                                {
                                    --dirtyWords.Length;
                                    WriteSequence();
                                    reverse = true;
                                    clean = 2;
                                }
                                else
                                {
                                    dirtyWords.WriteByte(word);
                                }
                                break;

                            case 2:
                                // A single clean word is cheaper encoded as a zero dirty word.
                                dirtyWords.WriteByte(0);
                                dirtyWords.WriteByte(word);
                                break;

                            default:
                                // Gap of 2+ clean words: flush and start a new sequence.
                                WriteSequence();
                                clean = wordNum - lastWordNum - 1;
                                dirtyWords.WriteByte(word);
                                break;
                        }
                    }
                }
                else
                {
                    if (Debugging.AssertsEnabled) Debugging.Assert(lastWordNum >= 0);
                    switch (wordNum - lastWordNum)
                    {
                        case 1:
                            if (word == 0xFF)
                            {
                                if (dirtyWords.Length == 0)
                                {
                                    // Still extending the all-ones clean run.
                                    ++clean;
                                }
                                else if ((byte)dirtyWords.Bytes[dirtyWords.Length - 1] == 0xFF)
                                {
                                    // Two consecutive 0xFF dirty words: start a new all-ones run.
                                    --dirtyWords.Length;
                                    WriteSequence();
                                    clean = 2;
                                }
                                else
                                {
                                    dirtyWords.WriteByte(word);
                                }
                            }
                            else
                            {
                                dirtyWords.WriteByte(word);
                            }
                            break;

                        case 2:
                            // A single clean word is cheaper encoded as a zero dirty word.
                            dirtyWords.WriteByte(0);
                            dirtyWords.WriteByte(word);
                            break;

                        default:
                            // Gap of 2+ all-zeros words: flush and start a zeros clean run.
                            WriteSequence();
                            reverse = false;
                            clean = wordNum - lastWordNum - 1;
                            dirtyWords.WriteByte(word);
                            break;
                    }
                }
                lastWordNum = wordNum;
                cardinality += BitUtil.BitCount(word);
            }

            /// <summary>
            /// Build a new <see cref="WAH8DocIdSet"/>. </summary>
            public virtual WAH8DocIdSet Build()
            {
                if (cardinality == 0)
                {
                    if (Debugging.AssertsEnabled) Debugging.Assert(lastWordNum == -1);
                    return EMPTY;
                }
                WriteSequence(); // flush the last pending sequence
                byte[] data = Arrays.CopyOf(@out.Bytes, @out.Length);

                // Now build the index
                int valueCount = (numSequences - 1) / indexInterval + 1;
                MonotonicAppendingInt64Buffer indexPositions, indexWordNums;
                if (valueCount <= 1)
                {
                    // The index would hold a single entry: share the static zero buffer.
                    indexPositions = indexWordNums = SINGLE_ZERO_BUFFER;
                }
                else
                {
                    const int pageSize = 128;
                    int initialPageCount = (valueCount + pageSize - 1) / pageSize;
                    MonotonicAppendingInt64Buffer positions = new MonotonicAppendingInt64Buffer(initialPageCount, pageSize, PackedInt32s.COMPACT);
                    MonotonicAppendingInt64Buffer wordNums = new MonotonicAppendingInt64Buffer(initialPageCount, pageSize, PackedInt32s.COMPACT);

                    positions.Add(0L);
                    wordNums.Add(0L);
                    // Walk the freshly-encoded data, recording the stream position and word
                    // number at every indexInterval-th sequence boundary.
                    Iterator it = new Iterator(data, cardinality, int.MaxValue, SINGLE_ZERO_BUFFER, SINGLE_ZERO_BUFFER);
                    if (Debugging.AssertsEnabled)
                    {
                        Debugging.Assert(it.@in.Position == 0);
                        Debugging.Assert(it.wordNum == -1);
                    }
                    for (int i = 1; i < valueCount; ++i)
                    {
                        // skip indexInterval sequences
                        for (int j = 0; j < indexInterval; ++j)
                        {
                            bool readSequence = it.ReadSequence();
                            if (Debugging.AssertsEnabled) Debugging.Assert(readSequence);
                            it.SkipDirtyBytes();
                        }
                        int position = it.@in.Position;
                        int wordNum = it.wordNum;
                        positions.Add(position);
                        wordNums.Add(wordNum + 1);
                    }
                    positions.Freeze();
                    wordNums.Freeze();
                    indexPositions = positions;
                    indexWordNums = wordNums;
                }

                return new WAH8DocIdSet(data, cardinality, indexInterval, indexPositions, indexWordNums);
            }
        }
| |
| /// <summary> |
| /// A builder for <see cref="WAH8DocIdSet"/>s. </summary> |
| public sealed class Builder : WordBuilder |
| { |
| private int lastDocID; |
| private int wordNum, word; |
| |
| /// <summary> |
| /// Sole constructor </summary> |
| public Builder() |
| : base() |
| { |
| lastDocID = -1; |
| wordNum = -1; |
| word = 0; |
| } |
| |
| /// <summary> |
| /// Add a document to this builder. Documents must be added in order. </summary> |
| public Builder Add(int docID) |
| { |
| if (docID <= lastDocID) |
| { |
| throw new ArgumentException("Doc ids must be added in-order, got " + docID + " which is <= lastDocID=" + lastDocID); |
| } |
| int wordNum = WordNum(docID); |
| if (this.wordNum == -1) |
| { |
| this.wordNum = wordNum; |
| word = 1 << (docID & 0x07); |
| } |
| else if (wordNum == this.wordNum) |
| { |
| word |= 1 << (docID & 0x07); |
| } |
| else |
| { |
| AddWord(this.wordNum, (byte)word); |
| this.wordNum = wordNum; |
| word = 1 << (docID & 0x07); |
| } |
| lastDocID = docID; |
| return this; |
| } |
| |
| /// <summary> |
| /// Add the content of the provided <see cref="DocIdSetIterator"/>. </summary> |
| public Builder Add(DocIdSetIterator disi) |
| { |
| for (int doc = disi.NextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi.NextDoc()) |
| { |
| Add(doc); |
| } |
| return this; |
| } |
| |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| public override object SetIndexInterval(int indexInterval) |
| { |
| return (Builder)base.SetIndexInterval(indexInterval); |
| } |
| |
| public override WAH8DocIdSet Build() |
| { |
| if (this.wordNum != -1) |
| { |
| AddWord(wordNum, (byte)word); |
| } |
| return base.Build(); |
| } |
| } |
| |
        // where the doc IDs are stored
        private readonly byte[] data;

        private readonly int cardinality; // number of documents in the set
        private readonly int indexInterval; // number of sequences between two index entries

        // index for advance(int)
        private readonly MonotonicAppendingInt64Buffer positions, wordNums; // wordNums[i] starts at the sequence at positions[i]

        /// <summary>
        /// Sole constructor: wraps already-encoded data and its pre-built index.
        /// Instances are created through <see cref="Builder"/>/<see cref="WordBuilder"/>. </summary>
        internal WAH8DocIdSet(byte[] data, int cardinality, int indexInterval, MonotonicAppendingInt64Buffer positions, MonotonicAppendingInt64Buffer wordNums)
        {
            this.data = data;
            this.cardinality = cardinality;
            this.indexInterval = indexInterval;
            this.positions = positions;
            this.wordNums = wordNums;
        }
| |
        /// <summary>Always <c>true</c>: this set is immutable once built.</summary>
        public override bool IsCacheable => true;
| |
| public override DocIdSetIterator GetIterator() |
| { |
| return new Iterator(data, cardinality, indexInterval, positions, wordNums); |
| } |
| |
        /// <summary>
        /// Decode the clean length from a sequence token (bits 4-6) plus its optional
        /// vint extension. A <c>startPosition</c> of 1 means the token just read was the
        /// very first byte of the stream, i.e. this is the first sequence: per the format
        /// its clean length is encoded directly, so 2 is only added back for the others.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static int ReadCleanLength(ByteArrayDataInput @in, int token)
        {
            int len = ((int)((uint)token >> 4)) & 0x07;
            int startPosition = @in.Position;
            if ((len & 0x04) != 0)
            {
                // Continuation bit set: combine the low 2 bits with the following vint.
                len = (len & 0x03) | (@in.ReadVInt32() << 2);
            }
            if (startPosition != 1)
            {
                len += 2;
            }
            return len;
        }
| |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| internal static int ReadDirtyLength(ByteArrayDataInput @in, int token) |
| { |
| int len = token & 0x0F; |
| if ((len & 0x08) != 0) |
| { |
| len = (len & 0x07) | (@in.ReadVInt32() << 3); |
| } |
| return len; |
| } |
| |
        /// <summary>
        /// Decoding iterator over the word-aligned hybrid encoded data, with an
        /// index-backed <see cref="Advance(int)"/>. </summary>
        internal class Iterator : DocIdSetIterator
        {
            /* Using the index can be costly for close targets. */

            internal static int IndexThreshold(/*int cardinality, */int indexInterval) // LUCENENET specific - removed unused parameter
            {
                // Short sequences encode for 3 words (2 clean words and 1 dirty byte),
                // don't advance if we are going to read less than 3 x indexInterval
                // sequences
                long indexThreshold = 3L * 3 * indexInterval;
                return (int)Math.Min(int.MaxValue, indexThreshold);
            }

            internal readonly ByteArrayDataInput @in; // the encoded data
            internal readonly int cardinality; // number of documents in the set
            internal readonly int indexInterval;
            internal readonly MonotonicAppendingInt64Buffer positions, wordNums; // the advance index
            internal readonly int indexThreshold; // minimum word delta for which using the index pays off
            internal int allOnesLength; // remaining all-ones clean words in the current sequence
            internal int dirtyLength; // remaining dirty words in the current sequence

            internal int wordNum; // byte offset
            internal byte word; // current word
            internal int bitList; // list of bits set in the current word
            internal int sequenceNum; // in which sequence are we?

            internal int docID; // current doc ID, -1 before iteration starts

            internal Iterator(byte[] data, int cardinality, int indexInterval, MonotonicAppendingInt64Buffer positions, MonotonicAppendingInt64Buffer wordNums)
            {
                this.@in = new ByteArrayDataInput(data);
                this.cardinality = cardinality;
                this.indexInterval = indexInterval;
                this.positions = positions;
                this.wordNums = wordNums;
                wordNum = -1;
                word = 0;
                bitList = 0;
                sequenceNum = -1;
                docID = -1;
                indexThreshold = IndexThreshold(/*cardinality,*/ indexInterval); // LUCENENET specific - removed unused parameter
            }

            /// <summary>
            /// Read the header of the next sequence, updating <see cref="wordNum"/>,
            /// <see cref="allOnesLength"/> and <see cref="dirtyLength"/>. Returns
            /// <c>false</c> (and sets <see cref="wordNum"/> to <see cref="int.MaxValue"/>)
            /// when the end of the data is reached. </summary>
            internal virtual bool ReadSequence()
            {
                if (@in.Eof)
                {
                    wordNum = int.MaxValue;
                    return false;
                }
                int token = @in.ReadByte() & 0xFF;
                if ((token & (1 << 7)) == 0)
                {
                    // Clean words are all zeros: simply jump over them.
                    int cleanLength = ReadCleanLength(@in, token);
                    wordNum += cleanLength;
                }
                else
                {
                    // Clean words are all ones: they contain matches and must be iterated.
                    allOnesLength = ReadCleanLength(@in, token);
                }
                dirtyLength = ReadDirtyLength(@in, token);
                if (Debugging.AssertsEnabled) Debugging.Assert(@in.Length - @in.Position >= dirtyLength, "{0} {1} {2}", @in.Position, @in.Length, dirtyLength);
                ++sequenceNum;
                return true;
            }

            /// <summary>
            /// Skip <paramref name="count"/> words inside the current sequence,
            /// consuming all-ones words before dirty words. </summary>
            internal virtual void SkipDirtyBytes(int count)
            {
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(count >= 0);
                    Debugging.Assert(count <= allOnesLength + dirtyLength);
                }
                wordNum += count;
                if (count <= allOnesLength)
                {
                    allOnesLength -= count;
                }
                else
                {
                    count -= allOnesLength;
                    allOnesLength = 0;
                    @in.SkipBytes(count);
                    dirtyLength -= count;
                }
            }

            /// <summary>
            /// Skip all remaining words (all-ones and dirty) of the current sequence. </summary>
            internal virtual void SkipDirtyBytes()
            {
                wordNum += allOnesLength + dirtyLength;
                @in.SkipBytes(dirtyLength);
                allOnesLength = 0;
                dirtyLength = 0;
            }

            /// <summary>
            /// Advance to the next non-zero word, reading new sequences as needed.
            /// Leaves <see cref="wordNum"/> at <see cref="int.MaxValue"/> when exhausted. </summary>
            internal virtual void NextWord()
            {
                if (allOnesLength > 0)
                {
                    word = 0xFF;
                    ++wordNum;
                    --allOnesLength;
                    return;
                }
                if (dirtyLength > 0)
                {
                    word = @in.ReadByte();
                    ++wordNum;
                    --dirtyLength;
                    if (word != 0)
                    {
                        return;
                    }
                    // A zero dirty word stands for a single clean word; the format
                    // guarantees the following dirty word (if any) is non-zero.
                    if (dirtyLength > 0)
                    {
                        word = @in.ReadByte();
                        ++wordNum;
                        --dirtyLength;
                        if (Debugging.AssertsEnabled) Debugging.Assert(word != 0); // never more than one consecutive 0
                        return;
                    }
                }
                if (ReadSequence())
                {
                    NextWord();
                }
            }

            /// <summary>
            /// Find the greatest index entry whose word number is &lt;=
            /// <paramref name="targetWordNum"/>: gallop forward to bracket the target,
            /// then binary-search inside the window. </summary>
            internal virtual int ForwardBinarySearch(int targetWordNum)
            {
                // advance forward and double the window at each step
                int indexSize = (int)wordNums.Count;
                int lo = sequenceNum / indexInterval, hi = lo + 1;
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(sequenceNum == -1 || wordNums.Get(lo) <= wordNum);
                    Debugging.Assert(lo + 1 == wordNums.Count || wordNums.Get(lo + 1) > wordNum);
                }
                while (true)
                {
                    if (hi >= indexSize)
                    {
                        hi = indexSize - 1;
                        break;
                    }
                    else if (wordNums.Get(hi) >= targetWordNum)
                    {
                        break;
                    }
                    int newLo = hi;
                    hi += (hi - lo) << 1;
                    lo = newLo;
                }

                // we found a window containing our target, let's binary search now
                while (lo <= hi)
                {
                    int mid = (int)((uint)(lo + hi) >> 1);
                    int midWordNum = (int)wordNums.Get(mid);
                    if (midWordNum <= targetWordNum)
                    {
                        lo = mid + 1;
                    }
                    else
                    {
                        hi = mid - 1;
                    }
                }
                if (Debugging.AssertsEnabled)
                {
                    Debugging.Assert(wordNums.Get(hi) <= targetWordNum);
                    Debugging.Assert(hi + 1 == wordNums.Count || wordNums.Get(hi + 1) > targetWordNum);
                }
                return hi;
            }

            /// <summary>
            /// Advance to the first word whose number is &gt;= <paramref name="targetWordNum"/>,
            /// using the index for jumps longer than <see cref="indexThreshold"/>. </summary>
            internal virtual void AdvanceWord(int targetWordNum)
            {
                if (Debugging.AssertsEnabled) Debugging.Assert(targetWordNum > wordNum);
                int delta = targetWordNum - wordNum;
                if (delta <= allOnesLength + dirtyLength + 1)
                {
                    // The target lies within the current sequence.
                    SkipDirtyBytes(delta - 1);
                }
                else
                {
                    SkipDirtyBytes();
                    if (Debugging.AssertsEnabled) Debugging.Assert(dirtyLength == 0);
                    if (delta > indexThreshold)
                    {
                        // use the index
                        int i = ForwardBinarySearch(targetWordNum);
                        int position = (int)positions.Get(i);
                        if (position > @in.Position) // if the binary search returned a backward offset, don't move
                        {
                            wordNum = (int)wordNums.Get(i) - 1;
                            @in.Position = position;
                            sequenceNum = i * indexInterval - 1;
                        }
                    }

                    // Scan sequence by sequence until the target falls inside one.
                    while (true)
                    {
                        if (!ReadSequence())
                        {
                            return;
                        }
                        delta = targetWordNum - wordNum;
                        if (delta <= allOnesLength + dirtyLength + 1)
                        {
                            if (delta > 1)
                            {
                                SkipDirtyBytes(delta - 1);
                            }
                            break;
                        }
                        SkipDirtyBytes();
                    }
                }

                NextWord();
            }

            public override int DocID => docID;

            public override int NextDoc()
            {
                if (bitList != 0) // there are remaining bits in the current word
                {
                    // Each 4-bit nibble of bitList holds a 1-based set-bit position
                    // within the current word (see BitUtil.BitList).
                    docID = (wordNum << 3) | ((bitList & 0x0F) - 1);
                    bitList = (int)((uint)bitList >> 4);
                    return docID;
                }
                NextWord();
                if (wordNum == int.MaxValue)
                {
                    return docID = NO_MORE_DOCS;
                }
                bitList = BitUtil.BitList(word);
                if (Debugging.AssertsEnabled) Debugging.Assert(bitList != 0);
                docID = (wordNum << 3) | ((bitList & 0x0F) - 1);
                bitList = (int)((uint)bitList >> 4);
                return docID;
            }

            public override int Advance(int target)
            {
                if (Debugging.AssertsEnabled) Debugging.Assert(target > docID);
                int targetWordNum = WordNum(target);
                if (targetWordNum > this.wordNum)
                {
                    AdvanceWord(targetWordNum);
                    bitList = BitUtil.BitList(word);
                }
                // Finish with a linear NextDoc() scan up to the target.
                return SlowAdvance(target);
            }

            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            public override long GetCost()
            {
                return cardinality;
            }
        }
| |
| /// <summary> |
| /// Return the number of documents in this <see cref="DocIdSet"/> in constant time. </summary> |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| public int Cardinality() |
| { |
| return cardinality; |
| } |
| |
| /// <summary> |
| /// Return the memory usage of this class in bytes. </summary> |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| public long RamBytesUsed() |
| { |
| return RamUsageEstimator.AlignObjectSize(3 * RamUsageEstimator.NUM_BYTES_OBJECT_REF + 2 * RamUsageEstimator.NUM_BYTES_INT32) |
| + RamUsageEstimator.SizeOf(data) |
| + positions.RamBytesUsed() |
| + wordNums.RamBytesUsed(); |
| } |
| } |
| } |