| using Lucene.Net.Analysis.Tokenattributes; |
| using System; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| |
| namespace Lucene.Net.Index |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using ByteBlockPool = Lucene.Net.Util.ByteBlockPool; |
| using BytesRef = Lucene.Net.Util.BytesRef; |
| using BytesRefHash = Lucene.Net.Util.BytesRefHash; |
| using Counter = Lucene.Net.Util.Counter; |
| using IntBlockPool = Lucene.Net.Util.IntBlockPool; |
| |
| public sealed class TermsHashPerField : InvertedDocConsumerPerField |
| { |
| private const int HASH_INIT_SIZE = 4; |
| |
| internal readonly TermsHashConsumerPerField Consumer; |
| |
| internal readonly TermsHash TermsHash; |
| |
| internal readonly TermsHashPerField NextPerField; |
| internal readonly DocumentsWriterPerThread.DocState DocState; |
| internal readonly FieldInvertState FieldState; |
| internal ITermToBytesRefAttribute TermAtt; |
| internal BytesRef TermBytesRef; |
| |
| // Copied from our parent TermsHash (per-thread state) |
| internal readonly IntBlockPool IntPool; |
| |
| internal readonly ByteBlockPool BytePool; |
| internal readonly ByteBlockPool TermBytePool; |
| |
| internal readonly int StreamCount; |
| internal readonly int NumPostingInt; |
| |
| internal readonly FieldInfo FieldInfo; |
| |
| internal readonly BytesRefHash BytesHash; |
| |
| internal ParallelPostingsArray PostingsArray; |
| private readonly Counter BytesUsed; |
| |
| public TermsHashPerField(DocInverterPerField docInverterPerField, TermsHash termsHash, TermsHash nextTermsHash, FieldInfo fieldInfo) |
| { |
| IntPool = termsHash.IntPool; |
| BytePool = termsHash.BytePool; |
| TermBytePool = termsHash.TermBytePool; |
| DocState = termsHash.DocState; |
| this.TermsHash = termsHash; |
| BytesUsed = termsHash.BytesUsed; |
| FieldState = docInverterPerField.FieldState; |
| this.Consumer = termsHash.Consumer.AddField(this, fieldInfo); |
| PostingsBytesStartArray byteStarts = new PostingsBytesStartArray(this, BytesUsed); |
| BytesHash = new BytesRefHash(TermBytePool, HASH_INIT_SIZE, byteStarts); |
| StreamCount = Consumer.StreamCount; |
| NumPostingInt = 2 * StreamCount; |
| this.FieldInfo = fieldInfo; |
| if (nextTermsHash != null) |
| { |
| NextPerField = (TermsHashPerField)nextTermsHash.AddField(docInverterPerField, fieldInfo); |
| } |
| else |
| { |
| NextPerField = null; |
| } |
| } |
| |
| internal void ShrinkHash(int targetSize) |
| { |
| // Fully free the hash entries on each flush but keep the pool untouched: |
| // BytesHash.Clear(false) clears the BytesStartArray (and, in turn, the |
| // ParallelPostingsArray) without resetting the shared byte pool. |
| // Note: targetSize is currently unused; the hash is always fully cleared. |
| BytesHash.Clear(false); |
| } |
| |
| public void Reset() |
| { |
| BytesHash.Clear(false); |
| if (NextPerField != null) |
| { |
| NextPerField.Reset(); |
| } |
| } |
| |
| internal override void Abort() |
| { |
| Reset(); |
| if (NextPerField != null) |
| { |
| NextPerField.Abort(); |
| } |
| } |
| |
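| /// <summary> |
| /// Positions the reader over the given stream of the given term. Each term |
| /// owns StreamCount "upto" pointers in the int pool marking the current end |
| /// of each of its streams; ByteStarts[termID] + stream * FIRST_LEVEL_SIZE is |
| /// where the stream's first byte slice begins. |
| /// </summary> |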
| public void InitReader(ByteSliceReader reader, int termID, int stream) |
| { |
| Debug.Assert(stream < StreamCount); |
| int intStart = PostingsArray.IntStarts[termID]; |
| int[] ints = IntPool.Buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT]; |
| int upto = intStart & IntBlockPool.INT_BLOCK_MASK; |
| reader.Init(BytePool, PostingsArray.ByteStarts[termID] + stream * ByteBlockPool.FIRST_LEVEL_SIZE, ints[upto + stream]); |
| } |
| |
| /// <summary> |
| /// Collapse the hash table &amp; sort in place; returns the sorted term ids. </summary> |
| public int[] SortPostings(IComparer<BytesRef> termComp) |
| { |
| return BytesHash.Sort(termComp); |
| } |
| |
| private bool DoCall; |
| private bool DoNextCall; |
| |
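| /// <summary> |
| /// Caches the field's ITermToBytesRefAttribute and its backing BytesRef; |
| /// Add() later calls FillBytesRef() to fill it with each token's bytes. |
| /// </summary> |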
| internal override void Start(IndexableField f) |
| { |
| TermAtt = FieldState.AttributeSource_Renamed.GetAttribute<ITermToBytesRefAttribute>(); |
| TermBytesRef = TermAtt.BytesRef; |
| Consumer.Start(f); |
| if (NextPerField != null) |
| { |
| NextPerField.Start(f); |
| } |
| } |
| |
| internal override bool Start(IndexableField[] fields, int count) |
| { |
| DoCall = Consumer.Start(fields, count); |
| BytesHash.Reinit(); |
| if (NextPerField != null) |
| { |
| DoNextCall = NextPerField.Start(fields, count); |
| } |
| return DoCall || DoNextCall; |
| } |
| |
| // Secondary entry point (for the 2nd and subsequent TermsHash in |
| // the chain): the token text has already been "interned" into |
| // textStart, so we hash by textStart |
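| // textStart is the term's offset in the shared TermBytePool, assigned when |
| // the first TermsHash in the chain interned the bytes, so AddByPoolOffset |
| // can dedupe by pool offset without copying the term's bytes again. |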
| public void Add(int textStart) |
| { |
| int termID = BytesHash.AddByPoolOffset(textStart); |
| if (termID >= 0) // New posting |
| { |
| // First time we are seeing this token since we last |
| // flushed the hash. |
| // Init stream slices |
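| // A new term needs StreamCount int pointers and StreamCount first-level |
| // byte slices; advance to fresh buffers when the current ones cannot hold |
| // them (NumPostingInt = 2 * StreamCount is the bound used here). |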
| if (NumPostingInt + IntPool.IntUpto > IntBlockPool.INT_BLOCK_SIZE) |
| { |
| IntPool.NextBuffer(); |
| } |
| |
| if (ByteBlockPool.BYTE_BLOCK_SIZE - BytePool.ByteUpto < NumPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE) |
| { |
| BytePool.NextBuffer(); |
| } |
| |
| IntUptos = IntPool.Buffer; |
| IntUptoStart = IntPool.IntUpto; |
| IntPool.IntUpto += StreamCount; |
| |
| PostingsArray.IntStarts[termID] = IntUptoStart + IntPool.IntOffset; |
| |
| for (int i = 0; i < StreamCount; i++) |
| { |
| int upto = BytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE); |
| IntUptos[IntUptoStart + i] = upto + BytePool.ByteOffset; |
| } |
| PostingsArray.ByteStarts[termID] = IntUptos[IntUptoStart]; |
| |
| Consumer.NewTerm(termID); |
| } |
| else |
| { |
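| // The term already exists: BytesRefHash encodes the existing id as (-id - 1) |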
| termID = (-termID) - 1; |
| int intStart = PostingsArray.IntStarts[termID]; |
| IntUptos = IntPool.Buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT]; |
| IntUptoStart = intStart & IntBlockPool.INT_BLOCK_MASK; |
| Consumer.AddTerm(termID); |
| } |
| } |
| |
| // Primary entry point (for first TermsHash) |
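| // Called once per token: interns the term's bytes into the shared hash/pool, |
| // then initializes (new term) or locates (already-seen term) its per-stream |
| // int pointers so the consumer can append postings data. |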
| internal override void Add() |
| { |
| TermAtt.FillBytesRef(); |
| |
| // We are first in the chain, so we must "intern" the term text |
| // into a textStart address: hash the term's bytes and add them |
| // to the pool if this is the first time we see this term. |
| int termID; |
| try |
| { |
| termID = BytesHash.Add(TermBytesRef); |
| } |
| catch (BytesRefHash.MaxBytesLengthExceededException) |
| { |
| // Term is too large; record it here (we can't throw an |
| // exception now, because DocInverterPerField would then |
| // abort the entire segment) and throw later from |
| // DocInverterPerField. LengthFilter can always be used |
| // to prune the term before indexing: |
| if (DocState.MaxTermPrefix == null) |
| { |
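| // Keep only a short prefix of the oversized term for the |
| // exception message thrown later: |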
| int saved = TermBytesRef.Length; |
| try |
| { |
| TermBytesRef.Length = Math.Min(30, DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8); |
| DocState.MaxTermPrefix = TermBytesRef.ToString(); |
| } |
| finally |
| { |
| TermBytesRef.Length = saved; |
| } |
| } |
| Consumer.SkippingLongTerm(); |
| return; |
| } |
| if (termID >= 0) // New posting |
| { |
| BytesHash.ByteStart(termID); |
| // Init stream slices |
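| // Same slice bookkeeping as in Add(int textStart) above |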
| if (NumPostingInt + IntPool.IntUpto > IntBlockPool.INT_BLOCK_SIZE) |
| { |
| IntPool.NextBuffer(); |
| } |
| |
| if (ByteBlockPool.BYTE_BLOCK_SIZE - BytePool.ByteUpto < NumPostingInt * ByteBlockPool.FIRST_LEVEL_SIZE) |
| { |
| BytePool.NextBuffer(); |
| } |
| |
| IntUptos = IntPool.Buffer; |
| IntUptoStart = IntPool.IntUpto; |
| IntPool.IntUpto += StreamCount; |
| |
| PostingsArray.IntStarts[termID] = IntUptoStart + IntPool.IntOffset; |
| |
| for (int i = 0; i < StreamCount; i++) |
| { |
| int upto = BytePool.NewSlice(ByteBlockPool.FIRST_LEVEL_SIZE); |
| IntUptos[IntUptoStart + i] = upto + BytePool.ByteOffset; |
| } |
| PostingsArray.ByteStarts[termID] = IntUptos[IntUptoStart]; |
| |
| Consumer.NewTerm(termID); |
| } |
| else |
| { |
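| // The term already exists: BytesRefHash encodes the existing id as (-id - 1) |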
| termID = (-termID) - 1; |
| int intStart = PostingsArray.IntStarts[termID]; |
| IntUptos = IntPool.Buffers[intStart >> IntBlockPool.INT_BLOCK_SHIFT]; |
| IntUptoStart = intStart & IntBlockPool.INT_BLOCK_MASK; |
| Consumer.AddTerm(termID); |
| } |
| |
| if (DoNextCall) |
| { |
| NextPerField.Add(PostingsArray.TextStarts[termID]); |
| } |
| } |
| |
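| // Scratch state for the term currently being written: the int buffer holding |
| // its per-stream "upto" pointers, and the offset of the first pointer. |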
| internal int[] IntUptos; |
| internal int IntUptoStart; |
| |
| internal void WriteByte(int stream, sbyte b) |
| { |
| WriteByte(stream, (byte)b); |
| } |
| |
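| /// <summary> |
| /// Appends one byte to the given stream. Byte-pool slices end with a non-zero |
| /// sentinel, so finding a non-zero byte at the write position means the slice |
| /// is full and a new, larger slice must be chained onto it. |
| /// </summary> |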
| internal void WriteByte(int stream, byte b) |
| { |
| int upto = IntUptos[IntUptoStart + stream]; |
| var bytes = BytePool.Buffers[upto >> ByteBlockPool.BYTE_BLOCK_SHIFT]; |
| Debug.Assert(bytes != null); |
| int offset = upto & ByteBlockPool.BYTE_BLOCK_MASK; |
| if (bytes[offset] != 0) |
| { |
| // End of slice; allocate a new one |
| offset = BytePool.AllocSlice(bytes, offset); |
| bytes = BytePool.Buffer; |
| IntUptos[IntUptoStart + stream] = offset + BytePool.ByteOffset; |
| } |
| bytes[offset] = b; |
| (IntUptos[IntUptoStart + stream])++; |
| } |
| |
| public void WriteBytes(int stream, byte[] b, int offset, int len) |
| { |
| // TODO: optimize |
| int end = offset + len; |
| for (int i = offset; i < end; i++) |
| { |
| WriteByte(stream, b[i]); |
| } |
| } |
| |
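| /// <summary> |
| /// Writes i to the stream as a variable-length int: seven bits per byte, |
| /// low-order first, with the high bit set on every byte except the last. |
| /// For example, 300 (binary 100101100) is written as the bytes 0xAC 0x02. |
| /// </summary> |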
| internal void WriteVInt(int stream, int i) |
| { |
| Debug.Assert(stream < StreamCount); |
| while ((i & ~0x7F) != 0) |
| { |
| WriteByte(stream, unchecked((sbyte)((i & 0x7f) | 0x80))); |
| i = (int)((uint)i >> 7); |
| } |
| WriteByte(stream, (sbyte)i); |
| } |
| |
| internal override void Finish() |
| { |
| Consumer.Finish(); |
| if (NextPerField != null) |
| { |
| NextPerField.Finish(); |
| } |
| } |
| |
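| // Backs the BytesRefHash with the ParallelPostingsArray's TextStarts array |
| // and keeps the shared bytesUsed Counter in sync as the postings array is |
| // lazily allocated, grown, and cleared. |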
| private sealed class PostingsBytesStartArray : BytesRefHash.BytesStartArray |
| { |
| internal readonly TermsHashPerField PerField; |
| internal readonly Counter BytesUsed_Renamed; |
| |
| internal PostingsBytesStartArray(TermsHashPerField perField, Counter bytesUsed) |
| { |
| this.PerField = perField; |
| this.BytesUsed_Renamed = bytesUsed; |
| } |
| |
| public override int[] Init() |
| { |
| if (PerField.PostingsArray == null) |
| { |
| PerField.PostingsArray = PerField.Consumer.CreatePostingsArray(2); |
| BytesUsed_Renamed.AddAndGet(PerField.PostingsArray.Size * PerField.PostingsArray.BytesPerPosting()); |
| } |
| return PerField.PostingsArray.TextStarts; |
| } |
| |
| public override int[] Grow() |
| { |
| ParallelPostingsArray postingsArray = PerField.PostingsArray; |
| int oldSize = PerField.PostingsArray.Size; |
| postingsArray = PerField.PostingsArray = postingsArray.Grow(); |
| BytesUsed_Renamed.AddAndGet((postingsArray.BytesPerPosting() * (postingsArray.Size - oldSize))); |
| return postingsArray.TextStarts; |
| } |
| |
| public override int[] Clear() |
| { |
| if (PerField.PostingsArray != null) |
| { |
| BytesUsed_Renamed.AddAndGet(-(PerField.PostingsArray.Size * PerField.PostingsArray.BytesPerPosting())); |
| PerField.PostingsArray = null; |
| } |
| return null; |
| } |
| |
| public override Counter BytesUsed() |
| { |
| return BytesUsed_Renamed; |
| } |
| } |
| } |
| } |