| using Lucene.Net.Diagnostics; |
| using Lucene.Net.Support; |
| using System; |
| using System.Collections.Generic; |
| using System.Runtime.CompilerServices; |
| using JCG = J2N.Collections.Generic; |
| using ArrayUtil = Lucene.Net.Util.ArrayUtil; |
| |
| namespace Lucene.Net.Codecs.Compressing |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using AtomicReader = Lucene.Net.Index.AtomicReader; |
| using IBits = Lucene.Net.Util.IBits; |
| using BlockPackedWriter = Lucene.Net.Util.Packed.BlockPackedWriter; |
| using BufferedChecksumIndexInput = Lucene.Net.Store.BufferedChecksumIndexInput; |
| using BytesRef = Lucene.Net.Util.BytesRef; |
| using ChecksumIndexInput = Lucene.Net.Store.ChecksumIndexInput; |
| using DataInput = Lucene.Net.Store.DataInput; |
| using Directory = Lucene.Net.Store.Directory; |
| using FieldInfo = Lucene.Net.Index.FieldInfo; |
| using FieldInfos = Lucene.Net.Index.FieldInfos; |
| using Fields = Lucene.Net.Index.Fields; |
| using GrowableByteArrayDataOutput = Lucene.Net.Util.GrowableByteArrayDataOutput; |
| using IndexFileNames = Lucene.Net.Index.IndexFileNames; |
| using IndexInput = Lucene.Net.Store.IndexInput; |
| using IndexOutput = Lucene.Net.Store.IndexOutput; |
| using IOContext = Lucene.Net.Store.IOContext; |
| using IOUtils = Lucene.Net.Util.IOUtils; |
| using MergeState = Lucene.Net.Index.MergeState; |
| using PackedInt32s = Lucene.Net.Util.Packed.PackedInt32s; |
| using SegmentInfo = Lucene.Net.Index.SegmentInfo; |
| using SegmentReader = Lucene.Net.Index.SegmentReader; |
| using StringHelper = Lucene.Net.Util.StringHelper; |
| |
| /// <summary> |
| /// <see cref="TermVectorsWriter"/> for <see cref="CompressingTermVectorsFormat"/>. |
| /// <para/> |
| /// @lucene.experimental |
| /// </summary> |
| public sealed class CompressingTermVectorsWriter : TermVectorsWriter |
| { |
| // hard limit on the maximum number of documents per chunk |
| internal const int MAX_DOCUMENTS_PER_CHUNK = 128; |
| |
| internal const string VECTORS_EXTENSION = "tvd"; |
| internal const string VECTORS_INDEX_EXTENSION = "tvx"; |
| |
| internal const string CODEC_SFX_IDX = "Index"; |
| internal const string CODEC_SFX_DAT = "Data"; |
| |
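| // format version history: VERSION_CHECKSUM added a CRC32 footer (written |
| // via CodecUtil.WriteFooter) so that both files can be integrity-checked |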
| internal const int VERSION_START = 0; |
| internal const int VERSION_CHECKSUM = 1; |
| internal const int VERSION_CURRENT = VERSION_CHECKSUM; |
| |
| internal const int BLOCK_SIZE = 64; |
| |
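| // per-field flags; FlushFlags() packs each field's flags into FLAGS_BITS bits |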
| internal const int POSITIONS = 0x01; |
| internal const int OFFSETS = 0x02; |
| internal const int PAYLOADS = 0x04; |
| internal static readonly int FLAGS_BITS = PackedInt32s.BitsRequired(POSITIONS | OFFSETS | PAYLOADS); |
| |
| private readonly Directory directory; |
| private readonly string segment; |
| private readonly string segmentSuffix; |
| #pragma warning disable CA2213 // Disposable fields should be disposed |
| private CompressingStoredFieldsIndexWriter indexWriter; |
| private IndexOutput vectorsStream; |
| #pragma warning restore CA2213 // Disposable fields should be disposed |
| |
| private readonly CompressionMode compressionMode; |
| private readonly Compressor compressor; |
| private readonly int chunkSize; |
| |
| /// <summary> |
| /// A pending doc. </summary> |
| private class DocData |
| { |
| private readonly CompressingTermVectorsWriter outerInstance; |
| |
| internal readonly int numFields; |
| internal readonly LinkedList<FieldData> fields; |
| internal readonly int posStart, offStart, payStart; |
| |
| internal DocData(CompressingTermVectorsWriter outerInstance, int numFields, int posStart, int offStart, int payStart) |
| { |
| this.outerInstance = outerInstance; |
| this.numFields = numFields; |
| this.fields = new LinkedList<FieldData>(); |
| this.posStart = posStart; |
| this.offStart = offStart; |
| this.payStart = payStart; |
| } |
| |
| internal virtual FieldData AddField(int fieldNum, int numTerms, bool positions, bool offsets, bool payloads) |
| { |
| FieldData field; |
| if (fields.Count == 0) |
| { |
| field = new FieldData(outerInstance, fieldNum, numTerms, positions, offsets, payloads, posStart, offStart, payStart); |
| } |
| else |
| { |
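| // consecutive fields of a chunk share the positions/offsets/payload-length |
| // buffers, so this field's data starts right where the previous field's ends |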
| FieldData last = fields.Last.Value; |
| int posStart = last.posStart + (last.hasPositions ? last.totalPositions : 0); |
| int offStart = last.offStart + (last.hasOffsets ? last.totalPositions : 0); |
| int payStart = last.payStart + (last.hasPayloads ? last.totalPositions : 0); |
| field = new FieldData(outerInstance, fieldNum, numTerms, positions, offsets, payloads, posStart, offStart, payStart); |
| } |
| fields.AddLast(field); |
| return field; |
| } |
| } |
| |
| private DocData AddDocData(int numVectorFields) |
| { |
| FieldData last = null; |
| // LUCENENET specific - quicker just to use the linked list properties |
| // to walk backward, since we are only looking for the last element with |
| // fields. |
| var doc = pendingDocs.Last; |
| while (doc != null) |
| { |
| if (doc.Value.fields.Count > 0) |
| { |
| last = doc.Value.fields.Last.Value; |
| break; |
| } |
| doc = doc.Previous; |
| } |
| DocData newDoc; |
| if (last == null) |
| { |
| newDoc = new DocData(this, numVectorFields, 0, 0, 0); |
| } |
| else |
| { |
| int posStart = last.posStart + (last.hasPositions ? last.totalPositions : 0); |
| int offStart = last.offStart + (last.hasOffsets ? last.totalPositions : 0); |
| int payStart = last.payStart + (last.hasPayloads ? last.totalPositions : 0); |
| newDoc = new DocData(this, numVectorFields, posStart, offStart, payStart); |
| } |
| pendingDocs.AddLast(newDoc); |
| return newDoc; |
| } |
| |
| /// <summary> |
| /// A pending field. </summary> |
| private class FieldData |
| { |
| private readonly CompressingTermVectorsWriter outerInstance; |
| |
| internal readonly bool hasPositions, hasOffsets, hasPayloads; |
| internal readonly int fieldNum, flags, numTerms; |
| internal readonly int[] freqs, prefixLengths, suffixLengths; |
| internal readonly int posStart, offStart, payStart; |
| internal int totalPositions; |
| internal int ord; |
| |
| internal FieldData(CompressingTermVectorsWriter outerInstance, int fieldNum, int numTerms, bool positions, bool offsets, bool payloads, int posStart, int offStart, int payStart) |
| { |
| this.outerInstance = outerInstance; |
| this.fieldNum = fieldNum; |
| this.numTerms = numTerms; |
| this.hasPositions = positions; |
| this.hasOffsets = offsets; |
| this.hasPayloads = payloads; |
| this.flags = (positions ? POSITIONS : 0) | (offsets ? OFFSETS : 0) | (payloads ? PAYLOADS : 0); |
| this.freqs = new int[numTerms]; |
| this.prefixLengths = new int[numTerms]; |
| this.suffixLengths = new int[numTerms]; |
| this.posStart = posStart; |
| this.offStart = offStart; |
| this.payStart = payStart; |
| totalPositions = 0; |
| ord = 0; |
| } |
| |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| internal virtual void AddTerm(int freq, int prefixLength, int suffixLength) |
| { |
| freqs[ord] = freq; |
| prefixLengths[ord] = prefixLength; |
| suffixLengths[ord] = suffixLength; |
| ++ord; |
| } |
| |
| internal virtual void AddPosition(int position, int startOffset, int length, int payloadLength) |
| { |
| if (hasPositions) |
| { |
| if (posStart + totalPositions == outerInstance.positionsBuf.Length) |
| { |
| outerInstance.positionsBuf = ArrayUtil.Grow(outerInstance.positionsBuf); |
| } |
| outerInstance.positionsBuf[posStart + totalPositions] = position; |
| } |
| if (hasOffsets) |
| { |
| if (offStart + totalPositions == outerInstance.startOffsetsBuf.Length) |
| { |
| int newLength = ArrayUtil.Oversize(offStart + totalPositions, 4); |
| outerInstance.startOffsetsBuf = Arrays.CopyOf(outerInstance.startOffsetsBuf, newLength); |
| outerInstance.lengthsBuf = Arrays.CopyOf(outerInstance.lengthsBuf, newLength); |
| } |
| outerInstance.startOffsetsBuf[offStart + totalPositions] = startOffset; |
| outerInstance.lengthsBuf[offStart + totalPositions] = length; |
| } |
| if (hasPayloads) |
| { |
| if (payStart + totalPositions == outerInstance.payloadLengthsBuf.Length) |
| { |
| outerInstance.payloadLengthsBuf = ArrayUtil.Grow(outerInstance.payloadLengthsBuf); |
| } |
| outerInstance.payloadLengthsBuf[payStart + totalPositions] = payloadLength; |
| } |
| ++totalPositions; |
| } |
| } |
| |
| private int numDocs; // total number of docs seen |
| private readonly LinkedList<DocData> pendingDocs; // pending docs |
| private DocData curDoc; // current document |
| private FieldData curField; // current field |
| private readonly BytesRef lastTerm; |
| private int[] positionsBuf, startOffsetsBuf, lengthsBuf, payloadLengthsBuf; |
| private readonly GrowableByteArrayDataOutput termSuffixes; // buffered term suffixes |
| private readonly GrowableByteArrayDataOutput payloadBytes; // buffered term payloads |
| private readonly BlockPackedWriter writer; |
| |
| /// <summary> |
| /// Sole constructor. </summary> |
| public CompressingTermVectorsWriter(Directory directory, SegmentInfo si, string segmentSuffix, IOContext context, string formatName, CompressionMode compressionMode, int chunkSize) |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(directory != null); |
| this.directory = directory; |
| this.segment = si.Name; |
| this.segmentSuffix = segmentSuffix; |
| this.compressionMode = compressionMode; |
| this.compressor = compressionMode.NewCompressor(); |
| this.chunkSize = chunkSize; |
| |
| numDocs = 0; |
| pendingDocs = new LinkedList<DocData>(); |
| termSuffixes = new GrowableByteArrayDataOutput(ArrayUtil.Oversize(chunkSize, 1)); |
| payloadBytes = new GrowableByteArrayDataOutput(ArrayUtil.Oversize(1, 1)); |
| lastTerm = new BytesRef(ArrayUtil.Oversize(30, 1)); |
| |
| bool success = false; |
| IndexOutput indexStream = directory.CreateOutput(IndexFileNames.SegmentFileName(segment, segmentSuffix, VECTORS_INDEX_EXTENSION), context); |
| try |
| { |
| vectorsStream = directory.CreateOutput(IndexFileNames.SegmentFileName(segment, segmentSuffix, VECTORS_EXTENSION), context); |
| |
| string codecNameIdx = formatName + CODEC_SFX_IDX; |
| string codecNameDat = formatName + CODEC_SFX_DAT; |
| CodecUtil.WriteHeader(indexStream, codecNameIdx, VERSION_CURRENT); |
| CodecUtil.WriteHeader(vectorsStream, codecNameDat, VERSION_CURRENT); |
| if (Debugging.AssertsEnabled) |
| { |
| Debugging.Assert(CodecUtil.HeaderLength(codecNameDat) == vectorsStream.GetFilePointer()); |
| Debugging.Assert(CodecUtil.HeaderLength(codecNameIdx) == indexStream.GetFilePointer()); |
| } |
| |
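| // the index writer takes ownership of indexStream; null it out so that |
| // the failure path below does not dispose it a second time |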
| indexWriter = new CompressingStoredFieldsIndexWriter(indexStream); |
| indexStream = null; |
| |
| vectorsStream.WriteVInt32(PackedInt32s.VERSION_CURRENT); |
| vectorsStream.WriteVInt32(chunkSize); |
| writer = new BlockPackedWriter(vectorsStream, BLOCK_SIZE); |
| |
| positionsBuf = new int[1024]; |
| startOffsetsBuf = new int[1024]; |
| lengthsBuf = new int[1024]; |
| payloadLengthsBuf = new int[1024]; |
| |
| success = true; |
| } |
| finally |
| { |
| if (!success) |
| { |
| IOUtils.DisposeWhileHandlingException(indexStream); |
| Abort(); |
| } |
| } |
| } |
| |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| protected override void Dispose(bool disposing) |
| { |
| if (disposing) |
| { |
| try |
| { |
| IOUtils.Dispose(vectorsStream, indexWriter); |
| } |
| finally |
| { |
| vectorsStream = null; |
| indexWriter = null; |
| } |
| } |
| } |
| |
| [MethodImpl(MethodImplOptions.NoInlining)] |
| public override void Abort() |
| { |
| IOUtils.DisposeWhileHandlingException(this); |
| IOUtils.DeleteFilesIgnoringExceptions(directory, IndexFileNames.SegmentFileName(segment, segmentSuffix, VECTORS_EXTENSION), IndexFileNames.SegmentFileName(segment, segmentSuffix, VECTORS_INDEX_EXTENSION)); |
| } |
| |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| public override void StartDocument(int numVectorFields) |
| { |
| curDoc = AddDocData(numVectorFields); |
| } |
| |
| [MethodImpl(MethodImplOptions.NoInlining)] |
| public override void FinishDocument() |
| { |
| // append the payload bytes of the doc after its terms |
| termSuffixes.WriteBytes(payloadBytes.Bytes, payloadBytes.Length); |
| payloadBytes.Length = 0; |
| ++numDocs; |
| if (TriggerFlush()) |
| { |
| Flush(); |
| } |
| curDoc = null; |
| } |
| |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| public override void StartField(FieldInfo info, int numTerms, bool positions, bool offsets, bool payloads) |
| { |
| curField = curDoc.AddField(info.Number, numTerms, positions, offsets, payloads); |
| lastTerm.Length = 0; |
| } |
| |
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| public override void FinishField() |
| { |
| curField = null; |
| } |
| |
| public override void StartTerm(BytesRef term, int freq) |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(freq >= 1); |
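| // terms of a field are written with shared-prefix compression: only the |
| // suffix relative to the previous term is buffered in termSuffixes |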
| int prefix = StringHelper.BytesDifference(lastTerm, term); |
| curField.AddTerm(freq, prefix, term.Length - prefix); |
| termSuffixes.WriteBytes(term.Bytes, term.Offset + prefix, term.Length - prefix); |
| // copy last term |
| if (lastTerm.Bytes.Length < term.Length) |
| { |
| lastTerm.Bytes = new byte[ArrayUtil.Oversize(term.Length, 1)]; |
| } |
| lastTerm.Offset = 0; |
| lastTerm.Length = term.Length; |
| Array.Copy(term.Bytes, term.Offset, lastTerm.Bytes, 0, term.Length); |
| } |
| |
| public override void AddPosition(int position, int startOffset, int endOffset, BytesRef payload) |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(curField.flags != 0); |
| curField.AddPosition(position, startOffset, endOffset - startOffset, payload == null ? 0 : payload.Length); |
| if (curField.hasPayloads && payload != null) |
| { |
| payloadBytes.WriteBytes(payload.Bytes, payload.Offset, payload.Length); |
| } |
| } |
| |
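| /// <summary> |
| /// A chunk is flushed once the buffered term suffixes reach <c>chunkSize</c> |
| /// bytes or the hard cap of <see cref="MAX_DOCUMENTS_PER_CHUNK"/> pending |
| /// documents is reached. |
| /// </summary> |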
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| private bool TriggerFlush() |
| { |
| return termSuffixes.Length >= chunkSize || pendingDocs.Count >= MAX_DOCUMENTS_PER_CHUNK; |
| } |
| |
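| /// <summary> |
| /// Writes all pending documents as a single chunk: the packed metadata |
| /// streams (field numbers, flags, term, freq, position, offset and payload |
| /// data) come first, followed by the compressed concatenation of the |
| /// buffered term suffixes and payload bytes. |
| /// </summary> |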
| [MethodImpl(MethodImplOptions.NoInlining)] |
| private void Flush() |
| { |
| int chunkDocs = pendingDocs.Count; |
| if (Debugging.AssertsEnabled) Debugging.Assert(chunkDocs > 0, "{0}", chunkDocs); |
| |
| // write the index file |
| indexWriter.WriteIndex(chunkDocs, vectorsStream.GetFilePointer()); |
| |
| int docBase = numDocs - chunkDocs; |
| vectorsStream.WriteVInt32(docBase); |
| vectorsStream.WriteVInt32(chunkDocs); |
| |
| // total number of fields of the chunk |
| int totalFields = FlushNumFields(chunkDocs); |
| |
| if (totalFields > 0) |
| { |
| // unique field numbers (sorted) |
| int[] fieldNums = FlushFieldNums(); |
| // offsets in the array of unique field numbers |
| FlushFields(totalFields, fieldNums); |
| // flags (does the field have positions, offsets, payloads?) |
| FlushFlags(totalFields, fieldNums); |
| // number of terms of each field |
| FlushNumTerms(totalFields); |
| // prefix and suffix lengths for each field |
| FlushTermLengths(); |
| // term freqs - 1 (because termFreq is always >=1) for each term |
| FlushTermFreqs(); |
| // positions for all terms, when enabled |
| FlushPositions(); |
| // offsets for all terms, when enabled |
| FlushOffsets(fieldNums); |
| // payload lengths for all terms, when enabled |
| FlushPayloadLengths(); |
| |
| // compress terms and payloads and write them to the output |
| compressor.Compress(termSuffixes.Bytes, 0, termSuffixes.Length, vectorsStream); |
| } |
| |
| // reset |
| pendingDocs.Clear(); |
| curDoc = null; |
| curField = null; |
| termSuffixes.Length = 0; |
| } |
| |
| private int FlushNumFields(int chunkDocs) |
| { |
| if (chunkDocs == 1) |
| { |
| int numFields = pendingDocs.First.Value.numFields; |
| vectorsStream.WriteVInt32(numFields); |
| return numFields; |
| } |
| else |
| { |
| writer.Reset(vectorsStream); |
| int totalFields = 0; |
| foreach (DocData dd in pendingDocs) |
| { |
| writer.Add(dd.numFields); |
| totalFields += dd.numFields; |
| } |
| writer.Finish(); |
| return totalFields; |
| } |
| } |
| |
| /// <summary> |
| /// Returns a sorted array containing unique field numbers. </summary> |
| private int[] FlushFieldNums() |
| { |
| JCG.SortedSet<int> fieldNums = new JCG.SortedSet<int>(); |
| foreach (DocData dd in pendingDocs) |
| { |
| foreach (FieldData fd in dd.fields) |
| { |
| fieldNums.Add(fd.fieldNum); |
| } |
| } |
| |
| int numDistinctFields = fieldNums.Count; |
| if (Debugging.AssertsEnabled) Debugging.Assert(numDistinctFields > 0); |
| int bitsRequired = PackedInt32s.BitsRequired(fieldNums.Max); |
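| // token layout: the 3 high bits hold min(numDistinctFields - 1, 7) and the |
| // 5 low bits hold bitsRequired; a saturated count continues as a VInt below |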
| int token = (Math.Min(numDistinctFields - 1, 0x07) << 5) | bitsRequired; |
| vectorsStream.WriteByte((byte)token); |
| if (numDistinctFields - 1 >= 0x07) |
| { |
| vectorsStream.WriteVInt32(numDistinctFields - 1 - 0x07); |
| } |
| PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(vectorsStream, PackedInt32s.Format.PACKED, fieldNums.Count, bitsRequired, 1); |
| foreach (int fieldNum in fieldNums) |
| { |
| writer.Add(fieldNum); |
| } |
| writer.Finish(); |
| |
| int[] fns = new int[fieldNums.Count]; |
| int i = 0; |
| foreach (int key in fieldNums) |
| { |
| fns[i++] = key; |
| } |
| return fns; |
| } |
| |
| private void FlushFields(int totalFields, int[] fieldNums) |
| { |
| PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(vectorsStream, PackedInt32s.Format.PACKED, totalFields, PackedInt32s.BitsRequired(fieldNums.Length - 1), 1); |
| foreach (DocData dd in pendingDocs) |
| { |
| foreach (FieldData fd in dd.fields) |
| { |
| int fieldNumIndex = Array.BinarySearch(fieldNums, fd.fieldNum); |
| if (Debugging.AssertsEnabled) Debugging.Assert(fieldNumIndex >= 0); |
| writer.Add(fieldNumIndex); |
| } |
| } |
| writer.Finish(); |
| } |
| |
| private void FlushFlags(int totalFields, int[] fieldNums) |
| { |
| // check if fields always have the same flags |
| bool nonChangingFlags = true; |
| int[] fieldFlags = new int[fieldNums.Length]; |
| Arrays.Fill(fieldFlags, -1); |
| bool breakOuterLoop; |
| foreach (DocData dd in pendingDocs) |
| { |
| breakOuterLoop = false; |
| foreach (FieldData fd in dd.fields) |
| { |
| int fieldNumOff = Array.BinarySearch(fieldNums, fd.fieldNum); |
| if (Debugging.AssertsEnabled) Debugging.Assert(fieldNumOff >= 0); |
| if (fieldFlags[fieldNumOff] == -1) |
| { |
| fieldFlags[fieldNumOff] = fd.flags; |
| } |
| else if (fieldFlags[fieldNumOff] != fd.flags) |
| { |
| nonChangingFlags = false; |
| breakOuterLoop = true; |
| break; |
| } |
| } |
| if (breakOuterLoop) |
| break; |
| } |
| |
| if (nonChangingFlags) |
| { |
| // write one flag per field num |
| vectorsStream.WriteVInt32(0); |
| PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(vectorsStream, PackedInt32s.Format.PACKED, fieldFlags.Length, FLAGS_BITS, 1); |
| foreach (int flags in fieldFlags) |
| { |
| if (Debugging.AssertsEnabled) Debugging.Assert(flags >= 0); |
| writer.Add(flags); |
| } |
| if (Debugging.AssertsEnabled) Debugging.Assert(writer.Ord == fieldFlags.Length - 1); |
| writer.Finish(); |
| } |
| else |
| { |
| // write one flag for every field instance |
| vectorsStream.WriteVInt32(1); |
| PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(vectorsStream, PackedInt32s.Format.PACKED, totalFields, FLAGS_BITS, 1); |
| foreach (DocData dd in pendingDocs) |
| { |
| foreach (FieldData fd in dd.fields) |
| { |
| writer.Add(fd.flags); |
| } |
| } |
| if (Debugging.AssertsEnabled) Debugging.Assert(writer.Ord == totalFields - 1); |
| writer.Finish(); |
| } |
| } |
| |
| private void FlushNumTerms(int totalFields) |
| { |
| int maxNumTerms = 0; |
| foreach (DocData dd in pendingDocs) |
| { |
| foreach (FieldData fd in dd.fields) |
| { |
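| // OR-ing all counts together produces a value that needs exactly as many |
| // bits as the largest count, which is all BitsRequired() below looks at |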
| maxNumTerms |= fd.numTerms; |
| } |
| } |
| int bitsRequired = PackedInt32s.BitsRequired(maxNumTerms); |
| vectorsStream.WriteVInt32(bitsRequired); |
| PackedInt32s.Writer writer = PackedInt32s.GetWriterNoHeader(vectorsStream, PackedInt32s.Format.PACKED, totalFields, bitsRequired, 1); |
| foreach (DocData dd in pendingDocs) |
| { |
| foreach (FieldData fd in dd.fields) |
| { |
| writer.Add(fd.numTerms); |
| } |
| } |
| if (Debugging.AssertsEnabled) Debugging.Assert(writer.Ord == totalFields - 1); |
| writer.Finish(); |
| } |
| |
| private void FlushTermLengths() |
| { |
| writer.Reset(vectorsStream); |
| foreach (DocData dd in pendingDocs) |
| { |
| foreach (FieldData fd in dd.fields) |
| { |
| for (int i = 0; i < fd.numTerms; ++i) |
| { |
| writer.Add(fd.prefixLengths[i]); |
| } |
| } |
| } |
| writer.Finish(); |
| writer.Reset(vectorsStream); |
| foreach (DocData dd in pendingDocs) |
| { |
| foreach (FieldData fd in dd.fields) |
| { |
| for (int i = 0; i < fd.numTerms; ++i) |
| { |
| writer.Add(fd.suffixLengths[i]); |
| } |
| } |
| } |
| writer.Finish(); |
| } |
| |
| private void FlushTermFreqs() |
| { |
| writer.Reset(vectorsStream); |
| foreach (DocData dd in pendingDocs) |
| { |
| foreach (FieldData fd in dd.fields) |
| { |
| for (int i = 0; i < fd.numTerms; ++i) |
| { |
| writer.Add(fd.freqs[i] - 1); |
| } |
| } |
| } |
| writer.Finish(); |
| } |
| |
| private void FlushPositions() |
| { |
| writer.Reset(vectorsStream); |
| foreach (DocData dd in pendingDocs) |
| { |
| foreach (FieldData fd in dd.fields) |
| { |
| if (fd.hasPositions) |
| { |
| int pos = 0; |
| for (int i = 0; i < fd.numTerms; ++i) |
| { |
| int previousPosition = 0; |
| for (int j = 0; j < fd.freqs[i]; ++j) |
| { |
| int position = positionsBuf[fd.posStart + pos++]; |
| writer.Add(position - previousPosition); |
| previousPosition = position; |
| } |
| } |
| if (Debugging.AssertsEnabled) Debugging.Assert(pos == fd.totalPositions); |
| } |
| } |
| } |
| writer.Finish(); |
| } |
| |
| private void FlushOffsets(int[] fieldNums) |
| { |
| bool hasOffsets = false; |
| long[] sumPos = new long[fieldNums.Length]; |
| long[] sumOffsets = new long[fieldNums.Length]; |
| foreach (DocData dd in pendingDocs) |
| { |
| foreach (FieldData fd in dd.fields) |
| { |
| hasOffsets |= fd.hasOffsets; |
| if (fd.hasOffsets && fd.hasPositions) |
| { |
| int fieldNumOff = Array.BinarySearch(fieldNums, fd.fieldNum); |
| int pos = 0; |
| for (int i = 0; i < fd.numTerms; ++i) |
| { |
| int previousPos = 0; |
| int previousOff = 0; |
| for (int j = 0; j < fd.freqs[i]; ++j) |
| { |
| int position = positionsBuf[fd.posStart + pos]; |
| int startOffset = startOffsetsBuf[fd.offStart + pos]; |
| sumPos[fieldNumOff] += position - previousPos; |
| sumOffsets[fieldNumOff] += startOffset - previousOff; |
| previousPos = position; |
| previousOff = startOffset; |
| ++pos; |
| } |
| } |
| if (Debugging.AssertsEnabled) Debugging.Assert(pos == fd.totalPositions); |
| } |
| } |
| } |
| |
| if (!hasOffsets) |
| { |
| // nothing to do |
| return; |
| } |
| |
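| // estimate the average number of characters per position for each field; |
| // start offsets are then delta-encoded against their expected value |
| // (previous offset + charsPerTerm * position delta), keeping deltas small |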
| float[] charsPerTerm = new float[fieldNums.Length]; |
| for (int i = 0; i < fieldNums.Length; ++i) |
| { |
| charsPerTerm[i] = (sumPos[i] <= 0 || sumOffsets[i] <= 0) ? 0 : (float)((double)sumOffsets[i] / sumPos[i]); |
| } |
| |
| // start offsets |
| for (int i = 0; i < fieldNums.Length; ++i) |
| { |
| vectorsStream.WriteInt32(J2N.BitConversion.SingleToRawInt32Bits(charsPerTerm[i])); |
| } |
| |
| writer.Reset(vectorsStream); |
| foreach (DocData dd in pendingDocs) |
| { |
| foreach (FieldData fd in dd.fields) |
| { |
| if ((fd.flags & OFFSETS) != 0) |
| { |
| int fieldNumOff = Array.BinarySearch(fieldNums, fd.fieldNum); |
| float cpt = charsPerTerm[fieldNumOff]; |
| int pos = 0; |
| for (int i = 0; i < fd.numTerms; ++i) |
| { |
| int previousPos = 0; |
| int previousOff = 0; |
| for (int j = 0; j < fd.freqs[i]; ++j) |
| { |
| int position = fd.hasPositions ? positionsBuf[fd.posStart + pos] : 0; |
| int startOffset = startOffsetsBuf[fd.offStart + pos]; |
| writer.Add(startOffset - previousOff - (int)(cpt * (position - previousPos))); |
| previousPos = position; |
| previousOff = startOffset; |
| ++pos; |
| } |
| } |
| } |
| } |
| } |
| writer.Finish(); |
| |
| // lengths |
| writer.Reset(vectorsStream); |
| foreach (DocData dd in pendingDocs) |
| { |
| foreach (FieldData fd in dd.fields) |
| { |
| if ((fd.flags & OFFSETS) != 0) |
| { |
| int pos = 0; |
| for (int i = 0; i < fd.numTerms; ++i) |
| { |
| for (int j = 0; j < fd.freqs[i]; ++j) |
| { |
| writer.Add(lengthsBuf[fd.offStart + pos++] - fd.prefixLengths[i] - fd.suffixLengths[i]); |
| } |
| } |
| if (Debugging.AssertsEnabled) Debugging.Assert(pos == fd.totalPositions); |
| } |
| } |
| } |
| writer.Finish(); |
| } |
| |
| private void FlushPayloadLengths() |
| { |
| writer.Reset(vectorsStream); |
| foreach (DocData dd in pendingDocs) |
| { |
| foreach (FieldData fd in dd.fields) |
| { |
| if (fd.hasPayloads) |
| { |
| for (int i = 0; i < fd.totalPositions; ++i) |
| { |
| writer.Add(payloadLengthsBuf[fd.payStart + i]); |
| } |
| } |
| } |
| } |
| writer.Finish(); |
| } |
| |
| public override void Finish(FieldInfos fis, int numDocs) |
| { |
| if (pendingDocs.Count > 0) |
| { |
| Flush(); |
| } |
| if (numDocs != this.numDocs) |
| { |
| throw new Exception("Wrote " + this.numDocs + " docs, finish called with numDocs=" + numDocs); |
| } |
| indexWriter.Finish(numDocs, vectorsStream.GetFilePointer()); |
| CodecUtil.WriteFooter(vectorsStream); |
| } |
| |
| public override IComparer<BytesRef> Comparer => BytesRef.UTF8SortedAsUnicodeComparer; |
| |
| public override void AddProx(int numProx, DataInput positions, DataInput offsets) |
| { |
| if (Debugging.AssertsEnabled) |
| { |
| Debugging.Assert((curField.hasPositions) == (positions != null)); |
| Debugging.Assert((curField.hasOffsets) == (offsets != null)); |
| } |
| |
| if (curField.hasPositions) |
| { |
| int posStart = curField.posStart + curField.totalPositions; |
| if (posStart + numProx > positionsBuf.Length) |
| { |
| positionsBuf = ArrayUtil.Grow(positionsBuf, posStart + numProx); |
| } |
| int position = 0; |
| if (curField.hasPayloads) |
| { |
| int payStart = curField.payStart + curField.totalPositions; |
| if (payStart + numProx > payloadLengthsBuf.Length) |
| { |
| payloadLengthsBuf = ArrayUtil.Grow(payloadLengthsBuf, payStart + numProx); |
| } |
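| // each entry is a VInt whose low bit flags a payload (its length follows |
| // as another VInt); the remaining high bits hold the position delta |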
| for (int i = 0; i < numProx; ++i) |
| { |
| int code = positions.ReadVInt32(); |
| if ((code & 1) != 0) |
| { |
| // this position has a payload |
| int payloadLength = positions.ReadVInt32(); |
| payloadLengthsBuf[payStart + i] = payloadLength; |
| payloadBytes.CopyBytes(positions, payloadLength); |
| } |
| else |
| { |
| payloadLengthsBuf[payStart + i] = 0; |
| } |
| position += (int)((uint)code >> 1); |
| positionsBuf[posStart + i] = position; |
| } |
| } |
| else |
| { |
| for (int i = 0; i < numProx; ++i) |
| { |
| position += ((int)((uint)positions.ReadVInt32() >> 1)); |
| positionsBuf[posStart + i] = position; |
| } |
| } |
| } |
| |
| if (curField.hasOffsets) |
| { |
| int offStart = curField.offStart + curField.totalPositions; |
| if (offStart + numProx > startOffsetsBuf.Length) |
| { |
| int newLength = ArrayUtil.Oversize(offStart + numProx, 4); |
| startOffsetsBuf = Arrays.CopyOf(startOffsetsBuf, newLength); |
| lengthsBuf = Arrays.CopyOf(lengthsBuf, newLength); |
| } |
| int lastOffset = 0, startOffset, endOffset; |
| for (int i = 0; i < numProx; ++i) |
| { |
| startOffset = lastOffset + offsets.ReadVInt32(); |
| endOffset = startOffset + offsets.ReadVInt32(); |
| lastOffset = endOffset; |
| startOffsetsBuf[offStart + i] = startOffset; |
| lengthsBuf[offStart + i] = endOffset - startOffset; |
| } |
| } |
| |
| curField.totalPositions += numProx; |
| } |
| |
| [MethodImpl(MethodImplOptions.NoInlining)] |
| public override int Merge(MergeState mergeState) |
| { |
| int docCount = 0; |
| int idx = 0; |
| |
| foreach (AtomicReader reader in mergeState.Readers) |
| { |
| SegmentReader matchingSegmentReader = mergeState.MatchingSegmentReaders[idx++]; |
| CompressingTermVectorsReader matchingVectorsReader = null; |
| if (matchingSegmentReader != null) |
| { |
| TermVectorsReader vectorsReader = matchingSegmentReader.TermVectorsReader; |
| // we can only bulk-copy if the matching reader is also a CompressingTermVectorsReader |
| if (vectorsReader is CompressingTermVectorsReader compressingTermVectorsReader) |
| { |
| matchingVectorsReader = compressingTermVectorsReader; |
| } |
| } |
| |
| int maxDoc = reader.MaxDoc; |
| IBits liveDocs = reader.LiveDocs; |
| |
| if (matchingVectorsReader == null || matchingVectorsReader.Version != VERSION_CURRENT || matchingVectorsReader.CompressionMode != compressionMode || matchingVectorsReader.ChunkSize != chunkSize || matchingVectorsReader.PackedInt32sVersion != PackedInt32s.VERSION_CURRENT) |
| { |
| // naive merge... |
| for (int i = NextLiveDoc(0, liveDocs, maxDoc); i < maxDoc; i = NextLiveDoc(i + 1, liveDocs, maxDoc)) |
| { |
| Fields vectors = reader.GetTermVectors(i); |
| AddAllDocVectors(vectors, mergeState); |
| ++docCount; |
| mergeState.CheckAbort.Work(300); |
| } |
| } |
| else |
| { |
| CompressingStoredFieldsIndexReader index = matchingVectorsReader.Index; |
| IndexInput vectorsStreamOrig = matchingVectorsReader.VectorsStream; |
| vectorsStreamOrig.Seek(0); |
| ChecksumIndexInput vectorsStream = new BufferedChecksumIndexInput((IndexInput)vectorsStreamOrig.Clone()); |
| |
| for (int i = NextLiveDoc(0, liveDocs, maxDoc); i < maxDoc; ) |
| { |
| // We make sure to move the checksum input in any case, otherwise the final |
| // integrity check might need to read the whole file a second time |
| long startPointer = index.GetStartPointer(i); |
| if (startPointer > vectorsStream.GetFilePointer()) |
| { |
| vectorsStream.Seek(startPointer); |
| } |
| if ((pendingDocs.Count == 0) && (i == 0 || index.GetStartPointer(i - 1) < startPointer)) // start of a chunk |
| { |
| int docBase = vectorsStream.ReadVInt32(); |
| int chunkDocs = vectorsStream.ReadVInt32(); |
| if (Debugging.AssertsEnabled) Debugging.Assert(docBase + chunkDocs <= matchingSegmentReader.MaxDoc); |
| if (docBase + chunkDocs < matchingSegmentReader.MaxDoc && NextDeletedDoc(docBase, liveDocs, docBase + chunkDocs) == docBase + chunkDocs) |
| { |
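| // the chunk has no deleted documents and is not the last chunk of the |
| // segment (so its end pointer is known): bulk-copy its raw bytes |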
| long chunkEnd = index.GetStartPointer(docBase + chunkDocs); |
| long chunkLength = chunkEnd - vectorsStream.GetFilePointer(); |
| indexWriter.WriteIndex(chunkDocs, this.vectorsStream.GetFilePointer()); |
| this.vectorsStream.WriteVInt32(docCount); |
| this.vectorsStream.WriteVInt32(chunkDocs); |
| this.vectorsStream.CopyBytes(vectorsStream, chunkLength); |
| docCount += chunkDocs; |
| this.numDocs += chunkDocs; |
| mergeState.CheckAbort.Work(300 * chunkDocs); |
| i = NextLiveDoc(docBase + chunkDocs, liveDocs, maxDoc); |
| } |
| else |
| { |
| for (; i < docBase + chunkDocs; i = NextLiveDoc(i + 1, liveDocs, maxDoc)) |
| { |
| Fields vectors = reader.GetTermVectors(i); |
| AddAllDocVectors(vectors, mergeState); |
| ++docCount; |
| mergeState.CheckAbort.Work(300); |
| } |
| } |
| } |
| else |
| { |
| Fields vectors = reader.GetTermVectors(i); |
| AddAllDocVectors(vectors, mergeState); |
| ++docCount; |
| mergeState.CheckAbort.Work(300); |
| i = NextLiveDoc(i + 1, liveDocs, maxDoc); |
| } |
| } |
| |
| vectorsStream.Seek(vectorsStream.Length - CodecUtil.FooterLength()); |
| CodecUtil.CheckFooter(vectorsStream); |
| } |
| } |
| Finish(mergeState.FieldInfos, docCount); |
| return docCount; |
| } |
| |
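| /// <summary> |
| /// Returns the first live document at or after <paramref name="doc"/>, or |
| /// <paramref name="maxDoc"/> when none remain; all documents are live when |
| /// <paramref name="liveDocs"/> is <c>null</c>. |
| /// </summary> |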
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| private static int NextLiveDoc(int doc, IBits liveDocs, int maxDoc) |
| { |
| if (liveDocs == null) |
| { |
| return doc; |
| } |
| while (doc < maxDoc && !liveDocs.Get(doc)) |
| { |
| ++doc; |
| } |
| return doc; |
| } |
| |
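| /// <summary> |
| /// Returns the first deleted document at or after <paramref name="doc"/>, or |
| /// <paramref name="maxDoc"/> when the range contains no deletions. |
| /// </summary> |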
| [MethodImpl(MethodImplOptions.AggressiveInlining)] |
| private static int NextDeletedDoc(int doc, IBits liveDocs, int maxDoc) |
| { |
| if (liveDocs == null) |
| { |
| return maxDoc; |
| } |
| while (doc < maxDoc && liveDocs.Get(doc)) |
| { |
| ++doc; |
| } |
| return doc; |
| } |
| } |
| } |