| using Lucene.Net.Diagnostics; |
| using Lucene.Net.Index; |
| using Lucene.Net.Store; |
| using Lucene.Net.Util; |
| using Lucene.Net.Util.Packed; |
| using System; |
| using System.Collections.Generic; |
| using System.Diagnostics; |
| |
| namespace Lucene.Net.Codecs.BlockTerms |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
/// <summary>
/// Selects every Nth term as an index term, and holds term
/// bytes (mostly) fully expanded in memory. This terms index
/// supports seeking by ord. See
/// <see cref="VariableGapTermsIndexWriter"/> for a more memory efficient
/// terms index that does not support seeking by ord.
/// <para/>
/// @lucene.experimental
/// </summary>
public class FixedGapTermsIndexWriter : TermsIndexWriterBase
{
    /// <summary>
    /// Output the terms index is written to. Created in the constructor and
    /// reset to <c>null</c> by <see cref="Dispose(bool)"/> once it has been closed.
    /// </summary>
    protected IndexOutput m_output;

    /// <summary>Extension of terms index file</summary>
    internal static readonly string TERMS_INDEX_EXTENSION = "tii";

    /// <summary>Codec name written into the file header.</summary>
    internal static readonly string CODEC_NAME = "SIMPLE_STANDARD_TERMS_INDEX";

    internal static readonly int VERSION_START = 0;
    internal static readonly int VERSION_APPEND_ONLY = 1;
    internal static readonly int VERSION_CHECKSUM = 1000; // 4.x "skipped" trunk's monotonic addressing: give any user a nice exception

    /// <summary>Version written into the file header by <see cref="WriteHeader(IndexOutput)"/>.</summary>
    internal static readonly int VERSION_CURRENT = VERSION_CHECKSUM;

    /// <summary>Every <c>termIndexInterval</c>-th term of a field becomes an index term.</summary>
    private readonly int termIndexInterval;

    /// <summary>Per-field writers in the order they were added; walked by <see cref="Dispose(bool)"/>.</summary>
    private readonly IList<SimpleFieldWriter> fields = new List<SimpleFieldWriter>();

    //private readonly FieldInfos fieldInfos; // unread // LUCENENET: Not used

    /// <summary>
    /// Creates the terms index output for the segment described by
    /// <paramref name="state"/> and writes the codec header followed by the
    /// term index interval.
    /// </summary>
    /// <param name="state">Segment write state supplying the directory, segment
    /// name/suffix, IO context and term index interval.</param>
    /// <exception cref="ArgumentNullException"><paramref name="state"/> is <c>null</c>.</exception>
    public FixedGapTermsIndexWriter(SegmentWriteState state)
    {
        if (state == null)
        {
            throw new ArgumentNullException(nameof(state));
        }

        string indexFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, TERMS_INDEX_EXTENSION);
        termIndexInterval = state.TermIndexInterval;
        m_output = state.Directory.CreateOutput(indexFileName, state.Context);
        bool success = false;
        try
        {
            //fieldInfos = state.FieldInfos; // LUCENENET: Not used
            WriteHeader(m_output);
            m_output.WriteInt32(termIndexInterval);
            success = true;
        }
        finally
        {
            if (!success)
            {
                // Do not leak the freshly created output when writing the
                // header fails; the original exception keeps propagating.
                IOUtils.DisposeWhileHandlingException(m_output);
            }
        }
    }

    /// <summary>
    /// Writes the codec header (<see cref="CODEC_NAME"/>, <see cref="VERSION_CURRENT"/>)
    /// to <paramref name="output"/>.
    /// </summary>
    private void WriteHeader(IndexOutput output)
    {
        CodecUtil.WriteHeader(output, CODEC_NAME, VERSION_CURRENT);
    }

    /// <summary>
    /// Creates the index writer for <paramref name="field"/>, registers it so that
    /// <see cref="Dispose(bool)"/> can emit its directory entry, and returns it.
    /// </summary>
    /// <param name="field">Field whose terms are about to be indexed.</param>
    /// <param name="termsFilePointer">File pointer in the terms file at which this
    /// field's terms start.</param>
    /// <returns>The writer that receives this field's index terms.</returns>
    public override FieldWriter AddField(FieldInfo field, long termsFilePointer)
    {
        //System.out.println("FGW: addFfield=" + field.name);
        SimpleFieldWriter writer = new SimpleFieldWriter(this, field, termsFilePointer);
        fields.Add(writer);
        return writer;
    }

    /// <summary>
    /// Computes how many leading bytes of <paramref name="indexedTerm"/> are stored
    /// in the index: one byte past the prefix it shares with
    /// <paramref name="priorTerm"/>, capped at the length of <paramref name="indexedTerm"/>.
    /// <para/>
    /// NOTE: if your codec does not sort in unicode code
    /// point order, you must override this method, to simply
    /// return <c>indexedTerm.Length</c>.
    /// </summary>
    /// <param name="priorTerm">The term preceding <paramref name="indexedTerm"/>.</param>
    /// <param name="indexedTerm">The term selected as an index term.</param>
    /// <returns>Number of bytes of <paramref name="indexedTerm"/> to write.</returns>
    protected virtual int IndexedTermPrefixLength(BytesRef priorTerm, BytesRef indexedTerm)
    {
        // As long as codec sorts terms in unicode codepoint
        // order, we can safely strip off the non-distinguishing
        // suffix to save RAM in the loaded terms index.
        int idxTermOffset = indexedTerm.Offset;
        int priorTermOffset = priorTerm.Offset;
        int limit = Math.Min(priorTerm.Length, indexedTerm.Length);
        for (int byteIdx = 0; byteIdx < limit; byteIdx++)
        {
            if (priorTerm.Bytes[priorTermOffset + byteIdx] != indexedTerm.Bytes[idxTermOffset + byteIdx])
            {
                // First differing byte: keep everything up to and including it.
                return byteIdx + 1;
            }
        }

        // One term is a prefix of the other: keep one byte more than the
        // prior term, but never more than the indexed term actually has.
        return Math.Min(1 + priorTerm.Length, indexedTerm.Length);
    }

    /// <summary>
    /// Collects the index terms of a single field. Term bytes are written to the
    /// shared output as they arrive; lengths and terms-file pointer deltas are
    /// buffered in memory and written as packed ints by <see cref="Finish(long)"/>.
    /// </summary>
    private sealed class SimpleFieldWriter : FieldWriter
    {
        private readonly FixedGapTermsIndexWriter outerInstance;

        internal readonly FieldInfo fieldInfo;

        /// <summary>Number of index terms added so far.</summary>
        internal int numIndexTerms;

        /// <summary>Position in the index output where this field's term bytes begin.</summary>
        internal readonly long indexStart;

        /// <summary>Terms-file pointer passed in when the field was added.</summary>
        internal readonly long termsStart;

        /// <summary>Position in the index output of the packed terms-file offsets.</summary>
        internal long packedIndexStart;

        /// <summary>Position in the index output of the packed term-byte offsets.</summary>
        internal long packedOffsetsStart;

        /// <summary>Number of terms offered to <see cref="CheckIndexTerm(BytesRef, TermStats)"/>.</summary>
        private long numTerms;

        // TODO: we could conceivably make a PackedInts wrapper
        // that auto-grows... then we wouldn't force 6 bytes RAM
        // per index term:
        private short[] termLengths;
        private int[] termsPointerDeltas;
        private long lastTermsPointer;
        private long totTermLength;

        /// <summary>
        /// Most recently remembered term; used as the "prior term" when
        /// computing the stored prefix length of the next index term.
        /// </summary>
        private readonly BytesRef lastTerm = new BytesRef();

        internal SimpleFieldWriter(FixedGapTermsIndexWriter outerInstance, FieldInfo fieldInfo, long termsFilePointer)
        {
            this.outerInstance = outerInstance;

            this.fieldInfo = fieldInfo;
            indexStart = outerInstance.m_output.GetFilePointer();
            // Pointer deltas are accumulated starting from the field's first terms pointer.
            termsStart = lastTermsPointer = termsFilePointer;
            termLengths = EMPTY_INT16S;
            termsPointerDeltas = EMPTY_INT32S;
        }

        /// <summary>
        /// Decides whether <paramref name="text"/> is an index term. The first term
        /// and every <c>termIndexInterval</c>-th term after it are selected.
        /// </summary>
        /// <returns><c>true</c> if the term must be passed to
        /// <see cref="Add(BytesRef, TermStats, long)"/>; otherwise <c>false</c>.</returns>
        public override bool CheckIndexTerm(BytesRef text, TermStats stats)
        {
            // First term is first indexed term:
            //System.output.println("FGW: checkIndexTerm text=" + text.utf8ToString());
            if (0 == (numTerms++ % outerInstance.termIndexInterval))
            {
                return true;
            }
            else
            {
                if (0 == numTerms % outerInstance.termIndexInterval)
                {
                    // save last term just before next index term so we
                    // can compute wasted suffix
                    lastTerm.CopyBytes(text);
                }
                return false;
            }
        }

        /// <summary>
        /// Records <paramref name="text"/> as an index term: writes its distinguishing
        /// prefix to the index output and buffers the prefix length and the delta of
        /// <paramref name="termsFilePointer"/> against the previous index term.
        /// </summary>
        /// <param name="text">The index term.</param>
        /// <param name="stats">Not used by this implementation.</param>
        /// <param name="termsFilePointer">Terms-file pointer associated with this index term.</param>
        /// <exception cref="OverflowException">The prefix length does not fit in a
        /// <see cref="short"/>, or the distance to the previous index term's terms-file
        /// pointer does not fit in an <see cref="int"/>.</exception>
        public override void Add(BytesRef text, TermStats stats, long termsFilePointer)
        {
            int indexedTermLength = outerInstance.IndexedTermPrefixLength(lastTerm, text);
            //System.out.println("FGW: add text=" + text.utf8ToString() + " " + text + " fp=" + termsFilePointer);

            if (Debugging.AssertsEnabled) Debugging.Assert(indexedTermLength <= short.MaxValue);

            // Both values are stored in narrower arrays. Narrow them with overflow
            // checking, and before anything is written, so an out-of-range value
            // fails loudly instead of being silently wrapped into a wrong offset
            // that Finish() would then persist.
            short storedTermLength = checked((short)indexedTermLength);
            int termsPointerDelta = checked((int)(termsFilePointer - lastTermsPointer));

            // write only the min prefix that shows the diff
            // against prior term
            outerInstance.m_output.WriteBytes(text.Bytes, text.Offset, indexedTermLength);

            if (termLengths.Length == numIndexTerms)
            {
                termLengths = ArrayUtil.Grow(termLengths);
            }
            if (termsPointerDeltas.Length == numIndexTerms)
            {
                termsPointerDeltas = ArrayUtil.Grow(termsPointerDeltas);
            }

            // save delta terms pointer
            termsPointerDeltas[numIndexTerms] = termsPointerDelta;
            lastTermsPointer = termsFilePointer;

            // save term length (in bytes)
            termLengths[numIndexTerms] = storedTermLength;
            totTermLength += indexedTermLength;

            lastTerm.CopyBytes(text);
            numIndexTerms++;
        }

        /// <summary>
        /// Writes the two packed-int tables for this field (terms-file offsets, then
        /// offsets into the written term bytes) and releases the in-memory buffers.
        /// </summary>
        /// <param name="termsFilePointer">Terms-file pointer at the end of the field;
        /// used here only to size the packed terms-file offsets.</param>
        public override void Finish(long termsFilePointer)
        {
            // write primary terms dict offsets
            packedIndexStart = outerInstance.m_output.GetFilePointer();

            PackedInt32s.Writer w = PackedInt32s.GetWriter(outerInstance.m_output, numIndexTerms, PackedInt32s.BitsRequired(termsFilePointer), PackedInt32s.DEFAULT);

            // relative to our termsStart: the first delta was taken against the
            // terms pointer this field was created with
            long upto = 0;
            for (int i = 0; i < numIndexTerms; i++)
            {
                upto += termsPointerDeltas[i];
                w.Add(upto);
            }
            w.Finish();

            packedOffsetsStart = outerInstance.m_output.GetFilePointer();

            // write offsets into the byte[] terms; one extra entry marks the end
            // of the last term
            w = PackedInt32s.GetWriter(outerInstance.m_output, 1 + numIndexTerms, PackedInt32s.BitsRequired(totTermLength), PackedInt32s.DEFAULT);
            upto = 0;
            for (int i = 0; i < numIndexTerms; i++)
            {
                w.Add(upto);
                upto += termLengths[i];
            }
            w.Add(upto);
            w.Finish();

            // our referrer holds onto us, while other fields are
            // being written, so don't tie up this RAM:
            termLengths = null;
            termsPointerDeltas = null;
        }
    }

    /// <summary>
    /// Writes the field directory, the trailer and the codec footer, then closes
    /// the output. Does nothing when <paramref name="disposing"/> is <c>false</c>
    /// or the output has already been released.
    /// </summary>
    /// <remarks>
    /// Directory layout: the number of fields that have at least one index term,
    /// then for each such field its number, index term count, <c>termsStart</c>,
    /// <c>indexStart</c>, <c>packedIndexStart</c> and <c>packedOffsetsStart</c>.
    /// </remarks>
    protected override void Dispose(bool disposing)
    {
        if (!disposing || m_output == null)
        {
            return;
        }

        bool success = false;
        try
        {
            long dirStart = m_output.GetFilePointer();
            int fieldCount = fields.Count;

            // Fields without any index term are left out of the directory,
            // so count the remaining ones first.
            int nonNullFieldCount = 0;
            for (int i = 0; i < fieldCount; i++)
            {
                SimpleFieldWriter field = fields[i];
                if (field.numIndexTerms > 0)
                {
                    nonNullFieldCount++;
                }
            }

            m_output.WriteVInt32(nonNullFieldCount);
            for (int i = 0; i < fieldCount; i++)
            {
                SimpleFieldWriter field = fields[i];
                if (field.numIndexTerms > 0)
                {
                    m_output.WriteVInt32(field.fieldInfo.Number);
                    m_output.WriteVInt32(field.numIndexTerms);
                    m_output.WriteVInt64(field.termsStart);
                    m_output.WriteVInt64(field.indexStart);
                    m_output.WriteVInt64(field.packedIndexStart);
                    m_output.WriteVInt64(field.packedOffsetsStart);
                }
            }
            WriteTrailer(dirStart);
            CodecUtil.WriteFooter(m_output);
            success = true;
        }
        finally
        {
            if (success)
            {
                IOUtils.Dispose(m_output);
            }
            else
            {
                // Something already failed: close quietly so the original
                // exception is the one that reaches the caller.
                IOUtils.DisposeWhileHandlingException(m_output);
            }
            // Makes a repeated Dispose a no-op.
            m_output = null;
        }
    }

    /// <summary>
    /// Writes <paramref name="dirStart"/>, the position at which the field
    /// directory begins, as a fixed-width 64-bit value.
    /// </summary>
    private void WriteTrailer(long dirStart)
    {
        m_output.WriteInt64(dirStart);
    }
}
| } |