blob: 80fa33b5058c758b5dcc42cb426612a0c3fc9e1c [file] [log] [blame]
using Lucene.Net.Diagnostics;
using Lucene.Net.Index;
using Lucene.Net.Store;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using System;
using System.Collections.Generic;
namespace Lucene.Net.Codecs.BlockTerms
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Selects index terms according to provided pluggable
/// <see cref="IndexTermSelector"/>, and stores them in a prefix trie that's
/// loaded entirely in RAM stored as an <see cref="FST{T}"/>. This terms
/// index only supports unsigned byte term sort order
/// (unicode codepoint order when the bytes are UTF8).
/// <para/>
/// @lucene.experimental
/// </summary>
public class VariableGapTermsIndexWriter : TermsIndexWriterBase
{
protected IndexOutput m_output;
/// <summary>Extension of terms index file.</summary>
internal readonly static string TERMS_INDEX_EXTENSION = "tiv";
internal readonly static string CODEC_NAME = "VARIABLE_GAP_TERMS_INDEX";
internal readonly static int VERSION_START = 0;
internal readonly static int VERSION_APPEND_ONLY = 1;
internal readonly static int VERSION_CHECKSUM = 2;
internal readonly static int VERSION_CURRENT = VERSION_CHECKSUM;
private readonly IList<FSTFieldWriter> fields = new List<FSTFieldWriter>();
//private readonly FieldInfos fieldInfos; // unread // LUCENENET: Not used
private readonly IndexTermSelector policy;
/// <summary>
/// Hook for selecting which terms should be placed in the terms index.
/// <para/>
/// <see cref="NewField(FieldInfo)"/> is called at the start of each new field, and
/// <see cref="IsIndexTerm(BytesRef, TermStats)"/> for each term in that field.
/// <para/>
/// @lucene.experimental
/// </summary>
public abstract class IndexTermSelector
{
/// <summary>
/// Called sequentially on every term being written,
/// returning <c>true</c> if this term should be indexed.
/// </summary>
public abstract bool IsIndexTerm(BytesRef term, TermStats stats);
/// <summary>Called when a new field is started.</summary>
public abstract void NewField(FieldInfo fieldInfo);
}
/// <remarks>
/// Same policy as <see cref="FixedGapTermsIndexWriter"/>
/// </remarks>
public sealed class EveryNTermSelector : IndexTermSelector
{
private int count;
private readonly int interval;
public EveryNTermSelector(int interval)
{
this.interval = interval;
// First term is first indexed term
count = interval;
}
public override bool IsIndexTerm(BytesRef term, TermStats stats)
{
if (count >= interval)
{
count = 1;
return true;
}
else
{
count++;
return false;
}
}
public override void NewField(FieldInfo fieldInfo)
{
count = interval;
}
}
/// <summary>
/// Sets an index term when docFreq >= docFreqThresh, or
/// every interval terms. This should reduce seek time
/// to high docFreq terms.
/// </summary>
public sealed class EveryNOrDocFreqTermSelector : IndexTermSelector
{
private int count;
private readonly int docFreqThresh;
private readonly int interval;
public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval)
{
this.interval = interval;
this.docFreqThresh = docFreqThresh;
// First term is first indexed term:
count = interval;
}
public override bool IsIndexTerm(BytesRef term, TermStats stats)
{
if (stats.DocFreq >= docFreqThresh || count >= interval)
{
count = 1;
return true;
}
else
{
count++;
return false;
}
}
public override void NewField(FieldInfo fieldInfo)
{
count = interval;
}
}
// TODO: it'd be nice to let the FST builder prune based
// on term count of each node (the prune1/prune2 that it
// accepts), and build the index based on that. This
// should result in a more compact terms index, more like
// a prefix trie than the other selectors, because it
// only stores enough leading bytes to get down to N
// terms that may complete that prefix. It becomes
// "deeper" when terms are dense, and "shallow" when they
// are less dense.
//
// However, it's not easy to make that work this this
// API, because that pruning doesn't immediately know on
// seeing each term whether that term will be a seek point
// or not. It requires some non-causality in the API, ie
// only on seeing some number of future terms will the
// builder decide which past terms are seek points.
// Somehow the API'd need to be able to return a "I don't
// know" value, eg like a Future, which only later on is
// flipped (frozen) to true or false.
//
// We could solve this with a 2-pass approach, where the
// first pass would build an FSA (no outputs) solely to
// determine which prefixes are the 'leaves' in the
// pruning. The 2nd pass would then look at this prefix
// trie to mark the seek points and build the FST mapping
// to the true output.
//
// But, one downside to this approach is that it'd result
// in uneven index term selection. EG with prune1=10, the
// resulting index terms could be as frequent as every 10
// terms or as rare as every <maxArcCount> * 10 (eg 2560),
// in the extremes.
public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy)
{
string indexFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix, TERMS_INDEX_EXTENSION);
m_output = state.Directory.CreateOutput(indexFileName, state.Context);
bool success = false;
try
{
//fieldInfos = state.FieldInfos; // LUCENENET: Not used
this.policy = policy;
WriteHeader(m_output);
success = true;
}
finally
{
if (!success)
{
IOUtils.DisposeWhileHandlingException(m_output);
}
}
}
private void WriteHeader(IndexOutput output)
{
CodecUtil.WriteHeader(output, CODEC_NAME, VERSION_CURRENT);
}
public override FieldWriter AddField(FieldInfo field, long termsFilePointer)
{
////System.out.println("VGW: field=" + field.name);
policy.NewField(field);
FSTFieldWriter writer = new FSTFieldWriter(this, field, termsFilePointer);
fields.Add(writer);
return writer;
}
/// <remarks>
/// NOTE: If your codec does not sort in unicode code point order,
/// you must override this method to simply return <c>indexedTerm.Length</c>.
/// </remarks>
protected virtual int IndexedTermPrefixLength(BytesRef priorTerm, BytesRef indexedTerm)
{
// As long as codec sorts terms in unicode codepoint
// order, we can safely strip off the non-distinguishing
// suffix to save RAM in the loaded terms index.
int idxTermOffset = indexedTerm.Offset;
int priorTermOffset = priorTerm.Offset;
int limit = Math.Min(priorTerm.Length, indexedTerm.Length);
for (int byteIdx = 0; byteIdx < limit; byteIdx++)
{
if (priorTerm.Bytes[priorTermOffset + byteIdx] != indexedTerm.Bytes[idxTermOffset + byteIdx])
{
return byteIdx + 1;
}
}
return Math.Min(1 + priorTerm.Length, indexedTerm.Length);
}
private class FSTFieldWriter : FieldWriter
{
private readonly VariableGapTermsIndexWriter outerInstance;
private readonly Builder<long?> fstBuilder;
private readonly PositiveInt32Outputs fstOutputs;
private readonly long startTermsFilePointer;
internal readonly FieldInfo fieldInfo;
internal FST<long?> fst;
internal readonly long indexStart;
private readonly BytesRef lastTerm = new BytesRef();
private bool first = true;
public FSTFieldWriter(VariableGapTermsIndexWriter outerInstance, FieldInfo fieldInfo, long termsFilePointer)
{
this.outerInstance = outerInstance;
this.fieldInfo = fieldInfo;
fstOutputs = PositiveInt32Outputs.Singleton;
fstBuilder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, fstOutputs);
indexStart = outerInstance.m_output.GetFilePointer();
////System.out.println("VGW: field=" + fieldInfo.name);
// Always put empty string in
fstBuilder.Add(new Int32sRef(), termsFilePointer);
startTermsFilePointer = termsFilePointer;
}
public override bool CheckIndexTerm(BytesRef text, TermStats stats)
{
//System.out.println("VGW: index term=" + text.utf8ToString());
// NOTE: we must force the first term per field to be
// indexed, in case policy doesn't:
if (outerInstance.policy.IsIndexTerm(text, stats) || first)
{
first = false;
//System.out.println(" YES");
return true;
}
else
{
lastTerm.CopyBytes(text);
return false;
}
}
private readonly Int32sRef scratchIntsRef = new Int32sRef();
public override void Add(BytesRef text, TermStats stats, long termsFilePointer)
{
if (text.Length == 0)
{
// We already added empty string in ctor
if (Debugging.AssertsEnabled) Debugging.Assert(termsFilePointer == startTermsFilePointer);
return;
}
int lengthSave = text.Length;
text.Length = outerInstance.IndexedTermPrefixLength(lastTerm, text);
try
{
fstBuilder.Add(Util.Fst.Util.ToInt32sRef(text, scratchIntsRef), termsFilePointer);
}
finally
{
text.Length = lengthSave;
}
lastTerm.CopyBytes(text);
}
public override void Finish(long termsFilePointer)
{
fst = fstBuilder.Finish();
if (fst != null)
{
fst.Save(outerInstance.m_output);
}
}
}
protected override void Dispose(bool disposing)
{
if (disposing)
{
if (m_output != null)
{
try
{
long dirStart = m_output.GetFilePointer();
int fieldCount = fields.Count;
int nonNullFieldCount = 0;
for (int i = 0; i < fieldCount; i++)
{
FSTFieldWriter field = fields[i];
if (field.fst != null)
{
nonNullFieldCount++;
}
}
m_output.WriteVInt32(nonNullFieldCount);
for (int i = 0; i < fieldCount; i++)
{
FSTFieldWriter field = fields[i];
if (field.fst != null)
{
m_output.WriteVInt32(field.fieldInfo.Number);
m_output.WriteVInt64(field.indexStart);
}
}
WriteTrailer(dirStart);
CodecUtil.WriteFooter(m_output);
}
finally
{
m_output.Dispose();
m_output = null;
}
}
}
}
private void WriteTrailer(long dirStart)
{
m_output.WriteInt64(dirStart);
}
}
}