using J2N.Numerics;
using J2N.Text;
using Lucene.Net.Diagnostics;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Runtime.CompilerServices;
namespace Lucene.Net.Codecs
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using AtomicReader = Lucene.Net.Index.AtomicReader;
using IBits = Lucene.Net.Util.IBits;
using BytesRef = Lucene.Net.Util.BytesRef;
using DataInput = Lucene.Net.Store.DataInput;
using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator;
using DocsAndPositionsEnum = Lucene.Net.Index.DocsAndPositionsEnum;
using FieldInfo = Lucene.Net.Index.FieldInfo;
using FieldInfos = Lucene.Net.Index.FieldInfos;
using Fields = Lucene.Net.Index.Fields;
using MergeState = Lucene.Net.Index.MergeState;
using Terms = Lucene.Net.Index.Terms;
using TermsEnum = Lucene.Net.Index.TermsEnum;
/// <summary>
/// Codec API for writing term vectors:
/// <para/>
/// <list type="number">
/// <item><description>For every document, <see cref="StartDocument(int)"/> is called,
/// informing the <see cref="Codec"/> how many fields will be written.</description></item>
/// <item><description><see cref="StartField(FieldInfo, int, bool, bool, bool)"/> is called for
/// each field in the document, informing the codec how many terms
/// will be written for that field, and whether or not positions,
/// offsets, or payloads are enabled.</description></item>
/// <item><description>Within each field, <see cref="StartTerm(BytesRef, int)"/> is called
/// for each term.</description></item>
/// <item><description>If offsets and/or positions are enabled, then
/// <see cref="AddPosition(int, int, int, BytesRef)"/> will be called for each term
/// occurrence.</description></item>
/// <item><description>After all documents have been written, <see cref="Finish(FieldInfos, int)"/>
/// is called for verification/sanity-checks.</description></item>
/// <item><description>Finally the writer is disposed (<see cref="Dispose(bool)"/>); the full
/// call sequence is sketched in the example below.</description></item>
/// </list>
/// <para/>
/// @lucene.experimental
/// </summary>
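/// <example>
/// A minimal sketch of the expected call sequence. <c>FooTermVectorsWriter</c>,
/// <c>fieldInfo</c>, and <c>fieldInfos</c> are hypothetical placeholders, not part of this API:
/// <code>
/// TermVectorsWriter writer = new FooTermVectorsWriter();  // hypothetical subclass
/// writer.StartDocument(1);                            // 1 vector field in this document
/// writer.StartField(fieldInfo, 2, true, true, false); // 2 terms; positions + offsets, no payloads
/// writer.StartTerm(new BytesRef("apple"), 1);         // term "apple" occurs once
/// writer.AddPosition(0, 0, 5, null);                  // position 0, offsets [0, 5), no payload
/// writer.FinishTerm();
/// writer.StartTerm(new BytesRef("banana"), 1);        // terms must arrive in Comparer order
/// writer.AddPosition(1, 6, 12, null);
/// writer.FinishTerm();
/// writer.FinishField();
/// writer.FinishDocument();
/// writer.Finish(fieldInfos, 1);                       // must match the number of StartDocument calls
/// writer.Dispose();
/// </code>
/// </example>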
public abstract class TermVectorsWriter : IDisposable
{
/// <summary>
/// Sole constructor. (For invocation by subclass
/// constructors, typically implicit.)
/// </summary>
protected internal TermVectorsWriter()
{
}
/// <summary>
/// Called before writing the term vectors of the document.
/// <see cref="StartField(FieldInfo, int, bool, bool, bool)"/> will
/// be called <paramref name="numVectorFields"/> times. Note that if term
/// vectors are enabled, this is called even if the document
/// has no vector fields; in this case <paramref name="numVectorFields"/>
/// will be zero.
/// </summary>
public abstract void StartDocument(int numVectorFields);
/// <summary>
/// Called after a doc and all its fields have been added. </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
public virtual void FinishDocument()
{
}
/// <summary>
/// Called before writing the terms of the field.
/// <see cref="StartTerm(BytesRef, int)"/> will be called <paramref name="numTerms"/> times.
/// </summary>
public abstract void StartField(FieldInfo info, int numTerms, bool positions, bool offsets, bool payloads);
/// <summary>
/// Called after a field and all its terms have been added. </summary>
public virtual void FinishField()
{
}
/// <summary>
/// Adds a <paramref name="term"/> and its term frequency <paramref name="freq"/>.
/// If this field has positions and/or offsets enabled, then
/// <see cref="AddPosition(int, int, int, BytesRef)"/> will be called
/// <paramref name="freq"/> times.
/// </summary>
public abstract void StartTerm(BytesRef term, int freq);
/// <summary>
/// Called after a term and all its positions have been added. </summary>
public virtual void FinishTerm()
{
}
/// <summary>
/// Adds a term <paramref name="position"/> and offsets. </summary>
public abstract void AddPosition(int position, int startOffset, int endOffset, BytesRef payload);
/// <summary>
/// Aborts writing entirely; the implementation should remove
/// any partially-written files, etc.
/// </summary>
[MethodImpl(MethodImplOptions.NoInlining)]
public abstract void Abort();
/// <summary>
/// Called before <see cref="Dispose(bool)"/>, passing in the number
/// of documents that were written. Note that this is
/// intentionally redundant (equivalent to the number of
/// calls to <see cref="StartDocument(int)"/>), but a <see cref="Codec"/> should
/// check that this is the case to detect the bug described
/// in LUCENE-1282.
/// </summary>
public abstract void Finish(FieldInfos fis, int numDocs);
/// <summary>
/// Called by <see cref="Index.IndexWriter"/> when writing new segments.
/// <para/>
/// This is an expert API that allows the codec to consume
/// positions and offsets directly from the indexer.
/// <para/>
/// The default implementation calls <see cref="AddPosition(int, int, int, BytesRef)"/>,
/// but subclasses can override this if they want to efficiently write
/// all the positions, then all the offsets, for example.
/// <para/>
/// NOTE: this API is extremely expert and subject to change or removal!!!
/// <para/>
/// @lucene.internal
/// </summary>
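/// <example>
/// A rough sketch (an assumption for illustration, not taken from the indexer) of the
/// encoding this default implementation decodes, produced here with
/// <see cref="Lucene.Net.Store.ByteArrayDataOutput"/>; <c>writer</c> stands for any
/// <see cref="TermVectorsWriter"/> instance:
/// <code>
/// var posBuf = new byte[32];
/// var posOut = new ByteArrayDataOutput(posBuf);
/// // per occurrence: VInt of (positionDelta * 2) + payloadBit, then,
/// // when the bit is set, a VInt payload length followed by the payload bytes
/// posOut.WriteVInt32(3 * 2);            // delta 3, no payload
/// posOut.WriteVInt32(2 * 2 + 1);        // delta 2, payload follows
/// posOut.WriteVInt32(1);                // payload length
/// posOut.WriteByte((byte)0x7F);         // payload byte
///
/// var offBuf = new byte[32];
/// var offOut = new ByteArrayDataOutput(offBuf);
/// // per occurrence: VInt delta from the previous end offset, then VInt length
/// offOut.WriteVInt32(0); offOut.WriteVInt32(5);  // offsets [0, 5)
/// offOut.WriteVInt32(1); offOut.WriteVInt32(6);  // offsets [6, 12)
///
/// writer.AddProx(2, new ByteArrayDataInput(posBuf), new ByteArrayDataInput(offBuf));
/// </code>
/// </example>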
// TODO: we should probably nuke this and make a more efficient 4.x format
// PreFlex-RW could then be slow and buffer (it's only used in tests...)
public virtual void AddProx(int numProx, DataInput positions, DataInput offsets)
{
int position = 0;
int lastOffset = 0;
BytesRef payload = null;
for (int i = 0; i < numProx; i++)
{
int startOffset;
int endOffset;
BytesRef thisPayload;
if (positions == null)
{
position = -1;
thisPayload = null;
}
else
{
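// the low bit of the VInt flags a payload; the remaining bits are the
// delta from the previous position (TripleShift is an unsigned right shift)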
int code = positions.ReadVInt32();
position += code.TripleShift(1);
if ((code & 1) != 0)
{
// this position has a payload
int payloadLength = positions.ReadVInt32();
if (payload == null)
{
payload = new BytesRef();
payload.Bytes = new byte[payloadLength];
}
else if (payload.Bytes.Length < payloadLength)
{
payload.Grow(payloadLength);
}
positions.ReadBytes(payload.Bytes, 0, payloadLength);
payload.Length = payloadLength;
thisPayload = payload;
}
else
{
thisPayload = null;
}
}
if (offsets == null)
{
startOffset = endOffset = -1;
}
else
{
startOffset = lastOffset + offsets.ReadVInt32();
endOffset = startOffset + offsets.ReadVInt32();
lastOffset = endOffset;
}
AddPosition(position, startOffset, endOffset, thisPayload);
}
}
/// <summary>
/// Merges in the term vectors from the readers in
/// <paramref name="mergeState"/>. The default implementation skips
/// over deleted documents, and uses <see cref="StartDocument(int)"/>,
/// <see cref="StartField(FieldInfo, int, bool, bool, bool)"/>,
/// <see cref="StartTerm(BytesRef, int)"/>, <see cref="AddPosition(int, int, int, BytesRef)"/>,
/// and <see cref="Finish(FieldInfos, int)"/>,
/// returning the number of documents that were written.
/// Implementations can override this method for more sophisticated
/// merging (bulk-byte copying, etc).
/// </summary>
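/// <example>
/// A minimal sketch of an override, falling back to the safe default logic:
/// <code>
/// public override int Merge(MergeState mergeState)
/// {
///     // a real codec might bulk-copy raw vector bytes here when the
///     // source and destination formats match; this sketch simply
///     // delegates to the default doc-by-doc implementation
///     return base.Merge(mergeState);
/// }
/// </code>
/// </example>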
[MethodImpl(MethodImplOptions.NoInlining)]
public virtual int Merge(MergeState mergeState)
{
int docCount = 0;
for (int i = 0; i < mergeState.Readers.Count; i++)
{
AtomicReader reader = mergeState.Readers[i];
int maxDoc = reader.MaxDoc;
IBits liveDocs = reader.LiveDocs;
for (int docID = 0; docID < maxDoc; docID++)
{
if (liveDocs != null && !liveDocs.Get(docID))
{
// skip deleted docs
continue;
}
// NOTE: it's very important to first assign to vectors then pass it to
// AddAllDocVectors; see LUCENE-1282
Fields vectors = reader.GetTermVectors(docID);
AddAllDocVectors(vectors, mergeState);
docCount++;
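// credit 300 units of work per copied document; this throws if the merge has been aborted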
mergeState.CheckAbort.Work(300);
}
}
Finish(mergeState.FieldInfos, docCount);
return docCount;
}
/// <summary>
/// Safe (but slowish) default method to write every
/// vector field in the document.
/// </summary>
protected void AddAllDocVectors(Fields vectors, MergeState mergeState)
{
if (vectors == null)
{
StartDocument(0);
FinishDocument();
return;
}
int numFields = vectors.Count;
if (numFields == -1)
{
// count manually! TODO: Maybe enforce that Fields.Count returns something valid?
numFields = 0;
foreach (string _ in vectors)
{
numFields++;
}
}
StartDocument(numFields);
string lastFieldName = null;
TermsEnum termsEnum = null;
DocsAndPositionsEnum docsAndPositionsEnum = null;
int fieldCount = 0;
foreach (string fieldName in vectors)
{
fieldCount++;
FieldInfo fieldInfo = mergeState.FieldInfos.FieldInfo(fieldName);
if (Debugging.AssertsEnabled) Debugging.Assert(lastFieldName == null || fieldName.CompareToOrdinal(lastFieldName) > 0, "lastFieldName={0} fieldName={1}", lastFieldName, fieldName);
lastFieldName = fieldName;
Terms terms = vectors.GetTerms(fieldName);
if (terms == null)
{
// the Fields enumerator shouldn't lie...
continue;
}
bool hasPositions = terms.HasPositions;
bool hasOffsets = terms.HasOffsets;
bool hasPayloads = terms.HasPayloads;
if (Debugging.AssertsEnabled) Debugging.Assert(!hasPayloads || hasPositions);
int numTerms = (int)terms.Count;
if (numTerms == -1)
{
// count manually; Terms.Count is not a mandatory statistic and may return -1
numTerms = 0;
termsEnum = terms.GetEnumerator(termsEnum);
while (termsEnum.MoveNext())
{
numTerms++;
}
}
StartField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
termsEnum = terms.GetEnumerator(termsEnum);
int termCount = 0;
while (termsEnum.MoveNext())
{
termCount++;
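// a term vector is a single-document inverted index, so the total
// term frequency equals the term's frequency within this document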
int freq = (int)termsEnum.TotalTermFreq;
StartTerm(termsEnum.Term, freq);
if (hasPositions || hasOffsets)
{
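// advance the enum to the single document backing this term vector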
docsAndPositionsEnum = termsEnum.DocsAndPositions(null, docsAndPositionsEnum);
if (Debugging.AssertsEnabled) Debugging.Assert(docsAndPositionsEnum != null);
int docID = docsAndPositionsEnum.NextDoc();
if (Debugging.AssertsEnabled)
{
Debugging.Assert(docID != DocIdSetIterator.NO_MORE_DOCS);
Debugging.Assert(docsAndPositionsEnum.Freq == freq);
}
for (int posUpto = 0; posUpto < freq; posUpto++)
{
int pos = docsAndPositionsEnum.NextPosition();
int startOffset = docsAndPositionsEnum.StartOffset;
int endOffset = docsAndPositionsEnum.EndOffset;
BytesRef payload = docsAndPositionsEnum.GetPayload();
if (Debugging.AssertsEnabled) Debugging.Assert(!hasPositions || pos >= 0);
AddPosition(pos, startOffset, endOffset, payload);
}
}
FinishTerm();
}
if (Debugging.AssertsEnabled) Debugging.Assert(termCount == numTerms);
FinishField();
}
if (Debugging.AssertsEnabled) Debugging.Assert(fieldCount == numFields);
FinishDocument();
}
/// <summary>
/// Return the <see cref="T:IComparer{BytesRef}"/> used to sort terms
/// before feeding to this API.
/// </summary>
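/// <example>
/// A sketch of a typical implementation; the stock 4.x formats sort terms
/// in UTF-8 byte order:
/// <code>
/// public override IComparer&lt;BytesRef&gt; Comparer
/// {
///     get { return BytesRef.UTF8SortedAsUnicodeComparer; }
/// }
/// </code>
/// </example>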
public abstract IComparer<BytesRef> Comparer { get; }
/// <summary>
/// Disposes all resources used by this object.
/// </summary>
public void Dispose()
{
Dispose(true);
GC.SuppressFinalize(this);
}
/// <summary>
/// Implementations must override and should dispose all resources used by this instance.
/// </summary>
protected abstract void Dispose(bool disposing);
}
}