using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Diagnostics;
using Lucene.Net.Support;
using System;
using System.Runtime.CompilerServices;
namespace Lucene.Net.Index
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using ByteBlockPool = Lucene.Net.Util.ByteBlockPool;
using BytesRef = Lucene.Net.Util.BytesRef;
using RamUsageEstimator = Lucene.Net.Util.RamUsageEstimator;
using TermVectorsWriter = Lucene.Net.Codecs.TermVectorsWriter;
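/// <summary>
/// Writes the term vectors for one field of one document: postings are
/// buffered through the shared <see cref="TermsHashPerField"/> and flushed
/// to the codec's <see cref="TermVectorsWriter"/> once the field is finished.
/// </summary>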
internal sealed class TermVectorsConsumerPerField : TermsHashConsumerPerField
{
internal readonly TermsHashPerField termsHashPerField;
internal readonly TermVectorsConsumer termsWriter;
internal readonly FieldInfo fieldInfo;
internal readonly DocumentsWriterPerThread.DocState docState;
internal readonly FieldInvertState fieldState;
internal bool doVectors;
internal bool doVectorPositions;
internal bool doVectorOffsets;
internal bool doVectorPayloads;
internal int maxNumPostings;
internal IOffsetAttribute offsetAttribute;
internal IPayloadAttribute payloadAttribute;
internal bool hasPayloads; // if enabled, and we actually saw any for this field
public TermVectorsConsumerPerField(TermsHashPerField termsHashPerField, TermVectorsConsumer termsWriter, FieldInfo fieldInfo)
{
this.termsHashPerField = termsHashPerField;
this.termsWriter = termsWriter;
this.fieldInfo = fieldInfo;
docState = termsHashPerField.docState;
fieldState = termsHashPerField.fieldState;
}
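// Two byte streams are buffered per term: stream 0 holds positions (and
// payloads), stream 1 holds offsets. See WriteProx() and FinishDocument().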
internal override int StreamCount => 2;
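// Examines every instance of this field in the document to decide whether
// term vectors (and which of positions/offsets/payloads) should be
// recorded, rejecting inconsistent combinations of the field-type flags.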
internal override bool Start(IIndexableField[] fields, int count)
{
doVectors = false;
doVectorPositions = false;
doVectorOffsets = false;
doVectorPayloads = false;
hasPayloads = false;
for (int i = 0; i < count; i++)
{
IIndexableField field = fields[i];
if (field.IndexableFieldType.IsIndexed)
{
if (field.IndexableFieldType.StoreTermVectors)
{
doVectors = true;
doVectorPositions |= field.IndexableFieldType.StoreTermVectorPositions;
doVectorOffsets |= field.IndexableFieldType.StoreTermVectorOffsets;
if (doVectorPositions)
{
doVectorPayloads |= field.IndexableFieldType.StoreTermVectorPayloads;
}
else if (field.IndexableFieldType.StoreTermVectorPayloads)
{
// TODO: move this check somewhere else, and impl the other missing ones
throw new ArgumentException("cannot index term vector payloads without term vector positions (field=\"" + field.Name + "\")");
}
}
else
{
if (field.IndexableFieldType.StoreTermVectorOffsets)
{
throw new ArgumentException("cannot index term vector offsets when term vectors are not indexed (field=\"" + field.Name + "\")");
}
if (field.IndexableFieldType.StoreTermVectorPositions)
{
throw new ArgumentException("cannot index term vector positions when term vectors are not indexed (field=\"" + field.Name + "\")");
}
if (field.IndexableFieldType.StoreTermVectorPayloads)
{
throw new ArgumentException("cannot index term vector payloads when term vectors are not indexed (field=\"" + field.Name + "\")");
}
}
}
else
{
if (field.IndexableFieldType.StoreTermVectors)
{
throw new ArgumentException("cannot index term vectors when field is not indexed (field=\"" + field.Name + "\")");
}
if (field.IndexableFieldType.StoreTermVectorOffsets)
{
throw new ArgumentException("cannot index term vector offsets when field is not indexed (field=\"" + field.Name + "\")");
}
if (field.IndexableFieldType.StoreTermVectorPositions)
{
throw new ArgumentException("cannot index term vector positions when field is not indexed (field=\"" + field.Name + "\")");
}
if (field.IndexableFieldType.StoreTermVectorPayloads)
{
throw new ArgumentException("cannot index term vector payloads when field is not indexed (field=\"" + field.Name + "\")");
}
}
}
if (doVectors)
{
termsWriter.hasVectors = true;
if (termsHashPerField.bytesHash.Count != 0)
{
// Only necessary if previous doc hit a
// non-aborting exception while writing vectors in
// this field:
termsHashPerField.Reset();
}
}
// TODO: only if needed for performance
//perThread.postingsCount = 0;
return doVectors;
}
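// No per-field state needs to be released on abort; any partially
// written document state is discarded upstream.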
[MethodImpl(MethodImplOptions.NoInlining)]
public void Abort()
{
}
/// <summary>
/// Called once per field per document if term vectors
/// are enabled, to write the vectors to a
/// RAMOutputStream, which is then quickly flushed to
/// the real term vectors files in the Directory.
/// </summary>
internal override void Finish()
{
if (!doVectors || termsHashPerField.bytesHash.Count == 0)
{
return;
}
termsWriter.AddFieldToFlush(this);
}
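// Writes this field's buffered postings, sorted by term, to the codec's
// TermVectorsWriter, then resets the hash for the next document.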
[MethodImpl(MethodImplOptions.NoInlining)]
internal void FinishDocument()
{
if (Debugging.AssertsEnabled) Debugging.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.finish start"));
int numPostings = termsHashPerField.bytesHash.Count;
BytesRef flushTerm = termsWriter.flushTerm;
if (Debugging.AssertsEnabled) Debugging.Assert(numPostings >= 0);
if (numPostings > maxNumPostings)
{
maxNumPostings = numPostings;
}
// this is called once, after inverting all occurrences
// of a given field in the doc. At this point we flush
// our hash into the DocWriter.
if (Debugging.AssertsEnabled) Debugging.Assert(termsWriter.VectorFieldsInOrder(fieldInfo));
TermVectorsPostingsArray postings = (TermVectorsPostingsArray)termsHashPerField.postingsArray;
TermVectorsWriter tv = termsWriter.writer;
int[] termIDs = termsHashPerField.SortPostings(tv.Comparer);
tv.StartField(fieldInfo, numPostings, doVectorPositions, doVectorOffsets, hasPayloads);
ByteSliceReader posReader = doVectorPositions ? termsWriter.vectorSliceReaderPos : null;
ByteSliceReader offReader = doVectorOffsets ? termsWriter.vectorSliceReaderOff : null;
ByteBlockPool termBytePool = termsHashPerField.termBytePool;
for (int j = 0; j < numPostings; j++)
{
int termID = termIDs[j];
int freq = postings.freqs[termID];
// Get BytesRef
termBytePool.SetBytesRef(flushTerm, postings.textStarts[termID]);
tv.StartTerm(flushTerm, freq);
if (doVectorPositions || doVectorOffsets)
{
if (posReader != null)
{
termsHashPerField.InitReader(posReader, termID, 0);
}
if (offReader != null)
{
termsHashPerField.InitReader(offReader, termID, 1);
}
tv.AddProx(freq, posReader, offReader);
}
tv.FinishTerm();
}
tv.FinishField();
termsHashPerField.Reset();
fieldInfo.SetStoreTermVectors();
}
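// Trims the per-field hash back down after the document's vectors have
// been written, using the largest posting count seen as the sizing hint.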
internal void ShrinkHash()
{
termsHashPerField.ShrinkHash(maxNumPostings);
maxNumPostings = 0;
}
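// Caches the offset and payload attributes from the field's
// AttributeSource before the token stream is consumed.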
internal override void Start(IIndexableField f)
{
if (doVectorOffsets)
{
offsetAttribute = fieldState.AttributeSource.AddAttribute<IOffsetAttribute>();
}
else
{
offsetAttribute = null;
}
if (doVectorPayloads && fieldState.AttributeSource.HasAttribute<IPayloadAttribute>())
{
payloadAttribute = fieldState.AttributeSource.GetAttribute<IPayloadAttribute>();
}
else
{
payloadAttribute = null;
}
}
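// Encodes one occurrence of the term. Offsets go to stream 1 as
// (start-offset delta, length); positions go to stream 0 as the position
// delta shifted left one bit, with the low bit set when a payload
// (length followed by bytes) follows.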
internal void WriteProx(TermVectorsPostingsArray postings, int termID)
{
if (doVectorOffsets)
{
int startOffset = fieldState.Offset + offsetAttribute.StartOffset;
int endOffset = fieldState.Offset + offsetAttribute.EndOffset;
termsHashPerField.WriteVInt32(1, startOffset - postings.lastOffsets[termID]);
termsHashPerField.WriteVInt32(1, endOffset - startOffset);
postings.lastOffsets[termID] = endOffset;
}
if (doVectorPositions)
{
BytesRef payload;
if (payloadAttribute == null)
{
payload = null;
}
else
{
payload = payloadAttribute.Payload;
}
int pos = fieldState.Position - postings.lastPositions[termID];
if (payload != null && payload.Length > 0)
{
termsHashPerField.WriteVInt32(0, (pos << 1) | 1);
termsHashPerField.WriteVInt32(0, payload.Length);
termsHashPerField.WriteBytes(0, payload.Bytes, payload.Offset, payload.Length);
hasPayloads = true;
}
else
{
termsHashPerField.WriteVInt32(0, pos << 1);
}
postings.lastPositions[termID] = fieldState.Position;
}
}
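// First occurrence of a term in this field: initialize its entries in
// the parallel arrays, then record the occurrence.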
internal override void NewTerm(int termID)
{
if (Debugging.AssertsEnabled) Debugging.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.newTerm start"));
TermVectorsPostingsArray postings = (TermVectorsPostingsArray)termsHashPerField.postingsArray;
postings.freqs[termID] = 1;
postings.lastOffsets[termID] = 0;
postings.lastPositions[termID] = 0;
WriteProx(postings, termID);
}
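// Subsequent occurrence of a term already in the hash: bump its
// frequency and record the occurrence.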
internal override void AddTerm(int termID)
{
if (Debugging.AssertsEnabled) Debugging.Assert(docState.TestPoint("TermVectorsTermsWriterPerField.addTerm start"));
TermVectorsPostingsArray postings = (TermVectorsPostingsArray)termsHashPerField.postingsArray;
postings.freqs[termID]++;
WriteProx(postings, termID);
}
[ExceptionToNetNumericConvention]
internal override void SkippingLongTerm()
{
}
internal override ParallelPostingsArray CreatePostingsArray(int size)
{
return new TermVectorsPostingsArray(size);
}
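// Per-term parallel arrays, indexed by termID.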
internal sealed class TermVectorsPostingsArray : ParallelPostingsArray
{
public TermVectorsPostingsArray(int size)
: base(size)
{
freqs = new int[size];
lastOffsets = new int[size];
lastPositions = new int[size];
}
internal int[] freqs; // How many times this term occurred in the current doc
internal int[] lastOffsets; // Last offset we saw
internal int[] lastPositions; // Last position where this term occurred
internal override ParallelPostingsArray NewInstance(int size)
{
return new TermVectorsPostingsArray(size);
}
internal override void CopyTo(ParallelPostingsArray toArray, int numToCopy)
{
if (Debugging.AssertsEnabled) Debugging.Assert(toArray is TermVectorsPostingsArray);
TermVectorsPostingsArray to = (TermVectorsPostingsArray)toArray;
base.CopyTo(toArray, numToCopy);
Array.Copy(freqs, 0, to.freqs, 0, size);
Array.Copy(lastOffsets, 0, to.lastOffsets, 0, size);
Array.Copy(lastPositions, 0, to.lastPositions, 0, size);
}
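// Each posting costs three more 32-bit ints (freq, last offset, last
// position) on top of the base parallel-array cost.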
internal override int BytesPerPosting()
{
return base.BytesPerPosting() + 3 * RamUsageEstimator.NUM_BYTES_INT32;
}
}
}
}