using Lucene.Net.Analysis.TokenAttributes;
using System;
using System.Runtime.CompilerServices;

namespace Lucene.Net.Index
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using IOUtils = Lucene.Net.Util.IOUtils;
using TokenStream = Lucene.Net.Analysis.TokenStream;

/// <summary>
/// Holds state for inverting all occurrences of a single
/// field in the document. This class doesn't do anything
/// itself; instead, it forwards the tokens produced by
/// analysis to its own consumer
/// (<see cref="InvertedDocConsumerPerField"/>). It also interacts with an
/// endConsumer (<see cref="InvertedDocEndConsumerPerField"/>).
/// </summary>
internal sealed class DocInverterPerField : DocFieldConsumerPerField
{
internal readonly FieldInfo fieldInfo;
internal readonly InvertedDocConsumerPerField consumer;
internal readonly InvertedDocEndConsumerPerField endConsumer;
internal readonly DocumentsWriterPerThread.DocState docState;
internal readonly FieldInvertState fieldState;
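
/// <summary>
/// Registers this field with both downstream chains: the inverting
/// consumer and the end consumer each provide their own per-field instance.
/// </summary>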
public DocInverterPerField(DocInverter parent, FieldInfo fieldInfo)
{
this.fieldInfo = fieldInfo;
docState = parent.docState;
fieldState = new FieldInvertState(fieldInfo.Name);
this.consumer = parent.consumer.AddField(this, fieldInfo);
this.endConsumer = parent.endConsumer.AddField(this, fieldInfo);
}
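
/// <summary>
/// Aborts the consumer first and then, in a finally block, the end
/// consumer, so both get a chance to clean up even if one throws.
/// </summary>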
[MethodImpl(MethodImplOptions.NoInlining)]
internal override void Abort()
{
try
{
consumer.Abort();
}
finally
{
endConsumer.Abort();
}
}
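
/// <summary>
/// Inverts all instances of this field in the current document: resets the
/// <see cref="FieldInvertState"/>, walks each field instance's token stream,
/// validates positions and offsets, and feeds every token to the consumer
/// before notifying both consumers that the field is finished.
/// </summary>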
public override void ProcessFields(IIndexableField[] fields, int count)
{
fieldState.Reset();
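// The consumer decides up front whether this field needs to be inverted
// at all, based on the field instances it is handed here.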
bool doInvert = consumer.Start(fields, count);
for (int i = 0; i < count; i++)
{
IIndexableField field = fields[i];
IIndexableFieldType fieldType = field.IndexableFieldType;
// TODO FI: this should be "genericized" to querying
// consumer if it wants to see this particular field
// tokenized.
if (fieldType.IsIndexed && doInvert)
{
bool analyzed = fieldType.IsTokenized && docState.analyzer != null;
// if the field omits norms, the boost cannot be indexed.
if (fieldType.OmitNorms && field.Boost != 1.0f)
{
throw new NotSupportedException("You cannot set an index-time boost: norms are omitted for field '" + field.Name + "'");
}
// only bother checking offsets if something will consume them.
// TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
bool checkOffsets = fieldType.IndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
int lastStartOffset = 0;
if (i > 0)
{
fieldState.Position += analyzed ? docState.analyzer.GetPositionIncrementGap(fieldInfo.Name) : 0;
}
/*
* To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
* when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
* but rather a finally that takes note of the problem.
*/
bool succeededInProcessingField = false;
TokenStream stream = field.GetTokenStream(docState.analyzer);
// reset the TokenStream to the first token
stream.Reset();
try
{
bool hasMoreTokens = stream.IncrementToken();
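// Expose the stream's attribute source through the field state so the
// consumer can read the term, position, and offset attributes per token.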
fieldState.AttributeSource = stream;
IOffsetAttribute offsetAttribute = fieldState.AttributeSource.AddAttribute<IOffsetAttribute>();
IPositionIncrementAttribute posIncrAttribute = fieldState.AttributeSource.AddAttribute<IPositionIncrementAttribute>();
if (hasMoreTokens)
{
consumer.Start(field);
do
{
// If we hit an exception in stream.IncrementToken() below
// (which is fairly common, e.g. if the analyzer
// chokes on a given document), then it's
// non-aborting and (above) this one document
// will be marked as deleted, but still
// consume a docID
int posIncr = posIncrAttribute.PositionIncrement;
if (posIncr < 0)
{
throw new ArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.Name + "'");
}
if (fieldState.Position == 0 && posIncr == 0)
{
throw new ArgumentException("first position increment must be > 0 (got 0) for field '" + field.Name + "'");
}
int position = fieldState.Position + posIncr;
if (position > 0)
{
// NOTE: confusing: this "mirrors" the
// position++ we do below
position--;
}
else if (position < 0)
{
throw new ArgumentException("position overflow for field '" + field.Name + "'");
}
// position is legal, we can safely place it in fieldState now.
// not sure if anything will use fieldState after non-aborting exc...
fieldState.Position = position;
if (posIncr == 0)
{
fieldState.NumOverlap++;
}
if (checkOffsets)
{
int startOffset = fieldState.Offset + offsetAttribute.StartOffset;
int endOffset = fieldState.Offset + offsetAttribute.EndOffset;
if (startOffset < 0 || endOffset < startOffset)
{
throw new ArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.Name + "'");
}
if (startOffset < lastStartOffset)
{
throw new ArgumentException("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.Name + "'");
}
lastStartOffset = startOffset;
}
bool success = false;
try
{
// If we hit an exception in here, we abort
// all buffered documents since the last
// flush, on the likelihood that the
// internal state of the consumer is now
// corrupt and should not be flushed to a
// new segment:
consumer.Add();
success = true;
}
finally
{
if (!success)
{
docState.docWriter.SetAborting();
}
}
fieldState.Length++;
fieldState.Position++;
} while (stream.IncrementToken());
}
// trigger streams to perform end-of-stream operations
stream.End();
// TODO: maybe add some safety? then again, it's already checked
// when we come back around to the field...
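// stream.End() leaves the final position increment and offset in the
// attributes; fold them into the field state so a following value of the
// same field continues from the correct position and offset.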
fieldState.Position += posIncrAttribute.PositionIncrement;
fieldState.Offset += offsetAttribute.EndOffset;
if (docState.maxTermPrefix != null)
{
string msg = "Document contains at least one immense term in field=\"" + fieldInfo.Name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'";
if (docState.infoStream.IsEnabled("IW"))
{
docState.infoStream.Message("IW", "ERROR: " + msg);
}
docState.maxTermPrefix = null;
throw new ArgumentException(msg);
}
/* If success was false above, an exception is propagating and we won't get here. */
succeededInProcessingField = true;
}
finally
{
if (!succeededInProcessingField)
{
IOUtils.DisposeWhileHandlingException(stream);
}
else
{
stream.Dispose();
}
if (!succeededInProcessingField && docState.infoStream.IsEnabled("DW"))
{
docState.infoStream.Message("DW", "An exception was thrown while processing field " + fieldInfo.Name);
}
}
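// Account for the analyzer's offset gap between multiple values of this
// field, and fold this instance's boost into the cumulative field boost.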
fieldState.Offset += analyzed ? docState.analyzer.GetOffsetGap(fieldInfo.Name) : 0;
fieldState.Boost *= field.Boost;
}
// LUCENE-2387: don't hang onto the field, so GC can
// reclaim
fields[i] = null;
}
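// All instances of the field have been processed; let both consumers
// finish this field for the current document.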
consumer.Finish();
endConsumer.Finish();
}

internal override FieldInfo FieldInfo => fieldInfo;
}
}