using Lucene.Net.Analysis.TokenAttributes;
using System;
using System.Runtime.CompilerServices;

namespace Lucene.Net.Index
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using IOUtils = Lucene.Net.Util.IOUtils;
using TokenStream = Lucene.Net.Analysis.TokenStream;

/// <summary>
/// Holds state for inverting all occurrences of a single
/// field in the document. This class doesn't do anything
/// itself; instead, it forwards the tokens produced by
/// analysis to its own consumer
/// (<see cref="InvertedDocConsumerPerField"/>). It also interacts with an
/// endConsumer (<see cref="InvertedDocEndConsumerPerField"/>).
/// </summary>
internal sealed class DocInverterPerField : DocFieldConsumerPerField
{
internal readonly FieldInfo fieldInfo;
internal readonly InvertedDocConsumerPerField consumer;
internal readonly InvertedDocEndConsumerPerField endConsumer;
internal readonly DocumentsWriterPerThread.DocState docState;
internal readonly FieldInvertState fieldState;
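
/// <summary>
/// Registers this field with both downstream chains: the inverting
/// consumer and the end consumer each provide their own per-field instance.
/// </summary>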
public DocInverterPerField(DocInverter parent, FieldInfo fieldInfo)
{
this.fieldInfo = fieldInfo;
docState = parent.docState;
fieldState = new FieldInvertState(fieldInfo.Name);
this.consumer = parent.consumer.AddField(this, fieldInfo);
this.endConsumer = parent.endConsumer.AddField(this, fieldInfo);
}
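
/// <summary>
/// Aborts the consumer first and then, in a finally block, the end
/// consumer, so both get a chance to clean up even if one throws.
/// </summary>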
[MethodImpl(MethodImplOptions.NoInlining)]
internal override void Abort()
{
try
{
consumer.Abort();
}
finally
{
endConsumer.Abort();
}
}
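
/// <summary>
/// Inverts all instances of this field in the current document: resets the
/// <see cref="FieldInvertState"/>, walks each field instance's token stream,
/// validates positions and offsets, and feeds every token to the consumer
/// before notifying both consumers that the field is finished.
/// </summary>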
public override void ProcessFields(IIndexableField[] fields, int count)
{
fieldState.Reset();
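// The consumer decides up front whether this field needs to be inverted
// at all, based on the field instances it is handed here.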
bool doInvert = consumer.Start(fields, count);
for (int i = 0; i < count; i++)
{
IIndexableField field = fields[i];
IIndexableFieldType fieldType = field.IndexableFieldType;
// TODO FI: this should be "genericized" to querying
// consumer if it wants to see this particular field
// tokenized.
if (fieldType.IsIndexed && doInvert)
{
bool analyzed = fieldType.IsTokenized && docState.analyzer != null;
// if the field omits norms, the boost cannot be indexed.
if (fieldType.OmitNorms && field.Boost != 1.0f)
{
throw new NotSupportedException("You cannot set an index-time boost: norms are omitted for field '" + field.Name + "'");
}
// only bother checking offsets if something will consume them.
// TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
bool checkOffsets = fieldType.IndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
int lastStartOffset = 0;
if (i > 0)
{
fieldState.Position += analyzed ? docState.analyzer.GetPositionIncrementGap(fieldInfo.Name) : 0;
}
/*
* To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream
* when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses,
* but rather a finally that takes note of the problem.
*/
bool succeededInProcessingField = false;
TokenStream stream = field.GetTokenStream(docState.analyzer);
// reset the TokenStream to the first token
stream.Reset();
try
{
bool hasMoreTokens = stream.IncrementToken();
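// Expose the stream's attribute source through the field state so the
// consumer can read the term, position, and offset attributes per token.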
fieldState.AttributeSource = stream;
IOffsetAttribute offsetAttribute = fieldState.AttributeSource.AddAttribute<IOffsetAttribute>();
IPositionIncrementAttribute posIncrAttribute = fieldState.AttributeSource.AddAttribute<IPositionIncrementAttribute>();
if (hasMoreTokens)
{
consumer.Start(field);
do
{
// If we hit an exception in stream.IncrementToken() below
// (which is fairly common, e.g. if the analyzer
// chokes on a given document), then it's
// non-aborting and (above) this one document
// will be marked as deleted, but still
// consume a docID
int posIncr = posIncrAttribute.PositionIncrement;
if (posIncr < 0)
{
throw new ArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.Name + "'");
}
if (fieldState.Position == 0 && posIncr == 0)
{
throw new ArgumentException("first position increment must be > 0 (got 0) for field '" + field.Name + "'");
}
int position = fieldState.Position + posIncr;
if (position > 0)
{
// NOTE: confusing: this "mirrors" the
// position++ we do below
position--;
}
else if (position < 0)
{
throw new ArgumentException("position overflow for field '" + field.Name + "'");
}
// position is legal, we can safely place it in fieldState now.
// not sure if anything will use fieldState after non-aborting exc...
fieldState.Position = position;
if (posIncr == 0)
{
fieldState.NumOverlap++;
}
if (checkOffsets)
{
int startOffset = fieldState.Offset + offsetAttribute.StartOffset;
int endOffset = fieldState.Offset + offsetAttribute.EndOffset;
if (startOffset < 0 || endOffset < startOffset)
{
throw new ArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.Name + "'");
}
if (startOffset < lastStartOffset)
{
throw new ArgumentException("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.Name + "'");
}
lastStartOffset = startOffset;
}
bool success = false;
try
{
// If we hit an exception in here, we abort
// all buffered documents since the last
// flush, on the likelihood that the
// internal state of the consumer is now
// corrupt and should not be flushed to a
// new segment:
consumer.Add();
success = true;
}
finally
{
if (!success)
{
docState.docWriter.SetAborting();
}
}
fieldState.Length++;
fieldState.Position++;
} while (stream.IncrementToken());
}
// trigger streams to perform end-of-stream operations
stream.End();
// TODO: maybe add some safety? then again, it's already checked
// when we come back around to the field...
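// stream.End() leaves the final position increment and offset in the
// attributes; fold them into the field state so a following value of the
// same field continues from the correct position and offset.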
fieldState.Position += posIncrAttribute.PositionIncrement;
fieldState.Offset += offsetAttribute.EndOffset;
if (docState.maxTermPrefix != null)
{
string msg = "Document contains at least one immense term in field=\"" + fieldInfo.Name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + docState.maxTermPrefix + "...'";
if (docState.infoStream.IsEnabled("IW"))
{
docState.infoStream.Message("IW", "ERROR: " + msg);
}
docState.maxTermPrefix = null;
throw new ArgumentException(msg);
}
/* If success was false above, an exception is propagating and we won't get here. */
succeededInProcessingField = true;
}
finally
{
if (!succeededInProcessingField)
{
IOUtils.DisposeWhileHandlingException(stream);
}
else
{
stream.Dispose();
}
if (!succeededInProcessingField && docState.infoStream.IsEnabled("DW"))
{
docState.infoStream.Message("DW", "An exception was thrown while processing field " + fieldInfo.Name);
}
}
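// Account for the analyzer's offset gap between multiple values of this
// field, and fold this instance's boost into the cumulative field boost.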
fieldState.Offset += analyzed ? docState.analyzer.GetOffsetGap(fieldInfo.Name) : 0;
fieldState.Boost *= field.Boost;
}
// LUCENE-2387: don't hang onto the field, so GC can
// reclaim
fields[i] = null;
}
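// All instances of the field have been processed; let both consumers
// finish this field for the current document.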
consumer.Finish();
endConsumer.Finish();
}

internal override FieldInfo FieldInfo => fieldInfo;
}
}