| using Lucene.Net.Analysis.Tokenattributes; |
| |
| namespace Lucene.Net.Index |
| { |
| using IOUtils = Lucene.Net.Util.IOUtils; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| using TokenStream = Lucene.Net.Analysis.TokenStream; |
| |
| /// <summary> |
| /// Holds state for inverting all occurrences of a single |
| /// field in the document. this class doesn't do anything |
| /// itself; instead, it forwards the tokens produced by |
| /// analysis to its own consumer |
| /// (InvertedDocConsumerPerField). It also interacts with an |
| /// endConsumer (InvertedDocEndConsumerPerField). |
| /// </summary> |
| public sealed class DocInverterPerField : DocFieldConsumerPerField |
| { |
| internal readonly FieldInfo fieldInfo; |
| internal readonly InvertedDocConsumerPerField Consumer; |
| internal readonly InvertedDocEndConsumerPerField EndConsumer; |
| internal readonly DocumentsWriterPerThread.DocState DocState; |
| internal readonly FieldInvertState FieldState; |
| |
| public DocInverterPerField(DocInverter parent, FieldInfo fieldInfo) |
| { |
| this.fieldInfo = fieldInfo; |
| DocState = parent.DocState; |
| FieldState = new FieldInvertState(fieldInfo.Name); |
| this.Consumer = parent.Consumer.AddField(this, fieldInfo); |
| this.EndConsumer = parent.EndConsumer.AddField(this, fieldInfo); |
| } |
| |
| public override void Abort() |
| { |
| try |
| { |
| Consumer.Abort(); |
| } |
| finally |
| { |
| EndConsumer.Abort(); |
| } |
| } |
| |
| public override void ProcessFields(IndexableField[] fields, int count) |
| { |
| FieldState.Reset(); |
| |
| bool doInvert = Consumer.Start(fields, count); |
| |
| for (int i = 0; i < count; i++) |
| { |
| IndexableField field = fields[i]; |
| IndexableFieldType fieldType = field.FieldType(); |
| |
| // TODO FI: this should be "genericized" to querying |
| // consumer if it wants to see this particular field |
| // tokenized. |
| if (fieldType.Indexed && doInvert) |
| { |
| bool analyzed = fieldType.Tokenized && DocState.Analyzer != null; |
| |
| // if the field omits norms, the boost cannot be indexed. |
| if (fieldType.OmitNorms && field.GetBoost() != 1.0f) |
| { |
| throw new System.NotSupportedException("You cannot set an index-time boost: norms are omitted for field '" + field.Name() + "'"); |
| } |
| |
| // only bother checking offsets if something will consume them. |
| // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed. |
| bool checkOffsets = fieldType.IndexOptions == FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; |
| int lastStartOffset = 0; |
| |
| if (i > 0) |
| { |
| FieldState.Position_Renamed += analyzed ? DocState.Analyzer.GetPositionIncrementGap(fieldInfo.Name) : 0; |
| } |
| |
| /* |
| * To assist people in tracking down problems in analysis components, we wish to write the field name to the infostream |
| * when we fail. We expect some caller to eventually deal with the real exception, so we don't want any 'catch' clauses, |
| * but rather a finally that takes note of the problem. |
| */ |
| |
| bool succeededInProcessingField = false; |
| |
| TokenStream stream = field.GetTokenStream(DocState.Analyzer); |
| // reset the TokenStream to the first token |
| stream.Reset(); |
| |
| try |
| { |
| bool hasMoreTokens = stream.IncrementToken(); |
| |
| FieldState.AttributeSource_Renamed = stream; |
| |
| IOffsetAttribute offsetAttribute = FieldState.AttributeSource_Renamed.AddAttribute<IOffsetAttribute>(); |
| IPositionIncrementAttribute posIncrAttribute = FieldState.AttributeSource_Renamed.AddAttribute<IPositionIncrementAttribute>(); |
| |
| if (hasMoreTokens) |
| { |
| Consumer.Start(field); |
| |
| do |
| { |
| // If we hit an exception in stream.next below |
| // (which is fairly common, eg if analyzer |
| // chokes on a given document), then it's |
| // non-aborting and (above) this one document |
| // will be marked as deleted, but still |
| // consume a docID |
| |
| int posIncr = posIncrAttribute.PositionIncrement; |
| if (posIncr < 0) |
| { |
| throw new System.ArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.Name() + "'"); |
| } |
| if (FieldState.Position_Renamed == 0 && posIncr == 0) |
| { |
| throw new System.ArgumentException("first position increment must be > 0 (got 0) for field '" + field.Name() + "'"); |
| } |
| int position = FieldState.Position_Renamed + posIncr; |
| if (position > 0) |
| { |
| // NOTE: confusing: this "mirrors" the |
| // position++ we do below |
| position--; |
| } |
| else if (position < 0) |
| { |
| throw new System.ArgumentException("position overflow for field '" + field.Name() + "'"); |
| } |
| |
| // position is legal, we can safely place it in fieldState now. |
| // not sure if anything will use fieldState after non-aborting exc... |
| FieldState.Position_Renamed = position; |
| |
| if (posIncr == 0) |
| { |
| FieldState.NumOverlap_Renamed++; |
| } |
| |
| if (checkOffsets) |
| { |
| int startOffset = FieldState.Offset_Renamed + offsetAttribute.StartOffset(); |
| int endOffset = FieldState.Offset_Renamed + offsetAttribute.EndOffset(); |
| if (startOffset < 0 || endOffset < startOffset) |
| { |
| throw new System.ArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, " + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.Name() + "'"); |
| } |
| if (startOffset < lastStartOffset) |
| { |
| throw new System.ArgumentException("offsets must not go backwards startOffset=" + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.Name() + "'"); |
| } |
| lastStartOffset = startOffset; |
| } |
| |
| bool success = false; |
| try |
| { |
| // If we hit an exception in here, we abort |
| // all buffered documents since the last |
| // flush, on the likelihood that the |
| // internal state of the consumer is now |
| // corrupt and should not be flushed to a |
| // new segment: |
| Consumer.Add(); |
| success = true; |
| } |
| finally |
| { |
| if (!success) |
| { |
| DocState.DocWriter.SetAborting(); |
| } |
| } |
| FieldState.Length_Renamed++; |
| FieldState.Position_Renamed++; |
| } while (stream.IncrementToken()); |
| } |
| // trigger streams to perform end-of-stream operations |
| stream.End(); |
| // TODO: maybe add some safety? then again, its already checked |
| // when we come back around to the field... |
| FieldState.Position_Renamed += posIncrAttribute.PositionIncrement; |
| FieldState.Offset_Renamed += offsetAttribute.EndOffset(); |
| |
| if (DocState.MaxTermPrefix != null) |
| { |
| string msg = "Document contains at least one immense term in field=\"" + fieldInfo.Name + "\" (whose UTF8 encoding is longer than the max length " + DocumentsWriterPerThread.MAX_TERM_LENGTH_UTF8 + "), all of which were skipped. Please correct the analyzer to not produce such terms. The prefix of the first immense term is: '" + DocState.MaxTermPrefix + "...'"; |
| if (DocState.InfoStream.IsEnabled("IW")) |
| { |
| DocState.InfoStream.Message("IW", "ERROR: " + msg); |
| } |
| DocState.MaxTermPrefix = null; |
| throw new System.ArgumentException(msg); |
| } |
| |
| /* if success was false above there is an exception coming through and we won't get here.*/ |
| succeededInProcessingField = true; |
| } |
| finally |
| { |
| if (!succeededInProcessingField) |
| { |
| IOUtils.CloseWhileHandlingException(stream); |
| } |
| else |
| { |
| stream.Dispose(); |
| } |
| if (!succeededInProcessingField && DocState.InfoStream.IsEnabled("DW")) |
| { |
| DocState.InfoStream.Message("DW", "An exception was thrown while processing field " + fieldInfo.Name); |
| } |
| } |
| |
| FieldState.Offset_Renamed += analyzed ? DocState.Analyzer.GetOffsetGap(fieldInfo.Name) : 0; |
| FieldState.Boost_Renamed *= field.GetBoost(); |
| } |
| |
| // LUCENE-2387: don't hang onto the field, so GC can |
| // reclaim |
| fields[i] = null; |
| } |
| |
| Consumer.Finish(); |
| EndConsumer.Finish(); |
| } |
| |
| public override FieldInfo FieldInfo |
| { |
| get |
| { |
| return fieldInfo; |
| } |
| } |
| } |
| } |