package org.apache.lucene.index;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.FieldInfo.IndexOptions;
/**
 * Holds state for inverting all occurrences of a single
 * field in the document. This class does no indexing work
 * itself; it forwards the tokens produced by analysis to
 * its own consumer ({@link InvertedDocConsumerPerField})
 * and notifies an end consumer
 * ({@link InvertedDocEndConsumerPerField}) when the field
 * has been fully processed.
 */
final class DocInverterPerField extends DocFieldConsumerPerField {

  final FieldInfo fieldInfo;
  final InvertedDocConsumerPerField consumer;
  final InvertedDocEndConsumerPerField endConsumer;
  final DocumentsWriterPerThread.DocState docState;
  final FieldInvertState fieldState;

  public DocInverterPerField(DocInverter parent, FieldInfo fieldInfo) {
    this.fieldInfo = fieldInfo;
    docState = parent.docState;
    fieldState = new FieldInvertState(fieldInfo.name);
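    // Register this field with both consumer chains; each
    // addField call returns the per-field processor that
    // tokens are forwarded to.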
    this.consumer = parent.consumer.addField(this, fieldInfo);
    this.endConsumer = parent.endConsumer.addField(this, fieldInfo);
  }

  @Override
  void abort() {
    try {
      consumer.abort();
    } finally {
      endConsumer.abort();
    }
  }

  @Override
  public void processFields(final IndexableField[] fields,
                            final int count) throws IOException {

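    // Reset per-field state (position, offset, length,
    // overlap count, boost) before processing this
    // document's instances of the field.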
    fieldState.reset();

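    // The consumer decides whether this field actually
    // needs to be inverted (e.g. a stored-only field is
    // skipped).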
    final boolean doInvert = consumer.start(fields, count);

    for (int i = 0; i < count; i++) {

      final IndexableField field = fields[i];
      final IndexableFieldType fieldType = field.fieldType();

      // TODO FI: this should be "genericized" to querying
      // consumer if it wants to see this particular field
      // tokenized.
      if (doInvert) {
        final boolean analyzed = fieldType.tokenized() && docState.analyzer != null;

        // if the field omits norms, the boost cannot be indexed.
        if (fieldType.omitNorms() && field.boost() != 1.0f) {
          throw new UnsupportedOperationException("You cannot set an index-time boost: norms are omitted for field '" + field.name() + "'");
        }

        // only bother checking offsets if something will consume them.
        // TODO: after we fix analyzers, also check if termVectorOffsets will be indexed.
        final boolean checkOffsets = fieldType.indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
        int lastStartOffset = 0;

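        // For multi-valued fields, add the analyzer's
        // position increment gap between instances so that
        // phrase and span queries do not match across value
        // boundaries.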
        if (i > 0) {
          fieldState.position += analyzed ? docState.analyzer.getPositionIncrementGap(fieldInfo.name) : 0;
        }

        try (TokenStream stream = field.tokenStream(docState.analyzer)) {
          // reset the TokenStream to the first token
          stream.reset();
          boolean hasMoreTokens = stream.incrementToken();

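          // Expose the stream's attribute source so
          // downstream consumers can read token attributes
          // directly from fieldState.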
          fieldState.attributeSource = stream;

          OffsetAttribute offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
          PositionIncrementAttribute posIncrAttribute = fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);

          if (hasMoreTokens) {
            consumer.start(field);

            do {
              // If we hit an exception in stream.incrementToken
              // below (which is fairly common, e.g. if the
              // analyzer chokes on a given document), then it's
              // non-aborting: this one document will be marked
              // as deleted, but will still consume a docID

              final int posIncr = posIncrAttribute.getPositionIncrement();
              if (posIncr < 0) {
                throw new IllegalArgumentException("position increment must be >=0 (got " + posIncr + ") for field '" + field.name() + "'");
              }
              if (fieldState.position == 0 && posIncr == 0) {
                throw new IllegalArgumentException("first position increment must be > 0 (got 0) for field '" + field.name() + "'");
              }
              int position = fieldState.position + posIncr;
              if (position > 0) {
                // NOTE: this decrement "mirrors" the
                // fieldState.position++ we do below, after the
                // token has been consumed
                position--;
              } else if (position < 0) {
                throw new IllegalArgumentException("position overflow for field '" + field.name() + "'");
              }

              // The position is legal, so we can safely record it
              // in fieldState now; it is unclear whether anything
              // reads fieldState after a non-aborting exception...
              fieldState.position = position;

              if (posIncr == 0) {
                fieldState.numOverlap++;
              }

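              // Offsets are cumulative across field instances:
              // fieldState.offset holds the running base offset
              // for this value.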
              if (checkOffsets) {
                int startOffset = fieldState.offset + offsetAttribute.startOffset();
                int endOffset = fieldState.offset + offsetAttribute.endOffset();
                if (startOffset < 0 || endOffset < startOffset) {
                  throw new IllegalArgumentException("startOffset must be non-negative, and endOffset must be >= startOffset, "
                      + "startOffset=" + startOffset + ",endOffset=" + endOffset + " for field '" + field.name() + "'");
                }
                if (startOffset < lastStartOffset) {
                  throw new IllegalArgumentException("offsets must not go backwards startOffset="
                      + startOffset + " is < lastStartOffset=" + lastStartOffset + " for field '" + field.name() + "'");
                }
                lastStartOffset = startOffset;
              }

              boolean success = false;
              try {
                // If we hit an exception in here, we abort
                // all buffered documents since the last
                // flush, on the likelihood that the
                // internal state of the consumer is now
                // corrupt and should not be flushed to a
                // new segment:
                consumer.add();
                success = true;
              } finally {
                if (!success) {
                  docState.docWriter.setAborting();
                }
              }
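              // Token accepted: count it toward the field's
              // length and advance the position for the next
              // token.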
              fieldState.length++;
              fieldState.position++;
            } while (stream.incrementToken());
          }
          // trigger the stream to perform end-of-stream operations
          stream.end();
          // TODO: maybe add some safety? then again, it's already checked
          // when we come back around to the field...
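          // Accumulate the final position increment and end
          // offset reported by end(), so trailing gaps (e.g.
          // from stopwords removed at the end of the value)
          // carry over into the next value.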
          fieldState.position += posIncrAttribute.getPositionIncrement();
          fieldState.offset += offsetAttribute.endOffset();
        }

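        // Add the analyzer's offset gap between field
        // instances so the offsets of later values do not
        // overlap earlier ones.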
        fieldState.offset += analyzed ? docState.analyzer.getOffsetGap(fieldInfo.name) : 0;
        fieldState.boost *= field.boost();
      }

      // LUCENE-2387: don't hang onto the field, so GC can
      // reclaim it
      fields[i] = null;
    }

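    // Notify both consumer chains that all instances of
    // this field in this document have been processed.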
    consumer.finish();
    endConsumer.finish();
  }

  @Override
  FieldInfo getFieldInfo() {
    return fieldInfo;
  }
}