// blob: f51f0c6f96788113ea215b88b90114778af561ca [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs;
import java.io.IOException;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
/**
* Extension of {@link PostingsWriterBase}, adding a push
* API for writing each element of the postings. This API
* is somewhat analogous to an XML SAX API, while {@link
* PostingsWriterBase} is more like an XML DOM API.
*
* @see PostingsReaderBase
* @lucene.experimental
*/
// TODO: find a better name; this defines the API that the
// terms dict impls use to talk to a postings impl.
// TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer
public abstract class PushPostingsWriterBase extends PostingsWriterBase {

  // Enum instance handed back to TermsEnum.postings() on every writeTerm call so it can be reused.
  private PostingsEnum postingsEnum;

  // Flags passed to TermsEnum.postings(); recomputed whenever setField is called.
  private int enumFlags;

  /** {@link FieldInfo} of the field currently being written. */
  protected FieldInfo fieldInfo;

  /** {@link IndexOptions} of the field currently being written. */
  protected IndexOptions indexOptions;

  /** True if the current field writes freqs. */
  protected boolean writeFreqs;

  /** True if the current field writes positions. */
  protected boolean writePositions;

  /** True if the current field writes payloads. */
  protected boolean writePayloads;

  /** True if the current field writes offsets. */
  protected boolean writeOffsets;

  /** Sole constructor. (For invocation by subclass
   *  constructors, typically implicit.) */
  protected PushPostingsWriterBase() {
  }

  /** Return a newly created empty TermState */
  public abstract BlockTermState newTermState() throws IOException;

  /** Start a new term. Note that a matching call to {@link
   *  #finishTerm(BlockTermState)} is made only if the term has at least one
   *  document.
   *
   *  @param norms norms of the current field, or {@code null} when the field
   *         has no norms */
  public abstract void startTerm(NumericDocValues norms) throws IOException;

  /** Finishes the current term. The provided {@link
   *  BlockTermState} contains the term's summary statistics
   *  (doc freq and total term freq), and will hold metadata
   *  from PBF when returned. */
  public abstract void finishTerm(BlockTermState state) throws IOException;

  /**
   * Sets the current field for writing; called when the writing
   * switches to another field. Caches the field's index options,
   * derives the {@code write*} flags from them, and selects the
   * {@link PostingsEnum} flags used by subsequent
   * {@code writeTerm} calls.
   */
  @Override
  public void setField(FieldInfo fieldInfo) {
    this.fieldInfo = fieldInfo;
    this.indexOptions = fieldInfo.getIndexOptions();

    // IndexOptions are ordered, so "at least X" is an ordinal comparison.
    this.writeFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    this.writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    this.writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    this.writePayloads = fieldInfo.hasPayloads();

    this.enumFlags = selectEnumFlags();
  }

  /** Picks the {@link PostingsEnum} flags matching the current {@code write*} settings. */
  private int selectEnumFlags() {
    if (writeFreqs == false) {
      return 0;
    }
    if (writePositions == false) {
      return PostingsEnum.FREQS;
    }
    if (writeOffsets) {
      return writePayloads ? (PostingsEnum.PAYLOADS | PostingsEnum.OFFSETS) : PostingsEnum.OFFSETS;
    }
    return writePayloads ? PostingsEnum.PAYLOADS : PostingsEnum.POSITIONS;
  }

  /**
   * Pulls the postings of the current term from {@code termsEnum} and pushes
   * them, element by element, through {@link #startTerm}, {@link #startDoc},
   * {@link #addPosition}, {@link #finishDoc} and {@link #finishTerm}.
   * Every document visited is also marked in {@code docsSeen}.
   * The {@code term} argument is not read by this method.
   *
   * @return the finished term state, or {@code null} if the term has no documents
   */
  @Override
  public final BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen, NormsProducer norms) throws IOException {
    final NumericDocValues normValues = fieldInfo.hasNorms() ? norms.getNorms(fieldInfo) : null;
    startTerm(normValues);

    postingsEnum = termsEnum.postings(postingsEnum, enumFlags);
    assert postingsEnum != null;

    int docCount = 0;
    long freqSum = 0;
    int docID;
    while ((docID = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      docCount++;
      docsSeen.set(docID);

      // -1 signals "frequencies omitted" to startDoc.
      int freq = -1;
      if (writeFreqs) {
        freq = postingsEnum.freq();
        freqSum += freq;
      }
      startDoc(docID, freq);

      if (writePositions) {
        for (int remaining = freq; remaining > 0; remaining--) {
          final int position = postingsEnum.nextPosition();
          final BytesRef payload = writePayloads ? postingsEnum.getPayload() : null;

          // -1 signals "offsets not indexed" to addPosition.
          int startOffset = -1;
          int endOffset = -1;
          if (writeOffsets) {
            startOffset = postingsEnum.startOffset();
            endOffset = postingsEnum.endOffset();
          }
          addPosition(position, payload, startOffset, endOffset);
        }
      }

      finishDoc();
    }

    // No documents: finishTerm is intentionally not called.
    if (docCount == 0) {
      return null;
    }

    final BlockTermState state = newTermState();
    state.docFreq = docCount;
    state.totalTermFreq = writeFreqs ? freqSum : -1;
    finishTerm(state);
    return state;
  }

  /** Adds a new doc in this term.
   *  <code>freq</code> will be -1 when term frequencies are omitted
   *  for the field. */
  public abstract void startDoc(int docID, int freq) throws IOException;

  /** Add a new position and payload, and start/end offset. A
   *  null payload means no payload; a non-null payload with
   *  zero length also means no payload. Caller may reuse
   *  the {@link BytesRef} for the payload between calls
   *  (method must fully consume the payload). <code>startOffset</code>
   *  and <code>endOffset</code> will be -1 when offsets are not indexed. */
  public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;

  /** Called when we are done adding positions and payloads
   *  for each doc. */
  public abstract void finishDoc() throws IOException;
}