blob: a310cf65abf6c746af7b00a36a14adda54b065f2 [file] [log] [blame]
package org.apache.lucene.codecs;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
/**
* Extension of {@link PostingsWriterBase}, adding a push
* API for writing each element of the postings. This API
* is somewhat analogous to an XML SAX API, while {@link
* PostingsWriterBase} is more like an XML DOM API.
*
* @see PostingsReaderBase
* @lucene.experimental
*/
// TODO: find a better name; this defines the API that the
// terms dict impls use to talk to a postings impl.
// TermsDict + PostingsReader/WriterBase == PostingsConsumer/Producer
public abstract class PushPostingsWriterBase extends PostingsWriterBase {

  // Enumerators handed back to the TermsEnum on every writeTerm call so they can be reused.
  private DocsEnum docsEnum;
  private DocsAndPositionsEnum posEnum;

  // Flags passed when pulling the enumerators in writeTerm; recomputed by setField.
  private int enumFlags;

  /** The {@link FieldInfo} of the field currently being written. */
  protected FieldInfo fieldInfo;

  /** The {@link IndexOptions} of the field currently being written. */
  protected IndexOptions indexOptions;

  /** Whether term frequencies are written for the current field. */
  protected boolean writeFreqs;

  /** Whether positions are written for the current field. */
  protected boolean writePositions;

  /** Whether payloads are written for the current field. */
  protected boolean writePayloads;

  /** Whether offsets are written for the current field. */
  protected boolean writeOffsets;

  /**
   * Sole constructor. (For invocation by subclass
   * constructors, typically implicit.)
   */
  protected PushPostingsWriterBase() {
  }

  /**
   * Called once after startup, before any terms have been added.
   * Implementations typically write a header to the provided
   * {@code termsOut}.
   */
  public abstract void init(IndexOutput termsOut) throws IOException;

  /** Returns a newly created, empty TermState. */
  public abstract BlockTermState newTermState() throws IOException;

  /**
   * Starts a new term. A matching call to
   * {@link #finishTerm(BlockTermState)} is only made when the term
   * has at least one document.
   */
  public abstract void startTerm() throws IOException;

  /**
   * Finishes the current term. The provided {@link BlockTermState}
   * carries the term's summary statistics, and will hold metadata
   * from PBF when returned.
   */
  public abstract void finishTerm(BlockTermState state) throws IOException;

  /**
   * Encodes term metadata as a long[] plus generic bytes.
   * {@code absolute} controls whether the current term is delta
   * encoded relative to the latest term.
   *
   * <p>Elements of {@code longs} are usually file pointers, so each
   * one always increases as new terms are consumed. {@code out}
   * receives generic bytes, which are not monotonic.
   *
   * <p>NOTE: the long[] may sometimes hold "don't care" values that
   * are unused; e.g. the pointer to the postings list may be
   * undefined for some terms and defined for others when the format
   * inlines some postings data into the terms dictionary. In that
   * case the postings writer should always reuse the last value, so
   * that every element of the metadata long[] stays monotonic.
   */
  public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;

  /**
   * Sets the current field for writing, and returns the fixed length
   * of the long[] metadata (which is fixed per field). Called when
   * writing switches to another field.
   *
   * <p>This base implementation records the field, derives the
   * {@code write*} switches from its index options and payload
   * setting, chooses the enum flags later used by
   * {@code writeTerm}, and returns 0.
   */
  // TODO: better name?
  public int setField(FieldInfo fieldInfo) {
    this.fieldInfo = fieldInfo;
    indexOptions = fieldInfo.getIndexOptions();
    writeFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    writePositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
    writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
    writePayloads = fieldInfo.hasPayloads();

    int flags = 0;
    if (writeFreqs) {
      if (writePositions) {
        // Positions enum: request only the optional extras that will actually be written.
        if (writeOffsets) {
          flags |= DocsAndPositionsEnum.FLAG_OFFSETS;
        }
        if (writePayloads) {
          flags |= DocsAndPositionsEnum.FLAG_PAYLOADS;
        }
      } else {
        // Docs enum with frequencies only.
        flags = DocsEnum.FLAG_FREQS;
      }
    }
    enumFlags = flags;

    return 0;
  }

  /**
   * Pushes every posting of the term the given {@code termsEnum} is
   * positioned on through {@link #startTerm()},
   * {@link #startDoc(int, int)},
   * {@link #addPosition(int, BytesRef, int, int)} and
   * {@link #finishDoc()}, setting each visited document in
   * {@code docsSeen}.
   *
   * @return {@code null} when the term has no documents; otherwise a
   *         new {@link BlockTermState} with {@code docFreq} and
   *         {@code totalTermFreq} filled in ({@code totalTermFreq} is
   *         -1 when frequencies are not written), after it has been
   *         passed to {@link #finishTerm(BlockTermState)}
   */
  @Override
  public final BlockTermState writeTerm(BytesRef term, TermsEnum termsEnum, FixedBitSet docsSeen) throws IOException {
    startTerm();

    if (writePositions) {
      posEnum = termsEnum.docsAndPositions(null, posEnum, enumFlags);
      docsEnum = posEnum;
    } else {
      docsEnum = termsEnum.docs(null, docsEnum, enumFlags);
    }
    assert docsEnum != null;

    int docCount = 0;
    long freqSum = 0;
    for (int doc = docsEnum.nextDoc(); doc != DocsEnum.NO_MORE_DOCS; doc = docsEnum.nextDoc()) {
      docCount++;
      docsSeen.set(doc);

      // -1 tells startDoc that frequencies are omitted for this field.
      int freq = -1;
      if (writeFreqs) {
        freq = docsEnum.freq();
        freqSum += freq;
      }
      startDoc(doc, freq);

      if (writePositions) {
        for (int remaining = freq; remaining > 0; remaining--) {
          final int position = posEnum.nextPosition();
          final BytesRef payload = writePayloads ? posEnum.getPayload() : null;
          // -1 marks offsets as not indexed.
          final int startOffset = writeOffsets ? posEnum.startOffset() : -1;
          final int endOffset = writeOffsets ? posEnum.endOffset() : -1;
          addPosition(position, payload, startOffset, endOffset);
        }
      }

      finishDoc();
    }

    if (docCount == 0) {
      // No documents: finishTerm is not called for this term.
      return null;
    }

    final BlockTermState state = newTermState();
    state.docFreq = docCount;
    state.totalTermFreq = writeFreqs ? freqSum : -1;
    finishTerm(state);
    return state;
  }

  /**
   * Adds a new doc in this term. <code>freq</code> will be -1 when
   * term frequencies are omitted for the field.
   */
  public abstract void startDoc(int docID, int freq) throws IOException;

  /**
   * Adds a new position and payload, and start/end offset. A null
   * payload means no payload; a non-null payload with zero length
   * also means no payload. The caller may reuse the {@link BytesRef}
   * for the payload between calls (the method must fully consume the
   * payload). <code>startOffset</code> and <code>endOffset</code>
   * will be -1 when offsets are not indexed.
   */
  public abstract void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException;

  /**
   * Called when we are done adding positions and payloads for each
   * doc.
   */
  public abstract void finishDoc() throws IOException;
}