| package org.apache.lucene.codecs.lucene41; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import static org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat.BLOCK_SIZE; |
| import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_DATA_SIZE; |
| import static org.apache.lucene.codecs.lucene41.ForUtil.MAX_ENCODED_SIZE; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| import org.apache.lucene.codecs.BlockTermState; |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.codecs.PostingsWriterBase; |
| import org.apache.lucene.index.CorruptIndexException; |
| import org.apache.lucene.index.FieldInfo; |
| import org.apache.lucene.index.FieldInfo.IndexOptions; |
| import org.apache.lucene.index.IndexFileNames; |
| import org.apache.lucene.index.SegmentWriteState; |
| import org.apache.lucene.index.TermState; |
| import org.apache.lucene.store.DataOutput; |
| import org.apache.lucene.store.IndexOutput; |
| import org.apache.lucene.store.RAMOutputStream; |
| import org.apache.lucene.util.ArrayUtil; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.packed.PackedInts; |
| |
| |
/**
 * Concrete class that writes the docID list (and, depending on how the field
 * is indexed, the freq, position, offset and payload data) in this codec's
 * postings format.
 *
 * The postings list for each term is stored separately.
 *
 * @see Lucene41SkipWriter for details about skipping setting and postings layout.
 * @lucene.experimental
 */
| public final class Lucene41PostingsWriter extends PostingsWriterBase { |
| |
| /** |
| * Expert: The maximum number of skip levels. Smaller values result in |
| * slightly smaller indexes, but slower skipping in big posting lists. |
| */ |
| static final int maxSkipLevels = 10; |
| |
| final static String TERMS_CODEC = "Lucene41PostingsWriterTerms"; |
| final static String DOC_CODEC = "Lucene41PostingsWriterDoc"; |
| final static String POS_CODEC = "Lucene41PostingsWriterPos"; |
| final static String PAY_CODEC = "Lucene41PostingsWriterPay"; |
| |
| // Increment version to change it |
| final static int VERSION_START = 0; |
| final static int VERSION_META_ARRAY = 1; |
| final static int VERSION_CURRENT = VERSION_META_ARRAY; |
| |
| final IndexOutput docOut; |
| final IndexOutput posOut; |
| final IndexOutput payOut; |
| |
| final static IntBlockTermState emptyState = new IntBlockTermState(); |
| IntBlockTermState lastState; |
| |
| // How current field indexes postings: |
| private boolean fieldHasFreqs; |
| private boolean fieldHasPositions; |
| private boolean fieldHasOffsets; |
| private boolean fieldHasPayloads; |
| |
| // Holds starting file pointers for current term: |
| private long docStartFP; |
| private long posStartFP; |
| private long payStartFP; |
| |
| final int[] docDeltaBuffer; |
| final int[] freqBuffer; |
| private int docBufferUpto; |
| |
| final int[] posDeltaBuffer; |
| final int[] payloadLengthBuffer; |
| final int[] offsetStartDeltaBuffer; |
| final int[] offsetLengthBuffer; |
| private int posBufferUpto; |
| |
| private byte[] payloadBytes; |
| private int payloadByteUpto; |
| |
| private int lastBlockDocID; |
| private long lastBlockPosFP; |
| private long lastBlockPayFP; |
| private int lastBlockPosBufferUpto; |
| private int lastBlockPayloadByteUpto; |
| |
| private int lastDocID; |
| private int lastPosition; |
| private int lastStartOffset; |
| private int docCount; |
| |
| final byte[] encoded; |
| |
| private final ForUtil forUtil; |
| private final Lucene41SkipWriter skipWriter; |
| |
| /** Creates a postings writer with the specified PackedInts overhead ratio */ |
| // TODO: does this ctor even make sense? |
| public Lucene41PostingsWriter(SegmentWriteState state, float acceptableOverheadRatio) throws IOException { |
| super(); |
| |
| docOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene41PostingsFormat.DOC_EXTENSION), |
| state.context); |
| IndexOutput posOut = null; |
| IndexOutput payOut = null; |
| boolean success = false; |
| try { |
| CodecUtil.writeHeader(docOut, DOC_CODEC, VERSION_CURRENT); |
| forUtil = new ForUtil(acceptableOverheadRatio, docOut); |
| if (state.fieldInfos.hasProx()) { |
| posDeltaBuffer = new int[MAX_DATA_SIZE]; |
| posOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene41PostingsFormat.POS_EXTENSION), |
| state.context); |
| CodecUtil.writeHeader(posOut, POS_CODEC, VERSION_CURRENT); |
| |
| if (state.fieldInfos.hasPayloads()) { |
| payloadBytes = new byte[128]; |
| payloadLengthBuffer = new int[MAX_DATA_SIZE]; |
| } else { |
| payloadBytes = null; |
| payloadLengthBuffer = null; |
| } |
| |
| if (state.fieldInfos.hasOffsets()) { |
| offsetStartDeltaBuffer = new int[MAX_DATA_SIZE]; |
| offsetLengthBuffer = new int[MAX_DATA_SIZE]; |
| } else { |
| offsetStartDeltaBuffer = null; |
| offsetLengthBuffer = null; |
| } |
| |
| if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) { |
| payOut = state.directory.createOutput(IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene41PostingsFormat.PAY_EXTENSION), |
| state.context); |
| CodecUtil.writeHeader(payOut, PAY_CODEC, VERSION_CURRENT); |
| } |
| } else { |
| posDeltaBuffer = null; |
| payloadLengthBuffer = null; |
| offsetStartDeltaBuffer = null; |
| offsetLengthBuffer = null; |
| payloadBytes = null; |
| } |
| this.payOut = payOut; |
| this.posOut = posOut; |
| success = true; |
| } finally { |
| if (!success) { |
| IOUtils.closeWhileHandlingException(docOut, posOut, payOut); |
| } |
| } |
| |
| docDeltaBuffer = new int[MAX_DATA_SIZE]; |
| freqBuffer = new int[MAX_DATA_SIZE]; |
| |
| // TODO: should we try skipping every 2/4 blocks...? |
| skipWriter = new Lucene41SkipWriter(maxSkipLevels, |
| BLOCK_SIZE, |
| state.segmentInfo.getDocCount(), |
| docOut, |
| posOut, |
| payOut); |
| |
| encoded = new byte[MAX_ENCODED_SIZE]; |
| } |
| |
/**
 * Creates a postings writer that uses <code>PackedInts.COMPACT</code> as its
 * acceptable overhead ratio.
 *
 * @param state segment write state forwarded to the two-argument constructor
 * @throws IOException if the two-argument constructor fails
 */
public Lucene41PostingsWriter(SegmentWriteState state) throws IOException {
  this(state, PackedInts.COMPACT);
}
| |
| final static class IntBlockTermState extends BlockTermState { |
| long docStartFP = 0; |
| long posStartFP = 0; |
| long payStartFP = 0; |
| long skipOffset = -1; |
| long lastPosBlockOffset = -1; |
| // docid when there is a single pulsed posting, otherwise -1 |
| // freq is always implicitly totalTermFreq in this case. |
| int singletonDocID = -1; |
| |
| @Override |
| public IntBlockTermState clone() { |
| IntBlockTermState other = new IntBlockTermState(); |
| other.copyFrom(this); |
| return other; |
| } |
| |
| @Override |
| public void copyFrom(TermState _other) { |
| super.copyFrom(_other); |
| IntBlockTermState other = (IntBlockTermState) _other; |
| docStartFP = other.docStartFP; |
| posStartFP = other.posStartFP; |
| payStartFP = other.payStartFP; |
| lastPosBlockOffset = other.lastPosBlockOffset; |
| skipOffset = other.skipOffset; |
| singletonDocID = other.singletonDocID; |
| } |
| |
| |
| @Override |
| public String toString() { |
| return super.toString() + " docStartFP=" + docStartFP + " posStartFP=" + posStartFP + " payStartFP=" + payStartFP + " lastPosBlockOffset=" + lastPosBlockOffset + " singletonDocID=" + singletonDocID; |
| } |
| } |
| |
| @Override |
| public IntBlockTermState newTermState() { |
| return new IntBlockTermState(); |
| } |
| |
| @Override |
| public void init(IndexOutput termsOut) throws IOException { |
| CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT); |
| termsOut.writeVInt(BLOCK_SIZE); |
| } |
| |
| @Override |
| public int setField(FieldInfo fieldInfo) { |
| IndexOptions indexOptions = fieldInfo.getIndexOptions(); |
| fieldHasFreqs = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; |
| fieldHasPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; |
| fieldHasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; |
| fieldHasPayloads = fieldInfo.hasPayloads(); |
| skipWriter.setField(fieldHasPositions, fieldHasOffsets, fieldHasPayloads); |
| lastState = emptyState; |
| if (fieldHasPositions) { |
| if (fieldHasPayloads || fieldHasOffsets) { |
| return 3; // doc + pos + pay FP |
| } else { |
| return 2; // doc + pos FP |
| } |
| } else { |
| return 1; // doc FP |
| } |
| } |
| |
| @Override |
| public void startTerm() { |
| docStartFP = docOut.getFilePointer(); |
| if (fieldHasPositions) { |
| posStartFP = posOut.getFilePointer(); |
| if (fieldHasPayloads || fieldHasOffsets) { |
| payStartFP = payOut.getFilePointer(); |
| } |
| } |
| lastDocID = 0; |
| lastBlockDocID = -1; |
| // if (DEBUG) { |
| // System.out.println("FPW.startTerm startFP=" + docStartFP); |
| // } |
| skipWriter.resetSkip(); |
| } |
| |
| @Override |
| public void startDoc(int docID, int termDocFreq) throws IOException { |
| // if (DEBUG) { |
| // System.out.println("FPW.startDoc docID["+docBufferUpto+"]=" + docID); |
| // } |
| // Have collected a block of docs, and get a new doc. |
| // Should write skip data as well as postings list for |
| // current block. |
| if (lastBlockDocID != -1 && docBufferUpto == 0) { |
| // if (DEBUG) { |
| // System.out.println(" bufferSkip at writeBlock: lastDocID=" + lastBlockDocID + " docCount=" + (docCount-1)); |
| // } |
| skipWriter.bufferSkip(lastBlockDocID, docCount, lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockPayloadByteUpto); |
| } |
| |
| final int docDelta = docID - lastDocID; |
| |
| if (docID < 0 || (docCount > 0 && docDelta <= 0)) { |
| throw new CorruptIndexException("docs out of order (" + docID + " <= " + lastDocID + " ) (docOut: " + docOut + ")"); |
| } |
| |
| docDeltaBuffer[docBufferUpto] = docDelta; |
| // if (DEBUG) { |
| // System.out.println(" docDeltaBuffer[" + docBufferUpto + "]=" + docDelta); |
| // } |
| if (fieldHasFreqs) { |
| freqBuffer[docBufferUpto] = termDocFreq; |
| } |
| docBufferUpto++; |
| docCount++; |
| |
| if (docBufferUpto == BLOCK_SIZE) { |
| // if (DEBUG) { |
| // System.out.println(" write docDelta block @ fp=" + docOut.getFilePointer()); |
| // } |
| forUtil.writeBlock(docDeltaBuffer, encoded, docOut); |
| if (fieldHasFreqs) { |
| // if (DEBUG) { |
| // System.out.println(" write freq block @ fp=" + docOut.getFilePointer()); |
| // } |
| forUtil.writeBlock(freqBuffer, encoded, docOut); |
| } |
| // NOTE: don't set docBufferUpto back to 0 here; |
| // finishDoc will do so (because it needs to see that |
| // the block was filled so it can save skip data) |
| } |
| |
| |
| lastDocID = docID; |
| lastPosition = 0; |
| lastStartOffset = 0; |
| } |
| |
| /** Add a new position & payload */ |
| @Override |
| public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) throws IOException { |
| // if (DEBUG) { |
| // System.out.println("FPW.addPosition pos=" + position + " posBufferUpto=" + posBufferUpto + (fieldHasPayloads ? " payloadByteUpto=" + payloadByteUpto: "")); |
| // } |
| posDeltaBuffer[posBufferUpto] = position - lastPosition; |
| if (fieldHasPayloads) { |
| if (payload == null || payload.length == 0) { |
| // no payload |
| payloadLengthBuffer[posBufferUpto] = 0; |
| } else { |
| payloadLengthBuffer[posBufferUpto] = payload.length; |
| if (payloadByteUpto + payload.length > payloadBytes.length) { |
| payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length); |
| } |
| System.arraycopy(payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length); |
| payloadByteUpto += payload.length; |
| } |
| } |
| |
| if (fieldHasOffsets) { |
| assert startOffset >= lastStartOffset; |
| assert endOffset >= startOffset; |
| offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset; |
| offsetLengthBuffer[posBufferUpto] = endOffset - startOffset; |
| lastStartOffset = startOffset; |
| } |
| |
| posBufferUpto++; |
| lastPosition = position; |
| if (posBufferUpto == BLOCK_SIZE) { |
| // if (DEBUG) { |
| // System.out.println(" write pos bulk block @ fp=" + posOut.getFilePointer()); |
| // } |
| forUtil.writeBlock(posDeltaBuffer, encoded, posOut); |
| |
| if (fieldHasPayloads) { |
| forUtil.writeBlock(payloadLengthBuffer, encoded, payOut); |
| payOut.writeVInt(payloadByteUpto); |
| payOut.writeBytes(payloadBytes, 0, payloadByteUpto); |
| payloadByteUpto = 0; |
| } |
| if (fieldHasOffsets) { |
| forUtil.writeBlock(offsetStartDeltaBuffer, encoded, payOut); |
| forUtil.writeBlock(offsetLengthBuffer, encoded, payOut); |
| } |
| posBufferUpto = 0; |
| } |
| } |
| |
| @Override |
| public void finishDoc() throws IOException { |
| // Since we don't know df for current term, we had to buffer |
| // those skip data for each block, and when a new doc comes, |
| // write them to skip file. |
| if (docBufferUpto == BLOCK_SIZE) { |
| lastBlockDocID = lastDocID; |
| if (posOut != null) { |
| if (payOut != null) { |
| lastBlockPayFP = payOut.getFilePointer(); |
| } |
| lastBlockPosFP = posOut.getFilePointer(); |
| lastBlockPosBufferUpto = posBufferUpto; |
| lastBlockPayloadByteUpto = payloadByteUpto; |
| } |
| // if (DEBUG) { |
| // System.out.println(" docBufferUpto="+docBufferUpto+" now get lastBlockDocID="+lastBlockDocID+" lastBlockPosFP=" + lastBlockPosFP + " lastBlockPosBufferUpto=" + lastBlockPosBufferUpto + " lastBlockPayloadByteUpto=" + lastBlockPayloadByteUpto); |
| // } |
| docBufferUpto = 0; |
| } |
| } |
| |
| /** Called when we are done adding docs to this term */ |
| @Override |
| public void finishTerm(BlockTermState _state) throws IOException { |
| IntBlockTermState state = (IntBlockTermState) _state; |
| assert state.docFreq > 0; |
| |
| // TODO: wasteful we are counting this (counting # docs |
| // for this term) in two places? |
| assert state.docFreq == docCount: state.docFreq + " vs " + docCount; |
| |
| // if (DEBUG) { |
| // System.out.println("FPW.finishTerm docFreq=" + state.docFreq); |
| // } |
| |
| // if (DEBUG) { |
| // if (docBufferUpto > 0) { |
| // System.out.println(" write doc/freq vInt block (count=" + docBufferUpto + ") at fp=" + docOut.getFilePointer() + " docStartFP=" + docStartFP); |
| // } |
| // } |
| |
| // docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to it. |
| final int singletonDocID; |
| if (state.docFreq == 1) { |
| // pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq |
| singletonDocID = docDeltaBuffer[0]; |
| } else { |
| singletonDocID = -1; |
| // vInt encode the remaining doc deltas and freqs: |
| for(int i=0;i<docBufferUpto;i++) { |
| final int docDelta = docDeltaBuffer[i]; |
| final int freq = freqBuffer[i]; |
| if (!fieldHasFreqs) { |
| docOut.writeVInt(docDelta); |
| } else if (freqBuffer[i] == 1) { |
| docOut.writeVInt((docDelta<<1)|1); |
| } else { |
| docOut.writeVInt(docDelta<<1); |
| docOut.writeVInt(freq); |
| } |
| } |
| } |
| |
| final long lastPosBlockOffset; |
| |
| if (fieldHasPositions) { |
| // if (DEBUG) { |
| // if (posBufferUpto > 0) { |
| // System.out.println(" write pos vInt block (count=" + posBufferUpto + ") at fp=" + posOut.getFilePointer() + " posStartFP=" + posStartFP + " hasPayloads=" + fieldHasPayloads + " hasOffsets=" + fieldHasOffsets); |
| // } |
| // } |
| |
| // totalTermFreq is just total number of positions(or payloads, or offsets) |
| // associated with current term. |
| assert state.totalTermFreq != -1; |
| if (state.totalTermFreq > BLOCK_SIZE) { |
| // record file offset for last pos in last block |
| lastPosBlockOffset = posOut.getFilePointer() - posStartFP; |
| } else { |
| lastPosBlockOffset = -1; |
| } |
| if (posBufferUpto > 0) { |
| // TODO: should we send offsets/payloads to |
| // .pay...? seems wasteful (have to store extra |
| // vLong for low (< BLOCK_SIZE) DF terms = vast vast |
| // majority) |
| |
| // vInt encode the remaining positions/payloads/offsets: |
| int lastPayloadLength = -1; // force first payload length to be written |
| int lastOffsetLength = -1; // force first offset length to be written |
| int payloadBytesReadUpto = 0; |
| for(int i=0;i<posBufferUpto;i++) { |
| final int posDelta = posDeltaBuffer[i]; |
| if (fieldHasPayloads) { |
| final int payloadLength = payloadLengthBuffer[i]; |
| if (payloadLength != lastPayloadLength) { |
| lastPayloadLength = payloadLength; |
| posOut.writeVInt((posDelta<<1)|1); |
| posOut.writeVInt(payloadLength); |
| } else { |
| posOut.writeVInt(posDelta<<1); |
| } |
| |
| // if (DEBUG) { |
| // System.out.println(" i=" + i + " payloadLen=" + payloadLength); |
| // } |
| |
| if (payloadLength != 0) { |
| // if (DEBUG) { |
| // System.out.println(" write payload @ pos.fp=" + posOut.getFilePointer()); |
| // } |
| posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength); |
| payloadBytesReadUpto += payloadLength; |
| } |
| } else { |
| posOut.writeVInt(posDelta); |
| } |
| |
| if (fieldHasOffsets) { |
| // if (DEBUG) { |
| // System.out.println(" write offset @ pos.fp=" + posOut.getFilePointer()); |
| // } |
| int delta = offsetStartDeltaBuffer[i]; |
| int length = offsetLengthBuffer[i]; |
| if (length == lastOffsetLength) { |
| posOut.writeVInt(delta << 1); |
| } else { |
| posOut.writeVInt(delta << 1 | 1); |
| posOut.writeVInt(length); |
| lastOffsetLength = length; |
| } |
| } |
| } |
| |
| if (fieldHasPayloads) { |
| assert payloadBytesReadUpto == payloadByteUpto; |
| payloadByteUpto = 0; |
| } |
| } |
| // if (DEBUG) { |
| // System.out.println(" totalTermFreq=" + state.totalTermFreq + " lastPosBlockOffset=" + lastPosBlockOffset); |
| // } |
| } else { |
| lastPosBlockOffset = -1; |
| } |
| |
| long skipOffset; |
| if (docCount > BLOCK_SIZE) { |
| skipOffset = skipWriter.writeSkip(docOut) - docStartFP; |
| |
| // if (DEBUG) { |
| // System.out.println("skip packet " + (docOut.getFilePointer() - (docStartFP + skipOffset)) + " bytes"); |
| // } |
| } else { |
| skipOffset = -1; |
| // if (DEBUG) { |
| // System.out.println(" no skip: docCount=" + docCount); |
| // } |
| } |
| // if (DEBUG) { |
| // System.out.println(" payStartFP=" + payStartFP); |
| // } |
| state.docStartFP = docStartFP; |
| state.posStartFP = posStartFP; |
| state.payStartFP = payStartFP; |
| state.singletonDocID = singletonDocID; |
| state.skipOffset = skipOffset; |
| state.lastPosBlockOffset = lastPosBlockOffset; |
| docBufferUpto = 0; |
| posBufferUpto = 0; |
| lastDocID = 0; |
| docCount = 0; |
| } |
| |
| @Override |
| public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException { |
| IntBlockTermState state = (IntBlockTermState)_state; |
| if (absolute) { |
| lastState = emptyState; |
| } |
| longs[0] = state.docStartFP - lastState.docStartFP; |
| if (fieldHasPositions) { |
| longs[1] = state.posStartFP - lastState.posStartFP; |
| if (fieldHasPayloads || fieldHasOffsets) { |
| longs[2] = state.payStartFP - lastState.payStartFP; |
| } |
| } |
| if (state.singletonDocID != -1) { |
| out.writeVInt(state.singletonDocID); |
| } |
| if (fieldHasPositions) { |
| if (state.lastPosBlockOffset != -1) { |
| out.writeVLong(state.lastPosBlockOffset); |
| } |
| } |
| if (state.skipOffset != -1) { |
| out.writeVLong(state.skipOffset); |
| } |
| lastState = state; |
| } |
| |
| @Override |
| public void close() throws IOException { |
| IOUtils.close(docOut, posOut, payOut); |
| } |
| } |