| package org.apache.lucene.codecs.idversion; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| |
| import org.apache.lucene.codecs.FieldsConsumer; |
| import org.apache.lucene.codecs.FieldsProducer; |
| import org.apache.lucene.codecs.PostingsFormat; |
| import org.apache.lucene.codecs.PostingsReaderBase; |
| import org.apache.lucene.codecs.PostingsWriterBase; |
| import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter; |
| import org.apache.lucene.index.SegmentReadState; |
| import org.apache.lucene.index.SegmentWriteState; |
| import org.apache.lucene.search.LiveFieldValues; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IOUtils; |
| |
| /** A PostingsFormat optimized for primary-key (ID) fields that also |
| * record a version (long) for each ID, delivered as a payload |
| * created by {@link #longToBytes} during indexing. At search time, |
| * the TermsEnum implementation {@link IDVersionSegmentTermsEnum} |
| * enables fast (using only the terms index when possible) lookup for |
| * whether a given ID was previously indexed with version > N (see |
| * {@link IDVersionSegmentTermsEnum#seekExact(BytesRef,long)}. |
| * |
| * <p>This is most effective if the app assigns monotonically |
| * increasing global version to each indexed doc. Then, during |
| * indexing, use {@link |
| * IDVersionSegmentTermsEnum#seekExact(BytesRef,long)} (along with |
| * {@link LiveFieldValues}) to decide whether the document you are |
| * about to index was already indexed with a higher version, and skip |
| * it if so. |
| * |
| * <p>The field is effectively indexed as DOCS_ONLY and the docID is |
| * pulsed into the terms dictionary, but the user must feed in the |
| * version as a payload on the first token. |
| * |
| * <p>NOTE: term vectors cannot be indexed with this field (not that |
| * you should really ever want to do this). |
| * |
| * @lucene.experimental */ |
| |
| public class IDVersionPostingsFormat extends PostingsFormat { |
| |
| /** version must be >= this. */ |
| public static final long MIN_VERSION = 0; |
| |
| // TODO: we could delta encode instead, and keep the last bit: |
| |
| /** version must be <= this, because we encode with ZigZag. */ |
| public static final long MAX_VERSION = 0x3fffffffffffffffL; |
| |
| private final int minTermsInBlock; |
| private final int maxTermsInBlock; |
| |
| public IDVersionPostingsFormat() { |
| this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); |
| } |
| |
| public IDVersionPostingsFormat(int minTermsInBlock, int maxTermsInBlock) { |
| super("IDVersion"); |
| this.minTermsInBlock = minTermsInBlock; |
| this.maxTermsInBlock = maxTermsInBlock; |
| } |
| |
| @Override |
| public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { |
| PostingsWriterBase postingsWriter = new IDVersionPostingsWriter(state.liveDocs); |
| boolean success = false; |
| try { |
| FieldsConsumer ret = new VersionBlockTreeTermsWriter(state, |
| postingsWriter, |
| minTermsInBlock, |
| maxTermsInBlock); |
| success = true; |
| return ret; |
| } finally { |
| if (!success) { |
| IOUtils.closeWhileHandlingException(postingsWriter); |
| } |
| } |
| } |
| |
| @Override |
| public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { |
| PostingsReaderBase postingsReader = new IDVersionPostingsReader(); |
| boolean success = false; |
| try { |
| FieldsProducer ret = new VersionBlockTreeTermsReader(postingsReader, state); |
| success = true; |
| return ret; |
| } finally { |
| if (!success) { |
| IOUtils.closeWhileHandlingException(postingsReader); |
| } |
| } |
| } |
| |
| public static long bytesToLong(BytesRef bytes) { |
| return ((bytes.bytes[bytes.offset]&0xFFL) << 56) | |
| ((bytes.bytes[bytes.offset+1]&0xFFL) << 48) | |
| ((bytes.bytes[bytes.offset+2]&0xFFL) << 40) | |
| ((bytes.bytes[bytes.offset+3]&0xFFL) << 32) | |
| ((bytes.bytes[bytes.offset+4]&0xFFL) << 24) | |
| ((bytes.bytes[bytes.offset+5]&0xFFL) << 16) | |
| ((bytes.bytes[bytes.offset+6]&0xFFL) << 8) | |
| (bytes.bytes[bytes.offset+7]&0xFFL); |
| } |
| |
| public static void longToBytes(long v, BytesRef bytes) { |
| if (v > MAX_VERSION || v < MIN_VERSION) { |
| throw new IllegalArgumentException("version must be >= MIN_VERSION=" + MIN_VERSION + " and <= MAX_VERSION=" + MAX_VERSION + " (got: " + v + ")"); |
| } |
| bytes.offset = 0; |
| bytes.length = 8; |
| bytes.bytes[0] = (byte) (v >> 56); |
| bytes.bytes[1] = (byte) (v >> 48); |
| bytes.bytes[2] = (byte) (v >> 40); |
| bytes.bytes[3] = (byte) (v >> 32); |
| bytes.bytes[4] = (byte) (v >> 24); |
| bytes.bytes[5] = (byte) (v >> 16); |
| bytes.bytes[6] = (byte) (v >> 8); |
| bytes.bytes[7] = (byte) v; |
| assert bytesToLong(bytes) == v: bytesToLong(bytes) + " vs " + v + " bytes=" + bytes; |
| } |
| } |