blob: d4c470c40a01345c1b05ce6af193c40e3f461c92 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.backward_codecs.lucene40.blocktree;
import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.FST;
// TODO: can we share this with the frame in STE?
final class IntersectTermsEnumFrame {
final int ord;
long fp;
long fpOrig;
long fpEnd;
long lastSubFP;
// private static boolean DEBUG = IntersectTermsEnum.DEBUG;
// State in automaton
int state;
// State just before the last label
int lastState;
int metaDataUpto;
byte[] suffixBytes = new byte[128];
final ByteArrayDataInput suffixesReader = new ByteArrayDataInput();
byte[] suffixLengthBytes;
final ByteArrayDataInput suffixLengthsReader;
byte[] statBytes = new byte[64];
int statsSingletonRunLength = 0;
final ByteArrayDataInput statsReader = new ByteArrayDataInput();
byte[] floorData = new byte[32];
final ByteArrayDataInput floorDataReader = new ByteArrayDataInput();
// Length of prefix shared by all terms in this block
int prefix;
// Number of entries (term or sub-block) in this block
int entCount;
// Which term we will next read
int nextEnt;
// True if this block is either not a floor block,
// or, it's the last sub-block of a floor block
boolean isLastInFloor;
// True if all entries are terms
boolean isLeafBlock;
int numFollowFloorBlocks;
int nextFloorLabel;
final Transition transition = new Transition();
int transitionIndex;
int transitionCount;
FST.Arc<BytesRef> arc;
final BlockTermState termState;
// metadata buffer
byte[] bytes = new byte[32];
final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
// Cumulative output so far
BytesRef outputPrefix;
int startBytePos;
int suffix;
private final IntersectTermsEnum ite;
private final int version;
public IntersectTermsEnumFrame(IntersectTermsEnum ite, int ord) throws IOException {
this.ite = ite;
this.ord = ord;
this.termState = ite.fr.parent.postingsReader.newTermState();
this.termState.totalTermFreq = -1;
this.version = ite.fr.parent.version;
if (version >= Lucene40BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
suffixLengthBytes = new byte[32];
suffixLengthsReader = new ByteArrayDataInput();
} else {
suffixLengthBytes = null;
suffixLengthsReader = suffixesReader;
}
}
void loadNextFloorBlock() throws IOException {
assert numFollowFloorBlocks > 0 : "nextFloorLabel=" + nextFloorLabel;
do {
fp = fpOrig + (floorDataReader.readVLong() >>> 1);
numFollowFloorBlocks--;
if (numFollowFloorBlocks != 0) {
nextFloorLabel = floorDataReader.readByte() & 0xff;
} else {
nextFloorLabel = 256;
}
} while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min);
load(null);
}
public void setState(int state) {
this.state = state;
transitionIndex = 0;
transitionCount = ite.automaton.getNumTransitions(state);
if (transitionCount != 0) {
ite.automaton.initTransition(state, transition);
ite.automaton.getNextTransition(transition);
} else {
// Must set min to -1 so the "label < min" check never falsely triggers:
transition.min = -1;
// Must set max to -1 so we immediately realize we need to step to the next transition and
// then pop this frame:
transition.max = -1;
}
}
void load(BytesRef frameIndexData) throws IOException {
if (frameIndexData != null) {
floorDataReader.reset(frameIndexData.bytes, frameIndexData.offset, frameIndexData.length);
// Skip first long -- has redundant fp, hasTerms
// flag, isFloor flag
final long code = floorDataReader.readVLong();
if ((code & Lucene40BlockTreeTermsReader.OUTPUT_FLAG_IS_FLOOR) != 0) {
// Floor frame
numFollowFloorBlocks = floorDataReader.readVInt();
nextFloorLabel = floorDataReader.readByte() & 0xff;
// If current state is not accept, and has transitions, we must process
// first block in case it has empty suffix:
if (ite.runAutomaton.isAccept(state) == false && transitionCount != 0) {
// Maybe skip floor blocks:
assert transitionIndex == 0 : "transitionIndex=" + transitionIndex;
while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min) {
fp = fpOrig + (floorDataReader.readVLong() >>> 1);
numFollowFloorBlocks--;
if (numFollowFloorBlocks != 0) {
nextFloorLabel = floorDataReader.readByte() & 0xff;
} else {
nextFloorLabel = 256;
}
}
}
}
}
ite.in.seek(fp);
int code = ite.in.readVInt();
entCount = code >>> 1;
assert entCount > 0;
isLastInFloor = (code & 1) != 0;
// term suffixes:
if (version >= Lucene40BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
final long codeL = ite.in.readVLong();
isLeafBlock = (codeL & 0x04) != 0;
final int numSuffixBytes = (int) (codeL >>> 3);
if (suffixBytes.length < numSuffixBytes) {
suffixBytes = new byte[ArrayUtil.oversize(numSuffixBytes, 1)];
}
final CompressionAlgorithm compressionAlg;
try {
compressionAlg = CompressionAlgorithm.byCode((int) codeL & 0x03);
} catch (IllegalArgumentException e) {
throw new CorruptIndexException(e.getMessage(), ite.in, e);
}
compressionAlg.read(ite.in, suffixBytes, numSuffixBytes);
suffixesReader.reset(suffixBytes, 0, numSuffixBytes);
int numSuffixLengthBytes = ite.in.readVInt();
final boolean allEqual = (numSuffixLengthBytes & 0x01) != 0;
numSuffixLengthBytes >>>= 1;
if (suffixLengthBytes.length < numSuffixLengthBytes) {
suffixLengthBytes = new byte[ArrayUtil.oversize(numSuffixLengthBytes, 1)];
}
if (allEqual) {
Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ite.in.readByte());
} else {
ite.in.readBytes(suffixLengthBytes, 0, numSuffixLengthBytes);
}
suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes);
} else {
code = ite.in.readVInt();
isLeafBlock = (code & 1) != 0;
int numBytes = code >>> 1;
if (suffixBytes.length < numBytes) {
suffixBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ite.in.readBytes(suffixBytes, 0, numBytes);
suffixesReader.reset(suffixBytes, 0, numBytes);
}
// stats
int numBytes = ite.in.readVInt();
if (statBytes.length < numBytes) {
statBytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ite.in.readBytes(statBytes, 0, numBytes);
statsReader.reset(statBytes, 0, numBytes);
statsSingletonRunLength = 0;
metaDataUpto = 0;
termState.termBlockOrd = 0;
nextEnt = 0;
// metadata
numBytes = ite.in.readVInt();
if (bytes.length < numBytes) {
bytes = new byte[ArrayUtil.oversize(numBytes, 1)];
}
ite.in.readBytes(bytes, 0, numBytes);
bytesReader.reset(bytes, 0, numBytes);
if (!isLastInFloor) {
// Sub-blocks of a single floor block are always
// written one after another -- tail recurse:
fpEnd = ite.in.getFilePointer();
}
}
// TODO: maybe add scanToLabel; should give perf boost
// Decodes next entry; returns true if it's a sub-block
public boolean next() {
if (isLeafBlock) {
nextLeaf();
return false;
} else {
return nextNonLeaf();
}
}
public void nextLeaf() {
assert nextEnt != -1 && nextEnt < entCount
: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
suffix = suffixLengthsReader.readVInt();
startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
}
public boolean nextNonLeaf() {
assert nextEnt != -1 && nextEnt < entCount
: "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp;
nextEnt++;
final int code = suffixLengthsReader.readVInt();
suffix = code >>> 1;
startBytePos = suffixesReader.getPosition();
suffixesReader.skipBytes(suffix);
if ((code & 1) == 0) {
// A normal term
termState.termBlockOrd++;
return false;
} else {
// A sub-block; make sub-FP absolute:
lastSubFP = fp - suffixLengthsReader.readVLong();
return true;
}
}
public int getTermBlockOrd() {
return isLeafBlock ? nextEnt : termState.termBlockOrd;
}
public void decodeMetaData() throws IOException {
// lazily catch up on metadata decode:
final int limit = getTermBlockOrd();
boolean absolute = metaDataUpto == 0;
assert limit > 0;
// TODO: better API would be "jump straight to term=N"???
while (metaDataUpto < limit) {
// TODO: we could make "tiers" of metadata, ie,
// decode docFreq/totalTF but don't decode postings
// metadata; this way caller could get
// docFreq/totalTF w/o paying decode cost for
// postings
// TODO: if docFreq were bulk decoded we could
// just skipN here:
// stats
if (version >= Lucene40BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES) {
if (statsSingletonRunLength > 0) {
termState.docFreq = 1;
termState.totalTermFreq = 1;
statsSingletonRunLength--;
} else {
int token = statsReader.readVInt();
if (version >= Lucene40BlockTreeTermsReader.VERSION_COMPRESSED_SUFFIXES
&& (token & 1) == 1) {
termState.docFreq = 1;
termState.totalTermFreq = 1;
statsSingletonRunLength = token >>> 1;
} else {
termState.docFreq = token >>> 1;
if (ite.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
termState.totalTermFreq = termState.docFreq;
} else {
termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
}
}
}
} else {
termState.docFreq = statsReader.readVInt();
// if (DEBUG) System.out.println(" dF=" + state.docFreq);
if (ite.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) {
termState.totalTermFreq = termState.docFreq; // all postings have freq=1
} else {
termState.totalTermFreq = termState.docFreq + statsReader.readVLong();
// if (DEBUG) System.out.println(" totTF=" + state.totalTermFreq);
}
}
// metadata
ite.fr.parent.postingsReader.decodeTerm(bytesReader, ite.fr.fieldInfo, termState, absolute);
metaDataUpto++;
absolute = false;
}
termState.termBlockOrd = metaDataUpto;
}
}