/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene84;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;

import org.apache.lucene.codecs.CompetitiveImpactAccumulator;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
import org.apache.lucene.index.Impact;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;

/**
 * Write skip lists with multiple levels, and support skipping within a block of ints.
 *
 * Assume that docFreq = 28, skipInterval = blockSize = 12:
 *
 *  |       block#0       | |      block#1       | |vInts|
 *  d d d d d d d d d d d d d d d d d d d d d d d d d d d d (posting list)
 *                          ^                     ^         (level 0 skip point)
 *
 * Note that skipWriter will ignore the first document in block#0, since
 * it is useless as a skip point. Also, we'll never skip into the vInts
 * block; we only record skip data at its start point (if it exists).
 *
 * For each skip point, we will record:
 * 1. the docID just before the skip point, i.e. for the skip point at position 12, record docID[11],
 * 2. its related file pointers (doc, position, payload),
 * 3. related in-block offsets (position buffer upto, payload byte upto),
 * 4. the start offset.
 *
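 * As a worked example of the diagram above: level 0 records skip entries after
 * block#0 and block#1, carrying docID[11] and docID[23] respectively; the
 * trailing 4 documents fall into the vInt tail and are never skip targets.
 *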
*/
final class Lucene84SkipWriter extends MultiLevelSkipListWriter {
  // last skip entry written at each level, used to delta-encode the next entry
  private int[] lastSkipDoc;
  private long[] lastSkipDocPointer;
  private long[] lastSkipPosPointer;
  private long[] lastSkipPayPointer;
  private int[] lastPayloadByteUpto;

  private final IndexOutput docOut;
  private final IndexOutput posOut;
  private final IndexOutput payOut;

  // state of the skip point that is currently being buffered
  private int curDoc;
  private long curDocPointer;
  private long curPosPointer;
  private long curPayPointer;
  private int curPosBufferUpto;
  private int curPayloadByteUpto;
  private CompetitiveImpactAccumulator[] curCompetitiveFreqNorms;

  private boolean fieldHasPositions;
  private boolean fieldHasOffsets;
  private boolean fieldHasPayloads;

  public Lucene84SkipWriter(int maxSkipLevels, int blockSize, int docCount,
      IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) {
    // skipInterval = blockSize; the skip multiplier of 8 means each higher
    // level covers 8x more documents per entry than the level below it
    super(blockSize, 8, maxSkipLevels, docCount);
    this.docOut = docOut;
    this.posOut = posOut;
    this.payOut = payOut;

    lastSkipDoc = new int[maxSkipLevels];
    lastSkipDocPointer = new long[maxSkipLevels];
    if (posOut != null) {
      lastSkipPosPointer = new long[maxSkipLevels];
      if (payOut != null) {
        lastSkipPayPointer = new long[maxSkipLevels];
      }
      lastPayloadByteUpto = new int[maxSkipLevels];
    }
    curCompetitiveFreqNorms = new CompetitiveImpactAccumulator[maxSkipLevels];
    for (int i = 0; i < maxSkipLevels; ++i) {
      curCompetitiveFreqNorms[i] = new CompetitiveImpactAccumulator();
    }
  }
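
  // A hedged construction sketch (constant names assumed from the postings
  // format, not defined in this file): the postings writer creates this
  // skipper roughly as
  //
  //   new Lucene84SkipWriter(MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(),
  //       docOut, posOut, payOut);
  //
  // where posOut/payOut may be null when the field omits positions/payloads.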

  public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) {
    this.fieldHasPositions = fieldHasPositions;
    this.fieldHasOffsets = fieldHasOffsets;
    this.fieldHasPayloads = fieldHasPayloads;
  }

  // tricky: we only write skip data for terms that span full blocks (more than 128 docs),
  // but re-init'ing the skipper is pretty slow for rare terms in large segments, as we have
  // to fill O(log #docs in segment) of junk. Rare terms are the vast majority of terms
  // (worst case: an ID field or similar), so in resetSkip() we just save away the previous
  // pointers, and lazy-init only if we actually need to buffer skip data for the term.
  private boolean initialized;
  long lastDocFP;
  long lastPosFP;
  long lastPayFP;

  @Override
  public void resetSkip() {
    lastDocFP = docOut.getFilePointer();
    if (fieldHasPositions) {
      lastPosFP = posOut.getFilePointer();
      if (fieldHasOffsets || fieldHasPayloads) {
        lastPayFP = payOut.getFilePointer();
      }
    }
    if (initialized) {
      for (CompetitiveImpactAccumulator acc : curCompetitiveFreqNorms) {
        acc.clear();
      }
    }
    initialized = false;
  }
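
  // A hedged sketch of the expected call sequence, as driven by the postings
  // writer (an assumption based on how MultiLevelSkipListWriter is typically
  // used, not something this file enforces):
  //
  //   skipWriter.resetSkip();                // once per term, cheap
  //   for (each full block of docs) {
  //     skipWriter.bufferSkip(...);          // lazily calls initSkip()
  //   }
  //   long skipOffset = skipWriter.writeSkip(docOut);  // inherited from the superclass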

  private void initSkip() {
    if (!initialized) {
      super.resetSkip();
      Arrays.fill(lastSkipDoc, 0);
      Arrays.fill(lastSkipDocPointer, lastDocFP);
      if (fieldHasPositions) {
        Arrays.fill(lastSkipPosPointer, lastPosFP);
        if (fieldHasPayloads) {
          Arrays.fill(lastPayloadByteUpto, 0);
        }
        if (fieldHasOffsets || fieldHasPayloads) {
          Arrays.fill(lastSkipPayPointer, lastPayFP);
        }
      }
      // sets of competitive freq,norm pairs should be empty at this point
      assert Arrays.stream(curCompetitiveFreqNorms)
          .map(CompetitiveImpactAccumulator::getCompetitiveFreqNormPairs)
          .mapToInt(Collection::size)
          .sum() == 0;
      initialized = true;
    }
  }

  /**
   * Sets the values for the current skip data.
   */
  public void bufferSkip(int doc, CompetitiveImpactAccumulator competitiveFreqNorms,
      int numDocs, long posFP, long payFP, int posBufferUpto, int payloadByteUpto) throws IOException {
    initSkip();
    this.curDoc = doc;
    this.curDocPointer = docOut.getFilePointer();
    this.curPosPointer = posFP;
    this.curPayPointer = payFP;
    this.curPosBufferUpto = posBufferUpto;
    this.curPayloadByteUpto = payloadByteUpto;
    this.curCompetitiveFreqNorms[0].addAll(competitiveFreqNorms);
    bufferSkip(numDocs);
  }
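
  // A hedged call-site sketch (variable names hypothetical, modeled on the
  // postings writer): once a full block has been flushed and the last document
  // of that block is known, the caller buffers a skip entry such as
  //
  //   skipWriter.bufferSkip(lastBlockDocID, competitiveFreqNormAccumulator, docCount,
  //       lastBlockPosFP, lastBlockPayFP, lastBlockPosBufferUpto, lastBlockPayloadByteUpto);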

  // scratch buffer: impacts are serialized here first so they can be length-prefixed
  private final ByteBuffersDataOutput freqNormOut = ByteBuffersDataOutput.newResettableInstance();

  @Override
  protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException {
    // docIDs and file pointers are delta-encoded against the previous skip entry at this level
    int delta = curDoc - lastSkipDoc[level];
    skipBuffer.writeVInt(delta);
    lastSkipDoc[level] = curDoc;

    skipBuffer.writeVLong(curDocPointer - lastSkipDocPointer[level]);
    lastSkipDocPointer[level] = curDocPointer;

    if (fieldHasPositions) {
      skipBuffer.writeVLong(curPosPointer - lastSkipPosPointer[level]);
      lastSkipPosPointer[level] = curPosPointer;
      skipBuffer.writeVInt(curPosBufferUpto);

      if (fieldHasPayloads) {
        skipBuffer.writeVInt(curPayloadByteUpto);
      }

      if (fieldHasOffsets || fieldHasPayloads) {
        skipBuffer.writeVLong(curPayPointer - lastSkipPayPointer[level]);
        lastSkipPayPointer[level] = curPayPointer;
      }
    }

    CompetitiveImpactAccumulator competitiveFreqNorms = curCompetitiveFreqNorms[level];
    assert competitiveFreqNorms.getCompetitiveFreqNormPairs().size() > 0;
    // impacts that are competitive at this level also compete at the coarser level above
    if (level + 1 < numberOfSkipLevels) {
      curCompetitiveFreqNorms[level + 1].addAll(competitiveFreqNorms);
    }
    writeImpacts(competitiveFreqNorms, freqNormOut);
    skipBuffer.writeVInt(Math.toIntExact(freqNormOut.size()));
    freqNormOut.copyTo(skipBuffer);
    freqNormOut.reset();
    competitiveFreqNorms.clear();
  }
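
  // Worked example of the impact encoding implemented below (values assumed for
  // illustration): for competitive pairs (freq=3, norm=4) then (freq=7, norm=5),
  // the first pair gives freqDelta = 3 - 0 - 1 = 2 and normDelta = 4 - 0 - 1 = 3,
  // so we write vInt((2 << 1) | 1) = 5 followed by zLong(3); the second pair gives
  // freqDelta = 7 - 3 - 1 = 3 and normDelta = 5 - 4 - 1 = 0, which folds into the
  // single byte vInt(3 << 1) = 6 with no norm delta.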
  static void writeImpacts(CompetitiveImpactAccumulator acc, DataOutput out) throws IOException {
    Collection<Impact> impacts = acc.getCompetitiveFreqNormPairs();
    Impact previous = new Impact(0, 0);
    for (Impact impact : impacts) {
      assert impact.freq > previous.freq;
      assert Long.compareUnsigned(impact.norm, previous.norm) > 0;
      int freqDelta = impact.freq - previous.freq - 1;
      long normDelta = impact.norm - previous.norm - 1;
      if (normDelta == 0) {
        // most of the time the norm only increases by 1, so we can fold everything into a single byte
        out.writeVInt(freqDelta << 1);
      } else {
        out.writeVInt((freqDelta << 1) | 1);
        out.writeZLong(normDelta);
      }
      previous = impact;
    }
  }
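
  // A hedged decode sketch that mirrors the encoder above (the real decoding
  // lives in the skip reader; this is illustrative only, with a hypothetical
  // loop condition):
  //
  //   int freq = 0;
  //   long norm = 0;
  //   while (more bytes remain) {
  //     int token = in.readVInt();
  //     freq += 1 + (token >>> 1);
  //     norm += 1 + (((token & 1) == 0) ? 0 : in.readZLong());
  //   }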
}