| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.codecs.lucene50; |
| |
| |
| import java.io.IOException; |
| import java.util.Arrays; |
| import java.util.Collection; |
| |
| import org.apache.lucene.codecs.CompetitiveImpactAccumulator; |
| import org.apache.lucene.codecs.MultiLevelSkipListWriter; |
| import org.apache.lucene.index.Impact; |
| import org.apache.lucene.store.IndexOutput; |
| import org.apache.lucene.store.RAMOutputStream; |
| |
| /** |
| * Write skip lists with multiple levels, and support skip within block ints. |
| * |
| * Assume that docFreq = 28, skipInterval = blockSize = 12 |
| * |
| * | block#0 | | block#1 | |vInts| |
| * d d d d d d d d d d d d d d d d d d d d d d d d d d d d (posting list) |
| * ^ ^ (level 0 skip point) |
| * |
| * Note that skipWriter will ignore first document in block#0, since |
| * it is useless as a skip point. Also, we'll never skip into the vInts |
| * block, only record skip data at the start its start point(if it exist). |
| * |
| * For each skip point, we will record: |
| * 1. docID in former position, i.e. for position 12, record docID[11], etc. |
| * 2. its related file points(position, payload), |
| * 3. related numbers or uptos(position, payload). |
| * 4. start offset. |
| * |
| */ |
| final class Lucene50SkipWriter extends MultiLevelSkipListWriter { |
| private int[] lastSkipDoc; |
| private long[] lastSkipDocPointer; |
| private long[] lastSkipPosPointer; |
| private long[] lastSkipPayPointer; |
| private int[] lastPayloadByteUpto; |
| |
| private final IndexOutput docOut; |
| private final IndexOutput posOut; |
| private final IndexOutput payOut; |
| |
| private int curDoc; |
| private long curDocPointer; |
| private long curPosPointer; |
| private long curPayPointer; |
| private int curPosBufferUpto; |
| private int curPayloadByteUpto; |
| private CompetitiveImpactAccumulator[] curCompetitiveFreqNorms; |
| private boolean fieldHasPositions; |
| private boolean fieldHasOffsets; |
| private boolean fieldHasPayloads; |
| |
| public Lucene50SkipWriter(int maxSkipLevels, int blockSize, int docCount, IndexOutput docOut, IndexOutput posOut, IndexOutput payOut) { |
| super(blockSize, 8, maxSkipLevels, docCount); |
| this.docOut = docOut; |
| this.posOut = posOut; |
| this.payOut = payOut; |
| |
| lastSkipDoc = new int[maxSkipLevels]; |
| lastSkipDocPointer = new long[maxSkipLevels]; |
| if (posOut != null) { |
| lastSkipPosPointer = new long[maxSkipLevels]; |
| if (payOut != null) { |
| lastSkipPayPointer = new long[maxSkipLevels]; |
| } |
| lastPayloadByteUpto = new int[maxSkipLevels]; |
| } |
| curCompetitiveFreqNorms = new CompetitiveImpactAccumulator[maxSkipLevels]; |
| for (int i = 0; i < maxSkipLevels; ++i) { |
| curCompetitiveFreqNorms[i] = new CompetitiveImpactAccumulator(); |
| } |
| } |
| |
| public void setField(boolean fieldHasPositions, boolean fieldHasOffsets, boolean fieldHasPayloads) { |
| this.fieldHasPositions = fieldHasPositions; |
| this.fieldHasOffsets = fieldHasOffsets; |
| this.fieldHasPayloads = fieldHasPayloads; |
| } |
| |
| // tricky: we only skip data for blocks (terms with more than 128 docs), but re-init'ing the skipper |
| // is pretty slow for rare terms in large segments as we have to fill O(log #docs in segment) of junk. |
| // this is the vast majority of terms (worst case: ID field or similar). so in resetSkip() we save |
| // away the previous pointers, and lazy-init only if we need to buffer skip data for the term. |
| private boolean initialized; |
| long lastDocFP; |
| long lastPosFP; |
| long lastPayFP; |
| |
| @Override |
| public void resetSkip() { |
| lastDocFP = docOut.getFilePointer(); |
| if (fieldHasPositions) { |
| lastPosFP = posOut.getFilePointer(); |
| if (fieldHasOffsets || fieldHasPayloads) { |
| lastPayFP = payOut.getFilePointer(); |
| } |
| } |
| if (initialized) { |
| for (CompetitiveImpactAccumulator acc : curCompetitiveFreqNorms) { |
| acc.clear(); |
| } |
| } |
| initialized = false; |
| } |
| |
| private void initSkip() { |
| if (!initialized) { |
| super.resetSkip(); |
| Arrays.fill(lastSkipDoc, 0); |
| Arrays.fill(lastSkipDocPointer, lastDocFP); |
| if (fieldHasPositions) { |
| Arrays.fill(lastSkipPosPointer, lastPosFP); |
| if (fieldHasPayloads) { |
| Arrays.fill(lastPayloadByteUpto, 0); |
| } |
| if (fieldHasOffsets || fieldHasPayloads) { |
| Arrays.fill(lastSkipPayPointer, lastPayFP); |
| } |
| } |
| // sets of competitive freq,norm pairs should be empty at this point |
| assert Arrays.stream(curCompetitiveFreqNorms) |
| .map(CompetitiveImpactAccumulator::getCompetitiveFreqNormPairs) |
| .mapToInt(Collection::size) |
| .sum() == 0; |
| initialized = true; |
| } |
| } |
| |
| /** |
| * Sets the values for the current skip data. |
| */ |
| public void bufferSkip(int doc, CompetitiveImpactAccumulator competitiveFreqNorms, |
| int numDocs, long posFP, long payFP, int posBufferUpto, int payloadByteUpto) throws IOException { |
| initSkip(); |
| this.curDoc = doc; |
| this.curDocPointer = docOut.getFilePointer(); |
| this.curPosPointer = posFP; |
| this.curPayPointer = payFP; |
| this.curPosBufferUpto = posBufferUpto; |
| this.curPayloadByteUpto = payloadByteUpto; |
| this.curCompetitiveFreqNorms[0].addAll(competitiveFreqNorms); |
| bufferSkip(numDocs); |
| } |
| |
| private final RAMOutputStream freqNormOut = new RAMOutputStream(); |
| |
| @Override |
| protected void writeSkipData(int level, IndexOutput skipBuffer) throws IOException { |
| |
| int delta = curDoc - lastSkipDoc[level]; |
| |
| skipBuffer.writeVInt(delta); |
| lastSkipDoc[level] = curDoc; |
| |
| skipBuffer.writeVLong(curDocPointer - lastSkipDocPointer[level]); |
| lastSkipDocPointer[level] = curDocPointer; |
| |
| if (fieldHasPositions) { |
| |
| skipBuffer.writeVLong(curPosPointer - lastSkipPosPointer[level]); |
| lastSkipPosPointer[level] = curPosPointer; |
| skipBuffer.writeVInt(curPosBufferUpto); |
| |
| if (fieldHasPayloads) { |
| skipBuffer.writeVInt(curPayloadByteUpto); |
| } |
| |
| if (fieldHasOffsets || fieldHasPayloads) { |
| skipBuffer.writeVLong(curPayPointer - lastSkipPayPointer[level]); |
| lastSkipPayPointer[level] = curPayPointer; |
| } |
| } |
| |
| CompetitiveImpactAccumulator competitiveFreqNorms = curCompetitiveFreqNorms[level]; |
| assert competitiveFreqNorms.getCompetitiveFreqNormPairs().size() > 0; |
| if (level + 1 < numberOfSkipLevels) { |
| curCompetitiveFreqNorms[level + 1].addAll(competitiveFreqNorms); |
| } |
| writeImpacts(competitiveFreqNorms, freqNormOut); |
| skipBuffer.writeVInt(Math.toIntExact(freqNormOut.getFilePointer())); |
| freqNormOut.writeTo(skipBuffer); |
| freqNormOut.reset(); |
| competitiveFreqNorms.clear(); |
| } |
| |
| static void writeImpacts(CompetitiveImpactAccumulator acc, IndexOutput out) throws IOException { |
| Collection<Impact> impacts = acc.getCompetitiveFreqNormPairs(); |
| Impact previous = new Impact(0, 0); |
| for (Impact impact : impacts) { |
| assert impact.freq > previous.freq; |
| assert Long.compareUnsigned(impact.norm, previous.norm) > 0; |
| int freqDelta = impact.freq - previous.freq - 1; |
| long normDelta = impact.norm - previous.norm - 1; |
| if (normDelta == 0) { |
| // most of time, norm only increases by 1, so we can fold everything in a single byte |
| out.writeVInt(freqDelta << 1); |
| } else { |
| out.writeVInt((freqDelta << 1) | 1); |
| out.writeZLong(normDelta); |
| } |
| previous = impact; |
| } |
| } |
| } |