lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockLine.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.lucene.codecs.uniformsplit;

 import java.io.IOException;
 import org.apache.lucene.codecs.BlockTermState;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.RamUsageEstimator;

 /**
  * One term block line.
  *
  * <p>Contains a term and its details as a {@link BlockTermState}.
  *
  * <p>The line is written to the {@link UniformSplitPostingsFormat#TERMS_BLOCKS_EXTENSION block
  * file} in two parts. The first part is the term followed by an offset to the details region. The
  * second part is the term {@link BlockTermState}, written in the details region, after all the
  * terms of the block.
  *
  * <p>The separate details region allows fast scan of the terms without having to decode the details
  * for each term. At read time, the {@link BlockLine.Serializer#readLine} only reads the term and
  * its offset to the details. The corresponding {@link BlockTermState} is decoded on demand in the
  * {@link BlockReader} (see {@link BlockReader#readTermStateIfNotRead}).
  *
  * @lucene.experimental
  */
 public class BlockLine implements Accountable {

   private static final long BASE_RAM_USAGE =
       RamUsageEstimator.shallowSizeOfInstance(BlockLine.class);

   protected TermBytes termBytes;
   protected int termStateRelativeOffset;

   /** Only used for writing. */
   protected final BlockTermState termState;

   /** Constructor used for writing a {@link BlockLine}. */
   protected BlockLine(TermBytes termBytes, BlockTermState termState) {
     this(termBytes, -1, termState);
   }

   /** Constructor used for reading a {@link BlockLine}. */
   protected BlockLine(TermBytes termBytes, int termStateRelativeOffset) {
     this(termBytes, termStateRelativeOffset, null);
   }

   private BlockLine(TermBytes termBytes, int termStateRelativeOffset, BlockTermState termState) {
     reset(termBytes, termStateRelativeOffset);
     this.termState = termState;
   }

   /** Resets this {@link BlockLine} to reuse it when reading. */
   protected BlockLine reset(TermBytes termBytes, int termStateRelativeOffset) {
     assert termState == null;
     this.termBytes = termBytes;
     this.termStateRelativeOffset = termStateRelativeOffset;
     return this;
   }

   public TermBytes getTermBytes() {
     return termBytes;
   }

   /**
    * @return The offset of the {@link org.apache.lucene.index.TermState} bytes in the block,
    *     relatively to the term states base offset.
    */
   public int getTermStateRelativeOffset() {
     return termStateRelativeOffset;
   }

   @Override
   public long ramBytesUsed() {
     return BASE_RAM_USAGE + termBytes.ramBytesUsed() + RamUsageUtil.ramBytesUsed(termState);
   }

   /**
    * Reads/writes block lines with terms encoded incrementally inside a block. This class keeps a
    * state of the previous term read to decode the next term.
    */
   public static class Serializer implements Accountable {

     private static final long BASE_RAM_USAGE =
         RamUsageEstimator.shallowSizeOfInstance(Serializer.class);

     protected final BytesRef currentTerm;

     public Serializer() {
       currentTerm = new BytesRef(64);
     }

     /**
      * Reads the current line.
      *
      * @param isIncrementalEncodingSeed Whether the term is a seed of the incremental encoding.
      *     {@code true} for the first and middle term, {@code false} for other terms.
      * @param reuse A {@link BlockLine} instance to reuse; or null if none.
      */
     public BlockLine readLine(
         DataInput blockInput, boolean isIncrementalEncodingSeed, BlockLine reuse)
         throws IOException {
       int termStateRelativeOffset = blockInput.readVInt();
       if (termStateRelativeOffset < 0) {
         throw new CorruptIndexException(
             "Illegal termStateRelativeOffset= " + termStateRelativeOffset, blockInput);
       }
       return reuse == null
           ? new BlockLine(
               readIncrementallyEncodedTerm(blockInput, isIncrementalEncodingSeed, null),
               termStateRelativeOffset)
           : reuse.reset(
               readIncrementallyEncodedTerm(blockInput, isIncrementalEncodingSeed, reuse.termBytes),
               termStateRelativeOffset);
     }

     /**
      * Writes a line and its offset to the corresponding term state details in the details region.
      *
      * @param blockOutput The output pointing to the block terms region.
      * @param termStateRelativeOffset The offset to the corresponding term state details in the
      *     details region.
      * @param isIncrementalEncodingSeed Whether the term is a seed of the incremental encoding.
      *     {@code true} for the first and middle term, {@code false} for other terms.
      */
     public void writeLine(
         DataOutput blockOutput,
         BlockLine line,
         BlockLine previousLine,
         int termStateRelativeOffset,
         boolean isIncrementalEncodingSeed)
         throws IOException {
       blockOutput.writeVInt(termStateRelativeOffset);
       writeIncrementallyEncodedTerm(
           line.getTermBytes(),
           previousLine == null ? null : previousLine.getTermBytes(),
           isIncrementalEncodingSeed,
           blockOutput);
     }

     /**
      * Writes the term state details of a line in the details region.
      *
      * @param termStatesOutput The output pointing to the details region.
      */
     protected void writeLineTermState(
         DataOutput termStatesOutput,
         BlockLine line,
         FieldInfo fieldInfo,
         DeltaBaseTermStateSerializer encoder)
         throws IOException {
       assert line.termState != null;
       encoder.writeTermState(termStatesOutput, fieldInfo, line.termState);
     }

     protected void writeIncrementallyEncodedTerm(
         TermBytes termBytes,
         TermBytes previousTermBytes,
         boolean isIncrementalEncodingSeed,
         DataOutput blockOutput)
         throws IOException {
       BytesRef term = termBytes.getTerm();
       assert term.offset == 0;
       if (isIncrementalEncodingSeed) {
         // Mdp length is always 1 for an incremental encoding seed.
         blockOutput.writeVLong(term.length);
         blockOutput.writeBytes(term.bytes, 0, term.length);
         return;
       }
       if (term.length == 0) {
         // Empty term.
         blockOutput.writeVLong(0);
         return;
       }

       // For other lines we store:
       // - Mdp length.
       // - Suffix length.
       // - Suffix bytes.
       // Instead of writing mdp length and suffix length with 2 VInt, we can compress the storage
       // by merging them in a single VLong. The idea is to leverage the information we have about
       // the previous line. We know the previous line term length. And we know that
       // new line mdp length <= (previous line term length + 1)
       // So if numMdpBits = numBitsToEncode(previous line term length),
       // then we know we can encode (new line mdp length - 1) in numMdpBits.
       // Hence we encode (new line mdp length - 1) in the rightmost numMdpBits of the VLong.
       // And we encode new line suffix length in the remaining left bits of the VLong.
       // Most of the time both values will be encoded in a single byte.

       assert previousTermBytes != null;
       assert termBytes.getMdpLength() >= 1;

       int numMdpBits = numBitsToEncode(previousTermBytes.getTerm().length);
       assert numBitsToEncode(termBytes.getMdpLength() - 1) <= numMdpBits;

       long mdpAndSuffixLengths =
           (((long) termBytes.getSuffixLength()) << numMdpBits) | (termBytes.getMdpLength() - 1);
       assert mdpAndSuffixLengths != 0;
       blockOutput.writeVLong(mdpAndSuffixLengths);
       blockOutput.writeBytes(term.bytes, termBytes.getSuffixOffset(), termBytes.getSuffixLength());
     }

     protected TermBytes readIncrementallyEncodedTerm(
         DataInput blockInput, boolean isIncrementalEncodingSeed, TermBytes reuse)
         throws IOException {
       assert currentTerm.offset == 0;
       int mdpLength;
       if (isIncrementalEncodingSeed) {
         int length = (int) blockInput.readVLong();
         mdpLength = length == 0 ? 0 : 1;
         readBytes(blockInput, currentTerm, 0, length);
       } else {
         long mdpAndSuffixLengths = blockInput.readVLong();
         if (mdpAndSuffixLengths == 0) {
           // Empty term.
           mdpLength = 0;
           currentTerm.length = 0;
         } else {
           int numMdpBits = numBitsToEncode(currentTerm.length);
           mdpLength =
               (int) (mdpAndSuffixLengths & ((1 << numMdpBits) - 1))
                   + 1; // Get rightmost numMdpBits.
           int suffixLength = (int) (mdpAndSuffixLengths >>> numMdpBits); // Get remaining left bits.
           assert mdpLength >= 1;
           assert suffixLength >= 1;
           readBytes(blockInput, currentTerm, mdpLength - 1, suffixLength);
         }
       }
       return reuse == null
           ? new TermBytes(mdpLength, currentTerm)
           : reuse.reset(mdpLength, currentTerm);
     }

     /**
      * Reads {@code length} bytes from the given {@link DataInput} and stores them at {@code offset}
      * in {@code bytes.bytes}.
      */
     protected void readBytes(DataInput input, BytesRef bytes, int offset, int length)
         throws IOException {
       assert bytes.offset == 0;
       bytes.length = offset + length;
       bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length);
       input.readBytes(bytes.bytes, offset, length);
     }

     @Override
     public long ramBytesUsed() {
       return BASE_RAM_USAGE + RamUsageUtil.ramBytesUsed(currentTerm);
     }

     /**
      * Gets the number of bits required to encode the value of the provided int. Returns 0 for int
      * value 0. Equivalent to (log2(i) + 1).
      */
     protected static int numBitsToEncode(int i) {
       return 32 - Integer.numberOfLeadingZeros(i);
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.lucene.codecs.uniformsplit;

	import java.io.IOException;
	import org.apache.lucene.codecs.BlockTermState;
	import org.apache.lucene.index.CorruptIndexException;
	import org.apache.lucene.index.FieldInfo;
	import org.apache.lucene.store.DataInput;
	import org.apache.lucene.store.DataOutput;
	import org.apache.lucene.util.Accountable;
	import org.apache.lucene.util.ArrayUtil;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.RamUsageEstimator;

	/**
	* One term block line.
	*
	* <p>Contains a term and its details as a {@link BlockTermState}.
	*
	* <p>The line is written to the {@link UniformSplitPostingsFormat#TERMS_BLOCKS_EXTENSION block
	* file} in two parts. The first part is the term followed by an offset to the details region. The
	* second part is the term {@link BlockTermState}, written in the details region, after all the
	* terms of the block.
	*
	* <p>The separate details region allows fast scan of the terms without having to decode the details
	* for each term. At read time, the {@link BlockLine.Serializer#readLine} only reads the term and
	* its offset to the details. The corresponding {@link BlockTermState} is decoded on demand in the
	* {@link BlockReader} (see {@link BlockReader#readTermStateIfNotRead}).
	*
	* @lucene.experimental
	*/
	public class BlockLine implements Accountable {

	private static final long BASE_RAM_USAGE =
	RamUsageEstimator.shallowSizeOfInstance(BlockLine.class);

	protected TermBytes termBytes;
	protected int termStateRelativeOffset;

	/** Only used for writing. */
	protected final BlockTermState termState;

	/** Constructor used for writing a {@link BlockLine}. */
	protected BlockLine(TermBytes termBytes, BlockTermState termState) {
	this(termBytes, -1, termState);
	}

	/** Constructor used for reading a {@link BlockLine}. */
	protected BlockLine(TermBytes termBytes, int termStateRelativeOffset) {
	this(termBytes, termStateRelativeOffset, null);
	}

	private BlockLine(TermBytes termBytes, int termStateRelativeOffset, BlockTermState termState) {
	reset(termBytes, termStateRelativeOffset);
	this.termState = termState;
	}

	/** Resets this {@link BlockLine} to reuse it when reading. */
	protected BlockLine reset(TermBytes termBytes, int termStateRelativeOffset) {
	assert termState == null;
	this.termBytes = termBytes;
	this.termStateRelativeOffset = termStateRelativeOffset;
	return this;
	}

	public TermBytes getTermBytes() {
	return termBytes;
	}

	/**
	* @return The offset of the {@link org.apache.lucene.index.TermState} bytes in the block,
	* relatively to the term states base offset.
	*/
	public int getTermStateRelativeOffset() {
	return termStateRelativeOffset;
	}

	@Override
	public long ramBytesUsed() {
	return BASE_RAM_USAGE + termBytes.ramBytesUsed() + RamUsageUtil.ramBytesUsed(termState);
	}

	/**
	* Reads/writes block lines with terms encoded incrementally inside a block. This class keeps a
	* state of the previous term read to decode the next term.
	*/
	public static class Serializer implements Accountable {

	private static final long BASE_RAM_USAGE =
	RamUsageEstimator.shallowSizeOfInstance(Serializer.class);

	protected final BytesRef currentTerm;

	public Serializer() {
	currentTerm = new BytesRef(64);
	}

	/**
	* Reads the current line.
	*
	* @param isIncrementalEncodingSeed Whether the term is a seed of the incremental encoding.
	* {@code true} for the first and middle term, {@code false} for other terms.
	* @param reuse A {@link BlockLine} instance to reuse; or null if none.
	*/
	public BlockLine readLine(
	DataInput blockInput, boolean isIncrementalEncodingSeed, BlockLine reuse)
	throws IOException {
	int termStateRelativeOffset = blockInput.readVInt();
	if (termStateRelativeOffset < 0) {
	throw new CorruptIndexException(
	"Illegal termStateRelativeOffset= " + termStateRelativeOffset, blockInput);
	}
	return reuse == null
	? new BlockLine(
	readIncrementallyEncodedTerm(blockInput, isIncrementalEncodingSeed, null),
	termStateRelativeOffset)
	: reuse.reset(
	readIncrementallyEncodedTerm(blockInput, isIncrementalEncodingSeed, reuse.termBytes),
	termStateRelativeOffset);
	}

	/**
	* Writes a line and its offset to the corresponding term state details in the details region.
	*
	* @param blockOutput The output pointing to the block terms region.
	* @param termStateRelativeOffset The offset to the corresponding term state details in the
	* details region.
	* @param isIncrementalEncodingSeed Whether the term is a seed of the incremental encoding.
	* {@code true} for the first and middle term, {@code false} for other terms.
	*/
	public void writeLine(
	DataOutput blockOutput,
	BlockLine line,
	BlockLine previousLine,
	int termStateRelativeOffset,
	boolean isIncrementalEncodingSeed)
	throws IOException {
	blockOutput.writeVInt(termStateRelativeOffset);
	writeIncrementallyEncodedTerm(
	line.getTermBytes(),
	previousLine == null ? null : previousLine.getTermBytes(),
	isIncrementalEncodingSeed,
	blockOutput);
	}

	/**
	* Writes the term state details of a line in the details region.
	*
	* @param termStatesOutput The output pointing to the details region.
	*/
	protected void writeLineTermState(
	DataOutput termStatesOutput,
	BlockLine line,
	FieldInfo fieldInfo,
	DeltaBaseTermStateSerializer encoder)
	throws IOException {
	assert line.termState != null;
	encoder.writeTermState(termStatesOutput, fieldInfo, line.termState);
	}

	protected void writeIncrementallyEncodedTerm(
	TermBytes termBytes,
	TermBytes previousTermBytes,
	boolean isIncrementalEncodingSeed,
	DataOutput blockOutput)
	throws IOException {
	BytesRef term = termBytes.getTerm();
	assert term.offset == 0;
	if (isIncrementalEncodingSeed) {
	// Mdp length is always 1 for an incremental encoding seed.
	blockOutput.writeVLong(term.length);
	blockOutput.writeBytes(term.bytes, 0, term.length);
	return;
	}
	if (term.length == 0) {
	// Empty term.
	blockOutput.writeVLong(0);
	return;
	}

	// For other lines we store:
	// - Mdp length.
	// - Suffix length.
	// - Suffix bytes.
	// Instead of writing mdp length and suffix length with 2 VInt, we can compress the storage
	// by merging them in a single VLong. The idea is to leverage the information we have about
	// the previous line. We know the previous line term length. And we know that
	// new line mdp length <= (previous line term length + 1)
	// So if numMdpBits = numBitsToEncode(previous line term length),
	// then we know we can encode (new line mdp length - 1) in numMdpBits.
	// Hence we encode (new line mdp length - 1) in the rightmost numMdpBits of the VLong.
	// And we encode new line suffix length in the remaining left bits of the VLong.
	// Most of the time both values will be encoded in a single byte.

	assert previousTermBytes != null;
	assert termBytes.getMdpLength() >= 1;

	int numMdpBits = numBitsToEncode(previousTermBytes.getTerm().length);
	assert numBitsToEncode(termBytes.getMdpLength() - 1) <= numMdpBits;

	long mdpAndSuffixLengths =
	(((long) termBytes.getSuffixLength()) << numMdpBits) \| (termBytes.getMdpLength() - 1);
	assert mdpAndSuffixLengths != 0;
	blockOutput.writeVLong(mdpAndSuffixLengths);
	blockOutput.writeBytes(term.bytes, termBytes.getSuffixOffset(), termBytes.getSuffixLength());
	}

	protected TermBytes readIncrementallyEncodedTerm(
	DataInput blockInput, boolean isIncrementalEncodingSeed, TermBytes reuse)
	throws IOException {
	assert currentTerm.offset == 0;
	int mdpLength;
	if (isIncrementalEncodingSeed) {
	int length = (int) blockInput.readVLong();
	mdpLength = length == 0 ? 0 : 1;
	readBytes(blockInput, currentTerm, 0, length);
	} else {
	long mdpAndSuffixLengths = blockInput.readVLong();
	if (mdpAndSuffixLengths == 0) {
	// Empty term.
	mdpLength = 0;
	currentTerm.length = 0;
	} else {
	int numMdpBits = numBitsToEncode(currentTerm.length);
	mdpLength =
	(int) (mdpAndSuffixLengths & ((1 << numMdpBits) - 1))
	+ 1; // Get rightmost numMdpBits.
	int suffixLength = (int) (mdpAndSuffixLengths >>> numMdpBits); // Get remaining left bits.
	assert mdpLength >= 1;
	assert suffixLength >= 1;
	readBytes(blockInput, currentTerm, mdpLength - 1, suffixLength);
	}
	}
	return reuse == null
	? new TermBytes(mdpLength, currentTerm)
	: reuse.reset(mdpLength, currentTerm);
	}

	/**
	* Reads {@code length} bytes from the given {@link DataInput} and stores them at {@code offset}
	* in {@code bytes.bytes}.
	*/
	protected void readBytes(DataInput input, BytesRef bytes, int offset, int length)
	throws IOException {
	assert bytes.offset == 0;
	bytes.length = offset + length;
	bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length);
	input.readBytes(bytes.bytes, offset, length);
	}

	@Override
	public long ramBytesUsed() {
	return BASE_RAM_USAGE + RamUsageUtil.ramBytesUsed(currentTerm);
	}

	/**
	* Gets the number of bits required to encode the value of the provided int. Returns 0 for int
	* value 0. Equivalent to (log2(i) + 1).
	*/
	protected static int numBitsToEncode(int i) {
	return 32 - Integer.numberOfLeadingZeros(i);
	}
	}
	}