lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/BlockLine.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.lucene.codecs.uniformsplit;

 import java.io.IOException;

 import org.apache.lucene.codecs.BlockTermState;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.RamUsageEstimator;

 /**
  * One term block line.
  * <p>
  * Contains a term and its details as a {@link BlockTermState}.
  * <p>
  * The line is written to the {@link UniformSplitPostingsFormat#TERMS_BLOCKS_EXTENSION block file}
  * in two parts. The first part is the term followed by an offset to the details
  * region. The second part is the term {@link BlockTermState}, written in
  * the details region, after all the terms of the block.
  * <p>
  * The separate details region allows fast scan of the terms without having
  * to decode the details for each term. At read time, the {@link BlockLine.Serializer#readLine}
  * only reads the term and its offset to the details. The corresponding {@link BlockTermState}
  * is decoded on demand in the {@link BlockReader} (see {@link BlockReader#readTermStateIfNotRead}).
  *
  * @lucene.experimental
  */
 public class BlockLine implements Accountable {

   private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(BlockLine.class);

   /**
    * Term of this line. Replaced by {@link #reset} when the line is reused for reading.
    */
   protected TermBytes termBytes;

   /**
    * Offset of the term state bytes, relative to the term states base offset.
    * The writing constructor sets it to -1; {@link Serializer#writeLine} receives
    * the offset as a parameter instead.
    */
   protected int termStateRelativeOffset;

   /**
    * Only used for writing. Always null for a line created for reading.
    */
   protected final BlockTermState termState;

   /**
    * Constructor used for writing a {@link BlockLine}.
    */
   protected BlockLine(TermBytes termBytes, BlockTermState termState) {
     this(termBytes, -1, termState);
   }

   /**
    * Constructor used for reading a {@link BlockLine}.
    */
   protected BlockLine(TermBytes termBytes, int termStateRelativeOffset) {
     this(termBytes, termStateRelativeOffset, null);
   }

   private BlockLine(TermBytes termBytes, int termStateRelativeOffset, BlockTermState termState) {
     // Assign the fields directly rather than through the overridable reset(),
     // so a subclass override never runs on a partially constructed instance.
     this.termBytes = termBytes;
     this.termStateRelativeOffset = termStateRelativeOffset;
     this.termState = termState;
   }

   /**
    * Resets this {@link BlockLine} to reuse it when reading.
    * Must only be called on a line without term state (a reading line).
    *
    * @return This instance.
    */
   protected BlockLine reset(TermBytes termBytes, int termStateRelativeOffset) {
     assert termState == null;
     this.termBytes = termBytes;
     this.termStateRelativeOffset = termStateRelativeOffset;
     return this;
   }

   /**
    * @return The term of this line.
    */
   public TermBytes getTermBytes() {
     return termBytes;
   }

   /**
    * @return The offset of the {@link org.apache.lucene.index.TermState}
    * bytes in the block, relative to the term states base offset.
    */
   public int getTermStateRelativeOffset() {
     return termStateRelativeOffset;
   }

   @Override
   public long ramBytesUsed() {
     return BASE_RAM_USAGE
         + termBytes.ramBytesUsed()
         + RamUsageUtil.ramBytesUsed(termState);
   }

   /**
    * Reads/writes block lines with terms encoded incrementally inside a block.
    * This class keeps a state of the previous term read to decode the next term.
    */
   public static class Serializer implements Accountable {

     private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(Serializer.class);

     /**
      * Bytes of the last term read. A non-seed term is decoded in place on top
      * of it: the bytes before the suffix offset are kept and only the suffix
      * bytes are overwritten.
      */
     protected final BytesRef currentTerm;

     public Serializer() {
       currentTerm = new BytesRef(64);
     }

     /**
      * Reads the current line.
      *
      * @param blockInput                The input positioned at the start of the line.
      * @param isIncrementalEncodingSeed Whether the term is a seed of the
      *                                  incremental encoding. {@code true} for the first and
      *                                  middle term, {@code false} for other terms.
      * @param reuse                     A {@link BlockLine} instance to reuse; or null if none.
      * @return The line read; {@code reuse} itself when it is not null.
      * @throws CorruptIndexException If the offset or the term lengths read are illegal.
      */
     public BlockLine readLine(DataInput blockInput, boolean isIncrementalEncodingSeed, BlockLine reuse) throws IOException {
       int termStateRelativeOffset = blockInput.readVInt();
       if (termStateRelativeOffset < 0) {
         throw new CorruptIndexException("Illegal termStateRelativeOffset= " + termStateRelativeOffset, blockInput);
       }
       return reuse == null ?
           new BlockLine(readIncrementallyEncodedTerm(blockInput, isIncrementalEncodingSeed, null), termStateRelativeOffset)
           : reuse.reset(readIncrementallyEncodedTerm(blockInput, isIncrementalEncodingSeed, reuse.termBytes), termStateRelativeOffset);
     }

     /**
      * Writes a line and its offset to the corresponding term state details in
      * the details region.
      *
      * @param blockOutput               The output pointing to the block terms region.
      * @param line                      The line to write.
      * @param previousLine              The line written just before; may be null, in which
      *                                  case no previous term is passed to the term encoding.
      * @param termStateRelativeOffset   The offset to the corresponding term
      *                                  state details in the details region.
      * @param isIncrementalEncodingSeed Whether the term is a seed of
      *                                  the incremental encoding. {@code true} for the first
      *                                  and middle term, {@code false} for other terms.
      */
     public void writeLine(DataOutput blockOutput, BlockLine line, BlockLine previousLine,
                                  int termStateRelativeOffset, boolean isIncrementalEncodingSeed) throws IOException {
       blockOutput.writeVInt(termStateRelativeOffset);
       writeIncrementallyEncodedTerm(line.getTermBytes(), previousLine == null ? null : previousLine.getTermBytes(),
           isIncrementalEncodingSeed, blockOutput);
     }

     /**
      * Writes the term state details of a line in the details region.
      *
      * @param termStatesOutput The output pointing to the details region.
      * @param line             The line to write; it must hold a term state.
      * @param fieldInfo        The field info passed through to the encoder.
      * @param encoder          The serializer that encodes the term state.
      */
     protected void writeLineTermState(DataOutput termStatesOutput, BlockLine line,
                                    FieldInfo fieldInfo, DeltaBaseTermStateSerializer encoder) throws IOException {
       assert line.termState != null;
       encoder.writeTermState(termStatesOutput, fieldInfo, line.termState);
     }

     /**
      * Writes a term, either in full (seed) or as a suffix to append to a
      * prefix of the previous term (other lines).
      *
      * @param termBytes                 The term to write.
      * @param previousTermBytes         The previous term; only required when the term
      *                                  is neither a seed nor empty.
      * @param isIncrementalEncodingSeed Whether the term is a seed of the incremental encoding.
      * @param blockOutput               The output pointing to the block terms region.
      */
     protected void writeIncrementallyEncodedTerm(TermBytes termBytes, TermBytes previousTermBytes,
                                                       boolean isIncrementalEncodingSeed, DataOutput blockOutput) throws IOException {
       BytesRef term = termBytes.getTerm();
       assert term.offset == 0;
       if (isIncrementalEncodingSeed) {
         // A seed is written in full: its length then all its bytes. No mdp length is
         // stored; the reader derives it from the term length (0 for an empty term, 1 otherwise).
         blockOutput.writeVLong(term.length);
         blockOutput.writeBytes(term.bytes, 0, term.length);
         return;
       }
       if (term.length == 0) {
         // Empty term.
         blockOutput.writeVLong(0);
         return;
       }

       // For other lines we store:
       // - Mdp length.
       // - Suffix length.
       // - Suffix bytes.
       // Instead of writing mdp length and suffix length with 2 VInt, we can compress the storage
       // by merging them in a single VLong. The idea is to leverage the information we have about
       // the previous line. We know the previous line term length. And we know that
       // new line mdp length <= (previous line term length + 1)
       // So if numMdpBits = numBitsToEncode(previous line term length),
       // then we know we can encode (new line mdp length - 1) in numMdpBits.
       // Hence we encode (new line mdp length - 1) in the rightmost numMdpBits of the VLong.
       // And we encode new line suffix length in the remaining left bits of the VLong.
       // Most of the time both values will be encoded in a single byte.

       assert previousTermBytes != null;
       assert termBytes.getMdpLength() >= 1;

       int numMdpBits = numBitsToEncode(previousTermBytes.getTerm().length);
       assert numBitsToEncode(termBytes.getMdpLength() - 1) <= numMdpBits;

       long mdpAndSuffixLengths = (((long) termBytes.getSuffixLength()) << numMdpBits) | (termBytes.getMdpLength() - 1);
       // Zero is reserved to mark an empty term, see above.
       assert mdpAndSuffixLengths != 0;
       blockOutput.writeVLong(mdpAndSuffixLengths);
       blockOutput.writeBytes(term.bytes, termBytes.getSuffixOffset(), termBytes.getSuffixLength());
     }

     /**
      * Reads a term written by {@link #writeIncrementallyEncodedTerm} and
      * stores its bytes in {@link #currentTerm}.
      *
      * @param blockInput                The input positioned at the start of the encoded term.
      * @param isIncrementalEncodingSeed Whether the term is a seed of the incremental encoding.
      * @param reuse                     A {@link TermBytes} instance to reuse; or null if none.
      * @return The term read, built on {@link #currentTerm}.
      * @throws CorruptIndexException If the decoded lengths cannot be valid term lengths.
      */
     protected TermBytes readIncrementallyEncodedTerm(DataInput blockInput, boolean isIncrementalEncodingSeed, TermBytes reuse) throws IOException {
       assert currentTerm.offset == 0;
       int mdpLength;
       if (isIncrementalEncodingSeed) {
         // A seed stores its full length followed by all its bytes.
         long length = blockInput.readVLong();
         if (length < 0 || length >= Integer.MAX_VALUE) {
           // Reject the value instead of silently truncating it to an int.
           throw new CorruptIndexException("Illegal term length= " + length, blockInput);
         }
         mdpLength = length == 0 ? 0 : 1;
         readBytes(blockInput, currentTerm, 0, (int) length);
       } else {
         long mdpAndSuffixLengths = blockInput.readVLong();
         if (mdpAndSuffixLengths == 0) {
           // Empty term.
           mdpLength = 0;
           currentTerm.length = 0;
         } else {
           // currentTerm still holds the previous term; its length defines the bit layout.
           int numMdpBits = numBitsToEncode(currentTerm.length);
           long prefixLength = mdpAndSuffixLengths & ((1L << numMdpBits) - 1); // Rightmost numMdpBits: (mdp length - 1).
           long suffixLength = mdpAndSuffixLengths >>> numMdpBits; // Remaining left bits.
           if (suffixLength < 0 || suffixLength >= Integer.MAX_VALUE - prefixLength) {
             // The resulting term length would not fit in an int.
             throw new CorruptIndexException("Illegal term lengths: prefix= " + prefixLength
                 + " suffix= " + suffixLength, blockInput);
           }
           mdpLength = (int) prefixLength + 1;
           assert mdpLength >= 1;
           assert suffixLength >= 1;
           readBytes(blockInput, currentTerm, mdpLength - 1, (int) suffixLength);
         }
       }
       return reuse == null ?
           new TermBytes(mdpLength, currentTerm)
           : reuse.reset(mdpLength, currentTerm);
     }

     /**
      * Reads {@code length} bytes from the given {@link DataInput} and stores
      * them at {@code offset} in {@code bytes.bytes}. The array is grown if
      * needed and {@code bytes.length} is set to {@code offset + length}.
      */
     protected void readBytes(DataInput input, BytesRef bytes, int offset, int length) throws IOException {
       assert bytes.offset == 0;
       bytes.length = offset + length;
       bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length);
       input.readBytes(bytes.bytes, offset, length);
     }

     @Override
     public long ramBytesUsed() {
       return BASE_RAM_USAGE
           + RamUsageUtil.ramBytesUsed(currentTerm);
     }

     /**
      * Gets the number of bits required to encode the value of the provided int.
      * Returns 0 for int value 0. Equivalent to (floor(log2(i)) + 1) for a
      * positive value.
      */
     protected static int numBitsToEncode(int i) {
       return 32 - Integer.numberOfLeadingZeros(i);
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.lucene.codecs.uniformsplit;

	import java.io.IOException;

	import org.apache.lucene.codecs.BlockTermState;
	import org.apache.lucene.index.CorruptIndexException;
	import org.apache.lucene.index.FieldInfo;
	import org.apache.lucene.store.DataInput;
	import org.apache.lucene.store.DataOutput;
	import org.apache.lucene.util.Accountable;
	import org.apache.lucene.util.ArrayUtil;
	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.RamUsageEstimator;

	/**
	* One term block line.
	* <p>
	* Contains a term and its details as a {@link BlockTermState}.
	* <p>
	* The line is written to the {@link UniformSplitPostingsFormat#TERMS_BLOCKS_EXTENSION block file}
	* in two parts. The first part is the term followed by an offset to the details
	* region. The second part is the term {@link BlockTermState}, written in
	* the details region, after all the terms of the block.
	* <p>
	* The separate details region allows fast scan of the terms without having
	* to decode the details for each term. At read time, the {@link BlockLine.Serializer#readLine}
	* only reads the term and its offset to the details. The corresponding {@link BlockTermState}
	* is decoded on demand in the {@link BlockReader} (see {@link BlockReader#readTermStateIfNotRead}).
	*
	* @lucene.experimental
	*/
	public class BlockLine implements Accountable {

	private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(BlockLine.class);

	protected TermBytes termBytes;
	protected int termStateRelativeOffset;

	/**
	* Only used for writing.
	*/
	protected final BlockTermState termState;

	/**
	* Constructor used for writing a {@link BlockLine}.
	*/
	protected BlockLine(TermBytes termBytes, BlockTermState termState) {
	this(termBytes, -1, termState);
	}

	/**
	* Constructor used for reading a {@link BlockLine}.
	*/
	protected BlockLine(TermBytes termBytes, int termStateRelativeOffset) {
	this(termBytes, termStateRelativeOffset, null);
	}

	private BlockLine(TermBytes termBytes, int termStateRelativeOffset, BlockTermState termState) {
	reset(termBytes, termStateRelativeOffset);
	this.termState = termState;
	}

	/**
	* Resets this {@link BlockLine} to reuse it when reading.
	*/
	protected BlockLine reset(TermBytes termBytes, int termStateRelativeOffset) {
	assert termState == null;
	this.termBytes = termBytes;
	this.termStateRelativeOffset = termStateRelativeOffset;
	return this;
	}

	public TermBytes getTermBytes() {
	return termBytes;
	}

	/**
	* @return The offset of the {@link org.apache.lucene.index.TermState}
	* bytes in the block, relatively to the term states base offset.
	*/
	public int getTermStateRelativeOffset() {
	return termStateRelativeOffset;
	}

	@Override
	public long ramBytesUsed() {
	return BASE_RAM_USAGE
	+ termBytes.ramBytesUsed()
	+ RamUsageUtil.ramBytesUsed(termState);
	}

	/**
	* Reads/writes block lines with terms encoded incrementally inside a block.
	* This class keeps a state of the previous term read to decode the next term.
	*/
	public static class Serializer implements Accountable {

	private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(Serializer.class);

	protected final BytesRef currentTerm;

	public Serializer() {
	currentTerm = new BytesRef(64);
	}

	/**
	* Reads the current line.
	*
	* @param isIncrementalEncodingSeed Whether the term is a seed of the
	* incremental encoding. {@code true} for the first and
	* middle term, {@code false} for other terms.
	* @param reuse A {@link BlockLine} instance to reuse; or null if none.
	*/
	public BlockLine readLine(DataInput blockInput, boolean isIncrementalEncodingSeed, BlockLine reuse) throws IOException {
	int termStateRelativeOffset = blockInput.readVInt();
	if (termStateRelativeOffset < 0) {
	throw new CorruptIndexException("Illegal termStateRelativeOffset= " + termStateRelativeOffset, blockInput);
	}
	return reuse == null ?
	new BlockLine(readIncrementallyEncodedTerm(blockInput, isIncrementalEncodingSeed, null), termStateRelativeOffset)
	: reuse.reset(readIncrementallyEncodedTerm(blockInput, isIncrementalEncodingSeed, reuse.termBytes), termStateRelativeOffset);
	}

	/**
	* Writes a line and its offset to the corresponding term state details in
	* the details region.
	*
	* @param blockOutput The output pointing to the block terms region.
	* @param termStateRelativeOffset The offset to the corresponding term
	* state details in the details region.
	* @param isIncrementalEncodingSeed Whether the term is a seed of
	* the incremental encoding. {@code true} for the first
	* and middle term, {@code false} for other terms.
	*/
	public void writeLine(DataOutput blockOutput, BlockLine line, BlockLine previousLine,
	int termStateRelativeOffset, boolean isIncrementalEncodingSeed) throws IOException {
	blockOutput.writeVInt(termStateRelativeOffset);
	writeIncrementallyEncodedTerm(line.getTermBytes(), previousLine == null ? null : previousLine.getTermBytes(),
	isIncrementalEncodingSeed, blockOutput);
	}

	/**
	* Writes the term state details of a line in the details region.
	*
	* @param termStatesOutput The output pointing to the details region.
	*/
	protected void writeLineTermState(DataOutput termStatesOutput, BlockLine line,
	FieldInfo fieldInfo, DeltaBaseTermStateSerializer encoder) throws IOException {
	assert line.termState != null;
	encoder.writeTermState(termStatesOutput, fieldInfo, line.termState);
	}

	protected void writeIncrementallyEncodedTerm(TermBytes termBytes, TermBytes previousTermBytes,
	boolean isIncrementalEncodingSeed, DataOutput blockOutput) throws IOException {
	BytesRef term = termBytes.getTerm();
	assert term.offset == 0;
	if (isIncrementalEncodingSeed) {
	// Mdp length is always 1 for an incremental encoding seed.
	blockOutput.writeVLong(term.length);
	blockOutput.writeBytes(term.bytes, 0, term.length);
	return;
	}
	if (term.length == 0) {
	// Empty term.
	blockOutput.writeVLong(0);
	return;
	}

	// For other lines we store:
	// - Mdp length.
	// - Suffix length.
	// - Suffix bytes.
	// Instead of writing mdp length and suffix length with 2 VInt, we can compress the storage
	// by merging them in a single VLong. The idea is to leverage the information we have about
	// the previous line. We know the previous line term length. And we know that
	// new line mdp length <= (previous line term length + 1)
	// So if numMdpBits = numBitsToEncode(previous line term length),
	// then we know we can encode (new line mdp length - 1) in numMdpBits.
	// Hence we encode (new line mdp length - 1) in the rightmost numMdpBits of the VLong.
	// And we encode new line suffix length in the remaining left bits of the VLong.
	// Most of the time both values will be encoded in a single byte.

	assert previousTermBytes != null;
	assert termBytes.getMdpLength() >= 1;

	int numMdpBits = numBitsToEncode(previousTermBytes.getTerm().length);
	assert numBitsToEncode(termBytes.getMdpLength() - 1) <= numMdpBits;

	long mdpAndSuffixLengths = (((long) termBytes.getSuffixLength()) << numMdpBits) \| (termBytes.getMdpLength() - 1);
	assert mdpAndSuffixLengths != 0;
	blockOutput.writeVLong(mdpAndSuffixLengths);
	blockOutput.writeBytes(term.bytes, termBytes.getSuffixOffset(), termBytes.getSuffixLength());
	}

	protected TermBytes readIncrementallyEncodedTerm(DataInput blockInput, boolean isIncrementalEncodingSeed, TermBytes reuse) throws IOException {
	assert currentTerm.offset == 0;
	int mdpLength;
	if (isIncrementalEncodingSeed) {
	int length = (int) blockInput.readVLong();
	mdpLength = length == 0 ? 0 : 1;
	readBytes(blockInput, currentTerm, 0, length);
	} else {
	long mdpAndSuffixLengths = blockInput.readVLong();
	if (mdpAndSuffixLengths == 0) {
	// Empty term.
	mdpLength = 0;
	currentTerm.length = 0;
	} else {
	int numMdpBits = numBitsToEncode(currentTerm.length);
	mdpLength = (int) (mdpAndSuffixLengths & ((1 << numMdpBits) - 1)) + 1; // Get rightmost numMdpBits.
	int suffixLength = (int) (mdpAndSuffixLengths >>> numMdpBits); // Get remaining left bits.
	assert mdpLength >= 1;
	assert suffixLength >= 1;
	readBytes(blockInput, currentTerm, mdpLength - 1, suffixLength);
	}
	}
	return reuse == null ?
	new TermBytes(mdpLength, currentTerm)
	: reuse.reset(mdpLength, currentTerm);
	}

	/**
	* Reads {@code length} bytes from the given {@link DataInput} and stores
	* them at {@code offset} in {@code bytes.bytes}.
	*/
	protected void readBytes(DataInput input, BytesRef bytes, int offset, int length) throws IOException {
	assert bytes.offset == 0;
	bytes.length = offset + length;
	bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length);
	input.readBytes(bytes.bytes, offset, length);
	}

	@Override
	public long ramBytesUsed() {
	return BASE_RAM_USAGE
	+ RamUsageUtil.ramBytesUsed(currentTerm);
	}

	/**
	* Gets the number of bits required to encode the value of the provided int.
	* Returns 0 for int value 0. Equivalent to (log2(i) + 1).
	*/
	protected static int numBitsToEncode(int i) {
	return 32 - Integer.numberOfLeadingZeros(i);
	}
	}
	}