lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene70/IndexedDISI.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.backward_codecs.lucene70;

 import java.io.DataInput;
 import java.io.IOException;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BitSetIterator;
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.RoaringDocIdSet;

 /**
  * Disk-based implementation of a {@link DocIdSetIterator} which can return the index of the current
  * document, i.e. the ordinal of the current document among the list of documents that this iterator
  * can return. This is useful to implement sparse doc values by only having to encode values for
  * documents that actually have a value.
  *
  * <p>Implementation-wise, this {@link DocIdSetIterator} is inspired of {@link RoaringDocIdSet
  * roaring bitmaps} and encodes ranges of {@code 65536} documents independently and picks between 3
  * encodings depending on the density of the range:
  *
  * <ul>
  *   <li>{@code ALL} if the range contains 65536 documents exactly,
  *   <li>{@code DENSE} if the range contains 4096 documents or more; in that case documents are
  *       stored in a bit set,
  *   <li>{@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are stored in a {@link
  *       DataInput#readShort() short}.
  * </ul>
  *
  * <p>Only ranges that contain at least one value are encoded.
  *
  * <p>This implementation uses 6 bytes per document in the worst-case, which happens in the case
  * that all ranges contain exactly one document.
  *
  * @lucene.internal
  */
 final class IndexedDISI extends DocIdSetIterator {

   static final int MAX_ARRAY_LENGTH = (1 << 12) - 1;

   private static void flush(int block, FixedBitSet buffer, int cardinality, IndexOutput out)
       throws IOException {
     assert block >= 0 && block < 65536;
     out.writeShort((short) block);
     assert cardinality > 0 && cardinality <= 65536;
     out.writeShort((short) (cardinality - 1));
     if (cardinality > MAX_ARRAY_LENGTH) {
       if (cardinality != 65536) { // all docs are set
         for (long word : buffer.getBits()) {
           out.writeLong(word);
         }
       }
     } else {
       BitSetIterator it = new BitSetIterator(buffer, cardinality);
       for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
         out.writeShort((short) doc);
       }
     }
   }

   static void writeBitSet(DocIdSetIterator it, IndexOutput out) throws IOException {
     int i = 0;
     final FixedBitSet buffer = new FixedBitSet(1 << 16);
     int prevBlock = -1;
     for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
       final int block = doc >>> 16;
       if (prevBlock != -1 && block != prevBlock) {
         flush(prevBlock, buffer, i, out);
         buffer.clear(0, buffer.length());
         prevBlock = block;
         i = 0;
       }
       buffer.set(doc & 0xFFFF);
       i++;
       prevBlock = block;
     }
     if (i > 0) {
       flush(prevBlock, buffer, i, out);
       buffer.clear(0, buffer.length());
     }
     // NO_MORE_DOCS is stored explicitly
     buffer.set(DocIdSetIterator.NO_MORE_DOCS & 0xFFFF);
     flush(DocIdSetIterator.NO_MORE_DOCS >>> 16, buffer, 1, out);
   }

   /** The slice that stores the {@link DocIdSetIterator}. */
   private final IndexInput slice;

   private final long cost;

   IndexedDISI(IndexInput in, long offset, long length, long cost) throws IOException {
     this(in.slice("docs", offset, length), cost);
   }

   // This constructor allows to pass the slice directly in case it helps reuse
   // see eg. Lucene70 norms producer's merge instance
   IndexedDISI(IndexInput slice, long cost) throws IOException {
     this.slice = slice;
     this.cost = cost;
   }

   private int block = -1;
   private long blockEnd;
   private int nextBlockIndex = -1;
   Method method;

   private int doc = -1;
   private int index = -1;

   // SPARSE variables
   boolean exists;

   // DENSE variables
   private long word;
   private int wordIndex = -1;
   // number of one bits encountered so far, including those of `word`
   private int numberOfOnes;

   // ALL variables
   private int gap;

   @Override
   public int docID() {
     return doc;
   }

   @Override
   public int advance(int target) throws IOException {
     final int targetBlock = target & 0xFFFF0000;
     if (block < targetBlock) {
       advanceBlock(targetBlock);
     }
     if (block == targetBlock) {
       if (method.advanceWithinBlock(this, target)) {
         return doc;
       }
       readBlockHeader();
     }
     boolean found = method.advanceWithinBlock(this, block);
     assert found;
     return doc;
   }

   public boolean advanceExact(int target) throws IOException {
     final int targetBlock = target & 0xFFFF0000;
     if (block < targetBlock) {
       advanceBlock(targetBlock);
     }
     boolean found = block == targetBlock && method.advanceExactWithinBlock(this, target);
     this.doc = target;
     return found;
   }

   private void advanceBlock(int targetBlock) throws IOException {
     do {
       slice.seek(blockEnd);
       readBlockHeader();
     } while (block < targetBlock);
   }

   private void readBlockHeader() throws IOException {
     block = Short.toUnsignedInt(slice.readShort()) << 16;
     assert block >= 0;
     final int numValues = 1 + Short.toUnsignedInt(slice.readShort());
     index = nextBlockIndex;
     nextBlockIndex = index + numValues;
     if (numValues <= MAX_ARRAY_LENGTH) {
       method = Method.SPARSE;
       blockEnd = slice.getFilePointer() + (numValues << 1);
     } else if (numValues == 65536) {
       method = Method.ALL;
       blockEnd = slice.getFilePointer();
       gap = block - index - 1;
     } else {
       method = Method.DENSE;
       blockEnd = slice.getFilePointer() + (1 << 13);
       wordIndex = -1;
       numberOfOnes = index + 1;
     }
   }

   @Override
   public int nextDoc() throws IOException {
     return advance(doc + 1);
   }

   public int index() {
     return index;
   }

   @Override
   public long cost() {
     return cost;
   }

   enum Method {
     SPARSE {
       @Override
       boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
         final int targetInBlock = target & 0xFFFF;
         // TODO: binary search
         for (; disi.index < disi.nextBlockIndex; ) {
           int doc = Short.toUnsignedInt(disi.slice.readShort());
           disi.index++;
           if (doc >= targetInBlock) {
             disi.doc = disi.block | doc;
             disi.exists = true;
             return true;
           }
         }
         return false;
       }

       @Override
       boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException {
         final int targetInBlock = target & 0xFFFF;
         // TODO: binary search
         if (target == disi.doc) {
           return disi.exists;
         }
         for (; disi.index < disi.nextBlockIndex; ) {
           int doc = Short.toUnsignedInt(disi.slice.readShort());
           disi.index++;
           if (doc >= targetInBlock) {
             if (doc != targetInBlock) {
               disi.index--;
               disi.slice.seek(disi.slice.getFilePointer() - Short.BYTES);
               break;
             }
             disi.exists = true;
             return true;
           }
         }
         disi.exists = false;
         return false;
       }
     },
     DENSE {
       @Override
       boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
         final int targetInBlock = target & 0xFFFF;
         final int targetWordIndex = targetInBlock >>> 6;
         for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) {
           disi.word = disi.slice.readLong();
           disi.numberOfOnes += Long.bitCount(disi.word);
         }
         disi.wordIndex = targetWordIndex;

         long leftBits = disi.word >>> target;
         if (leftBits != 0L) {
           disi.doc = target + Long.numberOfTrailingZeros(leftBits);
           disi.index = disi.numberOfOnes - Long.bitCount(leftBits);
           return true;
         }

         while (++disi.wordIndex < 1024) {
           disi.word = disi.slice.readLong();
           if (disi.word != 0) {
             disi.index = disi.numberOfOnes;
             disi.numberOfOnes += Long.bitCount(disi.word);
             disi.doc = disi.block | (disi.wordIndex << 6) | Long.numberOfTrailingZeros(disi.word);
             return true;
           }
         }
         return false;
       }

       @Override
       boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException {
         final int targetInBlock = target & 0xFFFF;
         final int targetWordIndex = targetInBlock >>> 6;
         for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) {
           disi.word = disi.slice.readLong();
           disi.numberOfOnes += Long.bitCount(disi.word);
         }
         disi.wordIndex = targetWordIndex;

         long leftBits = disi.word >>> target;
         disi.index = disi.numberOfOnes - Long.bitCount(leftBits);
         return (leftBits & 1L) != 0;
       }
     },
     ALL {
       @Override
       boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
         disi.doc = target;
         disi.index = target - disi.gap;
         return true;
       }

       @Override
       boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException {
         disi.index = target - disi.gap;
         return true;
       }
     };

     /**
      * Advance to the first doc from the block that is equal to or greater than {@code target}.
      * Return true if there is such a doc and false otherwise.
      */
     abstract boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException;

     /**
      * Advance the iterator exactly to the position corresponding to the given {@code target} and
      * return whether this document exists.
      */
     abstract boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.backward_codecs.lucene70;

	import java.io.DataInput;
	import java.io.IOException;
	import org.apache.lucene.search.DocIdSetIterator;
	import org.apache.lucene.store.IndexInput;
	import org.apache.lucene.store.IndexOutput;
	import org.apache.lucene.util.BitSetIterator;
	import org.apache.lucene.util.FixedBitSet;
	import org.apache.lucene.util.RoaringDocIdSet;

	/**
	* Disk-based implementation of a {@link DocIdSetIterator} which can return the index of the current
	* document, i.e. the ordinal of the current document among the list of documents that this iterator
	* can return. This is useful to implement sparse doc values by only having to encode values for
	* documents that actually have a value.
	*
	* <p>Implementation-wise, this {@link DocIdSetIterator} is inspired of {@link RoaringDocIdSet
	* roaring bitmaps} and encodes ranges of {@code 65536} documents independently and picks between 3
	* encodings depending on the density of the range:
	*
	* <ul>
	* <li>{@code ALL} if the range contains 65536 documents exactly,
	* <li>{@code DENSE} if the range contains 4096 documents or more; in that case documents are
	* stored in a bit set,
	* <li>{@code SPARSE} otherwise, and the lower 16 bits of the doc IDs are stored in a {@link
	* DataInput#readShort() short}.
	* </ul>
	*
	* <p>Only ranges that contain at least one value are encoded.
	*
	* <p>This implementation uses 6 bytes per document in the worst-case, which happens in the case
	* that all ranges contain exactly one document.
	*
	* @lucene.internal
	*/
	final class IndexedDISI extends DocIdSetIterator {

	static final int MAX_ARRAY_LENGTH = (1 << 12) - 1;

	private static void flush(int block, FixedBitSet buffer, int cardinality, IndexOutput out)
	throws IOException {
	assert block >= 0 && block < 65536;
	out.writeShort((short) block);
	assert cardinality > 0 && cardinality <= 65536;
	out.writeShort((short) (cardinality - 1));
	if (cardinality > MAX_ARRAY_LENGTH) {
	if (cardinality != 65536) { // all docs are set
	for (long word : buffer.getBits()) {
	out.writeLong(word);
	}
	}
	} else {
	BitSetIterator it = new BitSetIterator(buffer, cardinality);
	for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
	out.writeShort((short) doc);
	}
	}
	}

	static void writeBitSet(DocIdSetIterator it, IndexOutput out) throws IOException {
	int i = 0;
	final FixedBitSet buffer = new FixedBitSet(1 << 16);
	int prevBlock = -1;
	for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
	final int block = doc >>> 16;
	if (prevBlock != -1 && block != prevBlock) {
	flush(prevBlock, buffer, i, out);
	buffer.clear(0, buffer.length());
	prevBlock = block;
	i = 0;
	}
	buffer.set(doc & 0xFFFF);
	i++;
	prevBlock = block;
	}
	if (i > 0) {
	flush(prevBlock, buffer, i, out);
	buffer.clear(0, buffer.length());
	}
	// NO_MORE_DOCS is stored explicitly
	buffer.set(DocIdSetIterator.NO_MORE_DOCS & 0xFFFF);
	flush(DocIdSetIterator.NO_MORE_DOCS >>> 16, buffer, 1, out);
	}

	/** The slice that stores the {@link DocIdSetIterator}. */
	private final IndexInput slice;

	private final long cost;

	IndexedDISI(IndexInput in, long offset, long length, long cost) throws IOException {
	this(in.slice("docs", offset, length), cost);
	}

	// This constructor allows to pass the slice directly in case it helps reuse
	// see eg. Lucene70 norms producer's merge instance
	IndexedDISI(IndexInput slice, long cost) throws IOException {
	this.slice = slice;
	this.cost = cost;
	}

	private int block = -1;
	private long blockEnd;
	private int nextBlockIndex = -1;
	Method method;

	private int doc = -1;
	private int index = -1;

	// SPARSE variables
	boolean exists;

	// DENSE variables
	private long word;
	private int wordIndex = -1;
	// number of one bits encountered so far, including those of `word`
	private int numberOfOnes;

	// ALL variables
	private int gap;

	@Override
	public int docID() {
	return doc;
	}

	@Override
	public int advance(int target) throws IOException {
	final int targetBlock = target & 0xFFFF0000;
	if (block < targetBlock) {
	advanceBlock(targetBlock);
	}
	if (block == targetBlock) {
	if (method.advanceWithinBlock(this, target)) {
	return doc;
	}
	readBlockHeader();
	}
	boolean found = method.advanceWithinBlock(this, block);
	assert found;
	return doc;
	}

	public boolean advanceExact(int target) throws IOException {
	final int targetBlock = target & 0xFFFF0000;
	if (block < targetBlock) {
	advanceBlock(targetBlock);
	}
	boolean found = block == targetBlock && method.advanceExactWithinBlock(this, target);
	this.doc = target;
	return found;
	}

	private void advanceBlock(int targetBlock) throws IOException {
	do {
	slice.seek(blockEnd);
	readBlockHeader();
	} while (block < targetBlock);
	}

	private void readBlockHeader() throws IOException {
	block = Short.toUnsignedInt(slice.readShort()) << 16;
	assert block >= 0;
	final int numValues = 1 + Short.toUnsignedInt(slice.readShort());
	index = nextBlockIndex;
	nextBlockIndex = index + numValues;
	if (numValues <= MAX_ARRAY_LENGTH) {
	method = Method.SPARSE;
	blockEnd = slice.getFilePointer() + (numValues << 1);
	} else if (numValues == 65536) {
	method = Method.ALL;
	blockEnd = slice.getFilePointer();
	gap = block - index - 1;
	} else {
	method = Method.DENSE;
	blockEnd = slice.getFilePointer() + (1 << 13);
	wordIndex = -1;
	numberOfOnes = index + 1;
	}
	}

	@Override
	public int nextDoc() throws IOException {
	return advance(doc + 1);
	}

	public int index() {
	return index;
	}

	@Override
	public long cost() {
	return cost;
	}

	enum Method {
	SPARSE {
	@Override
	boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
	final int targetInBlock = target & 0xFFFF;
	// TODO: binary search
	for (; disi.index < disi.nextBlockIndex; ) {
	int doc = Short.toUnsignedInt(disi.slice.readShort());
	disi.index++;
	if (doc >= targetInBlock) {
	disi.doc = disi.block \| doc;
	disi.exists = true;
	return true;
	}
	}
	return false;
	}

	@Override
	boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException {
	final int targetInBlock = target & 0xFFFF;
	// TODO: binary search
	if (target == disi.doc) {
	return disi.exists;
	}
	for (; disi.index < disi.nextBlockIndex; ) {
	int doc = Short.toUnsignedInt(disi.slice.readShort());
	disi.index++;
	if (doc >= targetInBlock) {
	if (doc != targetInBlock) {
	disi.index--;
	disi.slice.seek(disi.slice.getFilePointer() - Short.BYTES);
	break;
	}
	disi.exists = true;
	return true;
	}
	}
	disi.exists = false;
	return false;
	}
	},
	DENSE {
	@Override
	boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
	final int targetInBlock = target & 0xFFFF;
	final int targetWordIndex = targetInBlock >>> 6;
	for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) {
	disi.word = disi.slice.readLong();
	disi.numberOfOnes += Long.bitCount(disi.word);
	}
	disi.wordIndex = targetWordIndex;

	long leftBits = disi.word >>> target;
	if (leftBits != 0L) {
	disi.doc = target + Long.numberOfTrailingZeros(leftBits);
	disi.index = disi.numberOfOnes - Long.bitCount(leftBits);
	return true;
	}

	while (++disi.wordIndex < 1024) {
	disi.word = disi.slice.readLong();
	if (disi.word != 0) {
	disi.index = disi.numberOfOnes;
	disi.numberOfOnes += Long.bitCount(disi.word);
	disi.doc = disi.block \| (disi.wordIndex << 6) \| Long.numberOfTrailingZeros(disi.word);
	return true;
	}
	}
	return false;
	}

	@Override
	boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException {
	final int targetInBlock = target & 0xFFFF;
	final int targetWordIndex = targetInBlock >>> 6;
	for (int i = disi.wordIndex + 1; i <= targetWordIndex; ++i) {
	disi.word = disi.slice.readLong();
	disi.numberOfOnes += Long.bitCount(disi.word);
	}
	disi.wordIndex = targetWordIndex;

	long leftBits = disi.word >>> target;
	disi.index = disi.numberOfOnes - Long.bitCount(leftBits);
	return (leftBits & 1L) != 0;
	}
	},
	ALL {
	@Override
	boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException {
	disi.doc = target;
	disi.index = target - disi.gap;
	return true;
	}

	@Override
	boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException {
	disi.index = target - disi.gap;
	return true;
	}
	};

	/**
	* Advance to the first doc from the block that is equal to or greater than {@code target}.
	* Return true if there is such a doc and false otherwise.
	*/
	abstract boolean advanceWithinBlock(IndexedDISI disi, int target) throws IOException;

	/**
	* Advance the iterator exactly to the position corresponding to the given {@code target} and
	* return whether this document exists.
	*/
	abstract boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException;
	}
	}