| /* |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.hadoop.hbase.io.hfile; |
| |
| import java.io.ByteArrayOutputStream; |
| import java.io.DataInput; |
| import java.io.DataInputStream; |
| import java.io.DataOutput; |
| import java.io.DataOutputStream; |
| import java.io.IOException; |
| import java.nio.ByteBuffer; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.List; |
| import java.util.concurrent.atomic.AtomicReference; |
| |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.FSDataOutputStream; |
| import org.apache.hadoop.hbase.ByteBufferedKeyOnlyKeyValue; |
| import org.apache.hadoop.hbase.Cell; |
| import org.apache.hadoop.hbase.CellComparator; |
| import org.apache.hadoop.hbase.CellUtil; |
| import org.apache.hadoop.hbase.KeyValue; |
| import org.apache.hadoop.hbase.KeyValue.KeyOnlyKeyValue; |
| import org.apache.hadoop.hbase.classification.InterfaceAudience; |
| import org.apache.hadoop.hbase.io.HeapSize; |
| import org.apache.hadoop.hbase.io.encoding.DataBlockEncoding; |
| import org.apache.hadoop.hbase.io.hfile.HFile.CachingBlockReader; |
| import org.apache.hadoop.hbase.nio.ByteBuff; |
| import org.apache.hadoop.hbase.regionserver.KeyValueScanner; |
| import org.apache.hadoop.hbase.util.Bytes; |
| import org.apache.hadoop.hbase.util.ClassSize; |
| import org.apache.hadoop.hbase.util.ObjectIntPair; |
| import org.apache.hadoop.io.WritableUtils; |
| import org.apache.hadoop.util.StringUtils; |
| |
| /** |
| * Provides functionality to write ({@link BlockIndexWriter}) and read |
| * ({@link BlockIndexReader}) single-level and multi-level block indexes. |
| * |
| * Examples of how to use the block index writer can be found in |
| * {@link org.apache.hadoop.hbase.io.hfile.CompoundBloomFilterWriter} and |
| * {@link HFileWriterImpl}. Examples of how to use the reader can be |
| * found in {@link HFileReaderImpl} and |
| * {@link org.apache.hadoop.hbase.io.hfile.TestHFileBlockIndex}. |
| */ |
| @InterfaceAudience.Private |
| public class HFileBlockIndex { |
| |
| private static final Log LOG = LogFactory.getLog(HFileBlockIndex.class); |
| |
| static final int DEFAULT_MAX_CHUNK_SIZE = 128 * 1024; |
| |
| /** |
| * The maximum size guideline for index blocks (leaf, intermediate, and |
| * root). If not specified, <code>DEFAULT_MAX_CHUNK_SIZE</code> is used. |
| */ |
| public static final String MAX_CHUNK_SIZE_KEY = "hfile.index.block.max.size"; |
| |
| /** |
| * Minimum number of entries in a single index block. Even if we are above the |
| * hfile.index.block.max.size we will keep writing to the same block unless we have that many |
| * entries. We should have at least a few entries so that we don't have too many levels in the |
| * multi-level index. This should be at least 2 to make sure there is no infinite recursion. |
| */ |
| public static final String MIN_INDEX_NUM_ENTRIES_KEY = "hfile.index.block.min.entries"; |
| |
| static final int DEFAULT_MIN_INDEX_NUM_ENTRIES = 16; |
| |
| /** |
| * The number of bytes stored in each "secondary index" entry in addition to |
| * key bytes in the non-root index block format. The first long is the file |
| * offset of the deeper-level block the entry points to, and the int that |
| * follows is that block's on-disk size without including header. |
| */ |
| static final int SECONDARY_INDEX_ENTRY_OVERHEAD = Bytes.SIZEOF_INT |
| + Bytes.SIZEOF_LONG; |
| |
| /** |
| * Error message when trying to use inline block API in single-level mode. |
| */ |
| private static final String INLINE_BLOCKS_NOT_ALLOWED = |
| "Inline blocks are not allowed in the single-level-only mode"; |
| |
| /** |
| * The size of a meta-data record used for finding the mid-key in a |
| * multi-level index. Consists of the middle leaf-level index block offset |
| * (long), its on-disk size without header included (int), and the mid-key |
| * entry's zero-based index in that leaf index block. |
| */ |
| private static final int MID_KEY_METADATA_SIZE = Bytes.SIZEOF_LONG + |
| 2 * Bytes.SIZEOF_INT; |
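| |
| // The mid-key metadata record is therefore 16 bytes on disk: |
| // midLeafBlockOffset (long, 8) + midLeafBlockOnDiskSize (int, 4) |
| // + midKeyEntry (int, 4). It is written by BlockIndexChunk#getMidKeyMetadata() |
| // and read back by BlockIndexReader#readMultiLevelIndexRoot(). |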
| |
| /** |
| * An implementation of the BlockIndexReader that deals with block keys which |
| * are plain byte[], like the meta block index or the bloom block index for |
| * ROW blooms. It does not need a CellComparator; keys can be compared with |
| * Bytes.BYTES_RAWCOMPARATOR. |
| */ |
| static class ByteArrayKeyBlockIndexReader extends BlockIndexReader { |
| |
| private byte[][] blockKeys; |
| |
| public ByteArrayKeyBlockIndexReader(final int treeLevel, |
| final CachingBlockReader cachingBlockReader) { |
| this(treeLevel); |
| this.cachingBlockReader = cachingBlockReader; |
| } |
| |
| public ByteArrayKeyBlockIndexReader(final int treeLevel) { |
| searchTreeLevel = treeLevel; |
| } |
| |
| @Override |
| protected long calculateHeapSizeForBlockKeys(long heapSize) { |
| // Calculating the size of blockKeys |
| if (blockKeys != null) { |
| heapSize += ClassSize.REFERENCE; |
| // Adding array + references overhead |
| heapSize += ClassSize.align(ClassSize.ARRAY + blockKeys.length * ClassSize.REFERENCE); |
| |
| // Adding bytes |
| for (byte[] key : blockKeys) { |
| heapSize += ClassSize.align(ClassSize.ARRAY + key.length); |
| } |
| } |
| return heapSize; |
| } |
| |
| @Override |
| public boolean isEmpty() { |
| return blockKeys.length == 0; |
| } |
| |
| /** |
| * @param i |
| * from 0 to {@link #getRootBlockCount()} - 1 |
| */ |
| public byte[] getRootBlockKey(int i) { |
| return blockKeys[i]; |
| } |
| |
| @Override |
| public BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock, |
| boolean cacheBlocks, boolean pread, boolean isCompaction, |
| DataBlockEncoding expectedDataBlockEncoding) throws IOException { |
| // Not used for byte[]-keyed indexes (meta index, bloom blocks); these are root-level only. |
| return null; |
| } |
| |
| @Override |
| public Cell midkey() throws IOException { |
| // A mid-key is never requested for the meta or bloom block indexes. |
| return null; |
| } |
| |
| @Override |
| protected void initialize(int numEntries) { |
| blockKeys = new byte[numEntries][]; |
| } |
| |
| @Override |
| protected void add(final byte[] key, final long offset, final int dataSize) { |
| blockOffsets[rootCount] = offset; |
| blockKeys[rootCount] = key; |
| blockDataSizes[rootCount] = dataSize; |
| rootCount++; |
| } |
| |
| @Override |
| public int rootBlockContainingKey(byte[] key, int offset, int length, CellComparator comp) { |
| int pos = Bytes.binarySearch(blockKeys, key, offset, length); |
| // pos is between -(blockKeys.length + 1) and blockKeys.length - 1, see |
| // binarySearch's javadoc. |
| |
| if (pos >= 0) { |
| // This means this is an exact match with an element of blockKeys. |
| assert pos < blockKeys.length; |
| return pos; |
| } |
| |
| // Otherwise, pos = -(i + 1), where blockKeys[i - 1] < key < blockKeys[i], |
| // and i is in [0, blockKeys.length]. We are returning j = i - 1 such that |
| // blockKeys[j] <= key < blockKeys[j + 1]. In particular, j = -1 if |
| // key < blockKeys[0], meaning the file does not contain the given key. |
| |
| int i = -pos - 1; |
| assert 0 <= i && i <= blockKeys.length; |
| return i - 1; |
| } |
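| |
| // Worked example of the convention above: with blockKeys = {"c", "f", "k"}, |
| // searching for "f" returns 1 (exact match); searching for "g" yields |
| // pos = -3, so i = 2 and we return i - 1 = 1 (the block starting at "f"); |
| // searching for "a" returns -1, since the key precedes the first block. |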
| |
| @Override |
| public int rootBlockContainingKey(Cell key) { |
| // Should not be called on this because here it deals only with byte[] |
| throw new UnsupportedOperationException( |
| "Cannot search for a key that is of Cell type. Only plain byte array keys " + |
| "can be searched for"); |
| } |
| |
| @Override |
| public String toString() { |
| StringBuilder sb = new StringBuilder(); |
| sb.append("size=" + rootCount).append("\n"); |
| for (int i = 0; i < rootCount; i++) { |
| sb.append("key=").append(KeyValue.keyToString(blockKeys[i])) |
| .append("\n offset=").append(blockOffsets[i]) |
| .append(", dataSize=" + blockDataSizes[i]).append("\n"); |
| } |
| return sb.toString(); |
| } |
| |
| } |
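| |
| // A minimal usage sketch for this reader (stream and variable names are |
| // illustrative, not part of the API; see HFileReaderImpl for real call sites): |
| // |
| // ByteArrayKeyBlockIndexReader reader = new ByteArrayKeyBlockIndexReader(1); |
| // reader.readRootIndex(metaIndexIn, numMetaEntries); |
| // int block = reader.rootBlockContainingKey(metaKey, 0, metaKey.length); |
| // if (block >= 0) { |
| // long offset = reader.getRootBlockOffset(block); // where to read the block |
| // int onDiskSize = reader.getRootBlockDataSize(block); // how many bytes |
| // } |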
| |
| /** |
| * An implementation of the BlockIndexReader that deals with block keys which |
| * are the key part of a cell, like the data block index or the ROW_COL bloom |
| * blocks. This needs a CellComparator to work with the Cells. |
| */ |
| static class CellBasedKeyBlockIndexReader extends BlockIndexReader { |
| |
| private Cell[] blockKeys; |
| /** Pre-computed mid-key */ |
| private AtomicReference<Cell> midKey = new AtomicReference<Cell>(); |
| /** Needed when doing lookup on blocks. */ |
| private CellComparator comparator; |
| |
| public CellBasedKeyBlockIndexReader(final CellComparator c, final int treeLevel, |
| final CachingBlockReader cachingBlockReader) { |
| this(c, treeLevel); |
| this.cachingBlockReader = cachingBlockReader; |
| } |
| |
| public CellBasedKeyBlockIndexReader(final CellComparator c, final int treeLevel) { |
| // Can be null for METAINDEX block |
| comparator = c; |
| searchTreeLevel = treeLevel; |
| } |
| |
| @Override |
| protected long calculateHeapSizeForBlockKeys(long heapSize) { |
| if (blockKeys != null) { |
| heapSize += ClassSize.REFERENCE; |
| // Adding array + references overhead |
| heapSize += ClassSize.align(ClassSize.ARRAY + blockKeys.length * ClassSize.REFERENCE); |
| |
| // Adding blockKeys |
| for (Cell key : blockKeys) { |
| heapSize += ClassSize.align(CellUtil.estimatedHeapSizeOf(key)); |
| } |
| } |
| // Add comparator and the midkey atomicreference |
| heapSize += 2 * ClassSize.REFERENCE; |
| return heapSize; |
| } |
| |
| @Override |
| public boolean isEmpty() { |
| return blockKeys.length == 0; |
| } |
| |
| /** |
| * @param i |
| * from 0 to {@link #getRootBlockCount()} - 1 |
| */ |
| public Cell getRootBlockKey(int i) { |
| return blockKeys[i]; |
| } |
| |
| @Override |
| public BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock, |
| boolean cacheBlocks, boolean pread, boolean isCompaction, |
| DataBlockEncoding expectedDataBlockEncoding) throws IOException { |
| int rootLevelIndex = rootBlockContainingKey(key); |
| if (rootLevelIndex < 0 || rootLevelIndex >= blockOffsets.length) { |
| return null; |
| } |
| |
| // the next indexed key |
| Cell nextIndexedKey = null; |
| |
| // Read the next-level (intermediate or leaf) index block. |
| long currentOffset = blockOffsets[rootLevelIndex]; |
| int currentOnDiskSize = blockDataSizes[rootLevelIndex]; |
| |
| if (rootLevelIndex < blockKeys.length - 1) { |
| nextIndexedKey = blockKeys[rootLevelIndex + 1]; |
| } else { |
| nextIndexedKey = KeyValueScanner.NO_NEXT_INDEXED_KEY; |
| } |
| |
| int lookupLevel = 1; // How many levels deep we are in our lookup. |
| int index = -1; |
| |
| HFileBlock block = null; |
| boolean dataBlock = false; |
| KeyOnlyKeyValue tmpNextIndexKV = new KeyValue.KeyOnlyKeyValue(); |
| while (true) { |
| try { |
| if (currentBlock != null && currentBlock.getOffset() == currentOffset) { |
| // Avoid reading the same block again, even with caching turned off. |
| // This is crucial for compaction-type workload which might have |
| // caching turned off. This is like a one-block cache inside the |
| // scanner. |
| block = currentBlock; |
| } else { |
| // Call HFile's caching block reader API. We always cache index |
| // blocks, otherwise we might get terrible performance. |
| boolean shouldCache = cacheBlocks || (lookupLevel < searchTreeLevel); |
| BlockType expectedBlockType; |
| if (lookupLevel < searchTreeLevel - 1) { |
| expectedBlockType = BlockType.INTERMEDIATE_INDEX; |
| } else if (lookupLevel == searchTreeLevel - 1) { |
| expectedBlockType = BlockType.LEAF_INDEX; |
| } else { |
| // this also accounts for ENCODED_DATA |
| expectedBlockType = BlockType.DATA; |
| } |
| block = |
| cachingBlockReader.readBlock(currentOffset, currentOnDiskSize, shouldCache, pread, |
| isCompaction, true, expectedBlockType, expectedDataBlockEncoding); |
| } |
| |
| if (block == null) { |
| throw new IOException("Failed to read block at offset " + currentOffset |
| + ", onDiskSize=" + currentOnDiskSize); |
| } |
| |
| // Found a data block, break the loop and check our level in the tree. |
| if (block.getBlockType().isData()) { |
| dataBlock = true; |
| break; |
| } |
| |
| // Not a data block. This must be a leaf-level or intermediate-level |
| // index block. We don't allow going deeper than searchTreeLevel. |
| if (++lookupLevel > searchTreeLevel) { |
| throw new IOException("Search Tree Level overflow: lookupLevel=" + lookupLevel |
| + ", searchTreeLevel=" + searchTreeLevel); |
| } |
| |
| // Locate the entry corresponding to the given key in the non-root |
| // (leaf or intermediate-level) index block. |
| ByteBuff buffer = block.getBufferWithoutHeader(); |
| index = locateNonRootIndexEntry(buffer, key, comparator); |
| if (index == -1) { |
| // The key is before the first key of this non-root index block, which |
| // should not happen if the root-level lookup placed us here. |
| throw new IOException("The key " |
| + CellUtil.getCellKeyAsString(key) |
| + " is before the first key of the non-root index block " + block); |
| } |
| |
| currentOffset = buffer.getLong(); |
| currentOnDiskSize = buffer.getInt(); |
| |
| // Only update next indexed key if there is a next indexed key in the current level |
| byte[] nonRootIndexedKey = getNonRootIndexedKey(buffer, index + 1); |
| if (nonRootIndexedKey != null) { |
| tmpNextIndexKV.setKey(nonRootIndexedKey, 0, nonRootIndexedKey.length); |
| nextIndexedKey = tmpNextIndexKV; |
| } |
| } finally { |
| if (!dataBlock) { |
| // Return the block immediately if it is not the |
| // data block |
| cachingBlockReader.returnBlock(block); |
| } |
| } |
| } |
| |
| if (lookupLevel != searchTreeLevel) { |
| assert dataBlock; |
| // We reached a data block before traversing all index levels. Return the |
| // block so that its ref count can be decremented. |
| cachingBlockReader.returnBlock(block); |
| throw new IOException("Reached a data block at level " + lookupLevel + |
| " but the number of levels is " + searchTreeLevel); |
| } |
| |
| // set the next indexed key for the current block. |
| BlockWithScanInfo blockWithScanInfo = new BlockWithScanInfo(block, nextIndexedKey); |
| return blockWithScanInfo; |
| } |
| |
| @Override |
| public Cell midkey() throws IOException { |
| if (rootCount == 0) |
| throw new IOException("HFile empty"); |
| |
| Cell targetMidKey = this.midKey.get(); |
| if (targetMidKey != null) { |
| return targetMidKey; |
| } |
| |
| if (midLeafBlockOffset >= 0) { |
| if (cachingBlockReader == null) { |
| throw new IOException("Have to read the middle leaf block but " + |
| "no block reader available"); |
| } |
| |
| // Caching, using pread, assuming this is not a compaction. |
| HFileBlock midLeafBlock = cachingBlockReader.readBlock( |
| midLeafBlockOffset, midLeafBlockOnDiskSize, true, true, false, true, |
| BlockType.LEAF_INDEX, null); |
| try { |
| ByteBuff b = midLeafBlock.getBufferWithoutHeader(); |
| int numDataBlocks = b.getIntAfterPosition(0); |
| int keyRelOffset = b.getIntAfterPosition(Bytes.SIZEOF_INT * (midKeyEntry + 1)); |
| int keyLen = b.getIntAfterPosition(Bytes.SIZEOF_INT * (midKeyEntry + 2)) - keyRelOffset |
| - SECONDARY_INDEX_ENTRY_OVERHEAD; // the entry overhead is not part of the key |
| int keyOffset = |
| Bytes.SIZEOF_INT * (numDataBlocks + 2) + keyRelOffset |
| + SECONDARY_INDEX_ENTRY_OVERHEAD; |
| byte[] bytes = b.toBytes(keyOffset, keyLen); |
| targetMidKey = new KeyValue.KeyOnlyKeyValue(bytes, 0, bytes.length); |
| } finally { |
| cachingBlockReader.returnBlock(midLeafBlock); |
| } |
| } else { |
| // The middle of the root-level index. |
| targetMidKey = blockKeys[rootCount / 2]; |
| } |
| |
| this.midKey.set(targetMidKey); |
| return targetMidKey; |
| } |
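| |
| // Worked example of the leaf-block arithmetic above: for a leaf block with |
| // numDataBlocks = 3 and key lengths {5, 7, 9}, the secondary index holds |
| // {0, 17, 36, 57} and entries start at 4 * (3 + 2) = 20. For midKeyEntry = 1, |
| // keyRelOffset = 17, keyLen = 36 - 17 - 12 = 7, and |
| // keyOffset = 20 + 17 + 12 = 49, i.e. the 7 key bytes of entry #1. |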
| |
| @Override |
| protected void initialize(int numEntries) { |
| blockKeys = new Cell[numEntries]; |
| } |
| |
| /** |
| * Adds a new entry in the root block index. Only used when reading. |
| * |
| * @param key the first key in the block pointed to by this entry |
| * @param offset file offset where the block is stored |
| * @param dataSize the on-disk (version 2) or uncompressed (version 1) size of the block |
| */ |
| @Override |
| protected void add(final byte[] key, final long offset, final int dataSize) { |
| blockOffsets[rootCount] = offset; |
| // Create the blockKeys as Cells once when the reader is opened |
| blockKeys[rootCount] = new KeyValue.KeyOnlyKeyValue(key, 0, key.length); |
| blockDataSizes[rootCount] = dataSize; |
| rootCount++; |
| } |
| |
| @Override |
| public int rootBlockContainingKey(final byte[] key, int offset, int length, |
| CellComparator comp) { |
| // This should always be called with a Cell, not with a byte[] key |
| throw new UnsupportedOperationException("Cannot search for a plain byte " + |
| "array key. Only Cell-based keys can be searched for"); |
| } |
| |
| @Override |
| public int rootBlockContainingKey(Cell key) { |
| // The comparator must not be null here, since this is a root-level lookup |
| int pos = Bytes.binarySearch(blockKeys, key, comparator); |
| // pos is between -(blockKeys.length + 1) and blockKeys.length - 1, see |
| // binarySearch's javadoc. |
| |
| if (pos >= 0) { |
| // This means this is an exact match with an element of blockKeys. |
| assert pos < blockKeys.length; |
| return pos; |
| } |
| |
| // Otherwise, pos = -(i + 1), where blockKeys[i - 1] < key < blockKeys[i], |
| // and i is in [0, blockKeys.length]. We are returning j = i - 1 such that |
| // blockKeys[j] <= key < blockKeys[j + 1]. In particular, j = -1 if |
| // key < blockKeys[0], meaning the file does not contain the given key. |
| |
| int i = -pos - 1; |
| assert 0 <= i && i <= blockKeys.length; |
| return i - 1; |
| } |
| |
| @Override |
| public String toString() { |
| StringBuilder sb = new StringBuilder(); |
| sb.append("size=" + rootCount).append("\n"); |
| for (int i = 0; i < rootCount; i++) { |
| sb.append("key=").append((blockKeys[i])) |
| .append("\n offset=").append(blockOffsets[i]) |
| .append(", dataSize=" + blockDataSizes[i]).append("\n"); |
| } |
| return sb.toString(); |
| } |
| } |
| /** |
| * The reader will always hold the root level index in memory. Index |
| * blocks at all other levels will be cached in the LRU cache in practice, |
| * although this API does not enforce that. |
| * |
| * All non-root (leaf and intermediate) index blocks contain what we call a |
| * "secondary index": an array of offsets to the entries within the block. |
| * This allows us to do binary search for the entry corresponding to the |
| * given key without having to deserialize the block. |
| */ |
| static abstract class BlockIndexReader implements HeapSize { |
| |
| protected long[] blockOffsets; |
| protected int[] blockDataSizes; |
| protected int rootCount = 0; |
| |
| // Mid-key metadata. |
| protected long midLeafBlockOffset = -1; |
| protected int midLeafBlockOnDiskSize = -1; |
| protected int midKeyEntry = -1; |
| |
| /** |
| * The number of levels in the block index tree. One if there is only the |
| * root level, two for root and leaf levels, etc. |
| */ |
| protected int searchTreeLevel; |
| |
| /** A way to read {@link HFile} blocks at a given offset */ |
| protected CachingBlockReader cachingBlockReader; |
| |
| /** |
| * @return true if the block index is empty. |
| */ |
| public abstract boolean isEmpty(); |
| |
| /** |
| * Verifies that the block index is non-empty and throws an |
| * {@link IllegalStateException} otherwise. |
| */ |
| public void ensureNonEmpty() { |
| if (isEmpty()) { |
| throw new IllegalStateException("Block index is empty or not loaded"); |
| } |
| } |
| |
| /** |
| * Return the data block which contains this key. This function will only |
| * be called when the HFile version is larger than 1. |
| * |
| * @param key the key we are looking for |
| * @param currentBlock the current block, to avoid re-reading the same block |
| * @param cacheBlocks |
| * @param pread |
| * @param isCompaction |
| * @param expectedDataBlockEncoding the data block encoding the caller is |
| * expecting the data block to be in, or null to not perform this |
| * check and return the block irrespective of the encoding |
| * @return the data block that contains the given key, or null if the key is not found |
| * @throws IOException |
| */ |
| public HFileBlock seekToDataBlock(final Cell key, HFileBlock currentBlock, boolean cacheBlocks, |
| boolean pread, boolean isCompaction, DataBlockEncoding expectedDataBlockEncoding) |
| throws IOException { |
| BlockWithScanInfo blockWithScanInfo = loadDataBlockWithScanInfo(key, currentBlock, |
| cacheBlocks, |
| pread, isCompaction, expectedDataBlockEncoding); |
| if (blockWithScanInfo == null) { |
| return null; |
| } else { |
| return blockWithScanInfo.getHFileBlock(); |
| } |
| } |
| |
| /** |
| * Return the BlockWithScanInfo which contains the DataBlock with other scan |
| * info such as nextIndexedKey. This function will only be called when the |
| * HFile version is larger than 1. |
| * |
| * @param key |
| * the key we are looking for |
| * @param currentBlock |
| * the current block, to avoid re-reading the same block |
| * @param cacheBlocks |
| * @param pread |
| * @param isCompaction |
| * @param expectedDataBlockEncoding the data block encoding the caller is |
| * expecting the data block to be in, or null to not perform this |
| * check and return the block irrespective of the encoding. |
| * @return the BlockWithScanInfo which contains the DataBlock with other |
| * scan info such as nextIndexedKey. |
| * @throws IOException |
| */ |
| public abstract BlockWithScanInfo loadDataBlockWithScanInfo(Cell key, HFileBlock currentBlock, |
| boolean cacheBlocks, |
| boolean pread, boolean isCompaction, DataBlockEncoding expectedDataBlockEncoding) |
| throws IOException; |
| |
| /** |
| * An approximation to the {@link HFile}'s mid-key. Operates on block |
| * boundaries, and does not go inside blocks. In other words, returns the |
| * first key of the middle block of the file. |
| * |
| * @return the first key of the middle block |
| */ |
| public abstract Cell midkey() throws IOException; |
| |
| /** |
| * @param i from 0 to {@link #getRootBlockCount()} - 1 |
| */ |
| public long getRootBlockOffset(int i) { |
| return blockOffsets[i]; |
| } |
| |
| /** |
| * @param i zero-based index of a root-level block |
| * @return the on-disk size of the root-level block for version 2, or the |
| * uncompressed size for version 1 |
| */ |
| public int getRootBlockDataSize(int i) { |
| return blockDataSizes[i]; |
| } |
| |
| /** |
| * @return the number of root-level blocks in this block index |
| */ |
| public int getRootBlockCount() { |
| return rootCount; |
| } |
| |
| /** |
| * Finds the root-level index block containing the given key. |
| * |
| * @param key |
| * Key to find |
| * @param comp |
| * the comparator to be used |
| * @return Index of the root-level block containing <code>key</code> |
| * (between 0 and the number of blocks - 1) or -1 if this file does |
| * not contain the request. |
| */ |
| // When we want to find the meta index block or bloom block for ROW bloom |
| // type, Bytes.BYTES_RAWCOMPARATOR would be enough. For the ROW_COL bloom case |
| // we need the CellComparator. |
| public abstract int rootBlockContainingKey(final byte[] key, int offset, int length, |
| CellComparator comp); |
| |
| /** |
| * Finds the root-level index block containing the given key. |
| * |
| * @param key |
| * Key to find |
| * @return Index of the root-level block containing <code>key</code> |
| * (between 0 and the number of blocks - 1) or -1 if this file does |
| * not contain the request. |
| */ |
| // When we want to find the meta index block or bloom block for ROW bloom |
| // type, Bytes.BYTES_RAWCOMPARATOR would be enough. For the ROW_COL bloom case |
| // we need the CellComparator. |
| public int rootBlockContainingKey(final byte[] key, int offset, int length) { |
| return rootBlockContainingKey(key, offset, length, null); |
| } |
| |
| /** |
| * Finds the root-level index block containing the given key. |
| * |
| * @param key |
| * Key to find |
| */ |
| public abstract int rootBlockContainingKey(final Cell key); |
| |
| /** |
| * The indexed key at the ith position in the nonRootIndex. The position starts at 0. |
| * @param nonRootIndex the non-root index block buffer, starting with the secondary index |
| * @param i the ith position |
| * @return the indexed key at the ith position, or null if i is out of range |
| */ |
| protected byte[] getNonRootIndexedKey(ByteBuff nonRootIndex, int i) { |
| int numEntries = nonRootIndex.getInt(0); |
| if (i < 0 || i >= numEntries) { |
| return null; |
| } |
| |
| // Entries start after the number of entries and the secondary index. |
| // The secondary index takes numEntries + 1 ints. |
| int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2); |
| // Targetkey's offset relative to the end of secondary index |
| int targetKeyRelOffset = nonRootIndex.getInt( |
| Bytes.SIZEOF_INT * (i + 1)); |
| |
| // The offset of the target key in the blockIndex buffer |
| int targetKeyOffset = entriesOffset // Skip secondary index |
| + targetKeyRelOffset // Skip all entries until mid |
| + SECONDARY_INDEX_ENTRY_OVERHEAD; // Skip offset and on-disk-size |
| |
| // We subtract the two consecutive secondary index elements, which |
| // gives us the size of the whole (offset, onDiskSize, key) tuple. We |
| // then need to subtract the overhead of offset and onDiskSize. |
| int targetKeyLength = nonRootIndex.getInt(Bytes.SIZEOF_INT * (i + 2)) - |
| targetKeyRelOffset - SECONDARY_INDEX_ENTRY_OVERHEAD; |
| |
| // TODO check whether we can make BB backed Cell here? So can avoid bytes copy. |
| return nonRootIndex.toBytes(targetKeyOffset, targetKeyLength); |
| } |
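| |
| // Worked example: a non-root block with numEntries = 2 and key lengths |
| // {5, 7} stores the secondary index {0, 17, 36} at byte offsets 4, 8 and 12, |
| // so entriesOffset = 4 * (2 + 2) = 16. For i = 1: targetKeyRelOffset = 17, |
| // targetKeyOffset = 16 + 17 + 12 = 45 and targetKeyLength = 36 - 17 - 12 = 7. |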
| |
| /** |
| * Performs a binary search over a non-root level index block. Utilizes the |
| * secondary index, which records the offsets of (offset, onDiskSize, |
| * firstKey) tuples of all entries. |
| * |
| * @param key |
| * the key we are searching for offsets to individual entries in |
| * the blockIndex buffer |
| * @param nonRootIndex |
| * the non-root index block buffer, starting with the secondary |
| * index. The position is ignored. |
| * @return the index i in [0, numEntries - 1] such that keys[i] <= key < |
| * keys[i + 1], if keys is the array of all keys being searched, or |
| * -1 otherwise |
| * @throws IOException |
| */ |
| static int binarySearchNonRootIndex(Cell key, ByteBuff nonRootIndex, |
| CellComparator comparator) { |
| |
| int numEntries = nonRootIndex.getIntAfterPosition(0); |
| int low = 0; |
| int high = numEntries - 1; |
| int mid = 0; |
| |
| // Entries start after the number of entries and the secondary index. |
| // The secondary index takes numEntries + 1 ints. |
| int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2); |
| |
| // If we imagine that keys[-1] = -Infinity and |
| // keys[numEntries] = Infinity, then we are maintaining an invariant that |
| // keys[low - 1] < key < keys[high + 1] while narrowing down the range. |
| ByteBufferedKeyOnlyKeyValue nonRootIndexkeyOnlyKV = new ByteBufferedKeyOnlyKeyValue(); |
| ObjectIntPair<ByteBuffer> pair = new ObjectIntPair<ByteBuffer>(); |
| while (low <= high) { |
| mid = (low + high) >>> 1; |
| |
| // Midkey's offset relative to the end of secondary index |
| int midKeyRelOffset = nonRootIndex.getIntAfterPosition(Bytes.SIZEOF_INT * (mid + 1)); |
| |
| // The offset of the middle key in the blockIndex buffer |
| int midKeyOffset = entriesOffset // Skip secondary index |
| + midKeyRelOffset // Skip all entries until mid |
| + SECONDARY_INDEX_ENTRY_OVERHEAD; // Skip offset and on-disk-size |
| |
| // We subtract the two consecutive secondary index elements, which |
| // gives us the size of the whole (offset, onDiskSize, key) tuple. We |
| // then need to subtract the overhead of offset and onDiskSize. |
| int midLength = nonRootIndex.getIntAfterPosition(Bytes.SIZEOF_INT * (mid + 2)) - |
| midKeyRelOffset - SECONDARY_INDEX_ENTRY_OVERHEAD; |
| |
| // we have to compare in this order, because the comparator order |
| // has special logic when the 'left side' is a special key. |
| // TODO make KeyOnlyKeyValue to be Buffer backed and avoid array() call. This has to be |
| // done after HBASE-12224 & HBASE-12282 |
| // TODO avoid array call. |
| nonRootIndex.asSubByteBuffer(midKeyOffset, midLength, pair); |
| nonRootIndexkeyOnlyKV.setKey(pair.getFirst(), pair.getSecond(), midLength); |
| int cmp = comparator.compareKeyIgnoresMvcc(key, nonRootIndexkeyOnlyKV); |
| |
| // key lives above the midpoint |
| if (cmp > 0) |
| low = mid + 1; // Maintain the invariant that keys[low - 1] < key |
| // key lives below the midpoint |
| else if (cmp < 0) |
| high = mid - 1; // Maintain the invariant that key < keys[high + 1] |
| else |
| return mid; // exact match |
| } |
| |
| // As per our invariant, keys[low - 1] < key < keys[high + 1], meaning |
| // that low - 1 < high + 1 and (low - high) <= 1. As per the loop break |
| // condition, low >= high + 1. Therefore, low = high + 1. |
| |
| if (low != high + 1) { |
| throw new IllegalStateException("Binary search broken: low=" + low |
| + " instead of " + (high + 1)); |
| } |
| |
| // OK, our invariant says that keys[low - 1] < key < keys[low]. We need to |
| // return i such that keys[i] <= key < keys[i + 1]. Therefore i = low - 1. |
| int i = low - 1; |
| |
| // Some extra validation on the result. |
| if (i < -1 || i >= numEntries) { |
| throw new IllegalStateException("Binary search broken: result is " + |
| i + " but expected to be between -1 and (numEntries - 1) = " + |
| (numEntries - 1)); |
| } |
| |
| return i; |
| } |
| |
| /** |
| * Search for one key using the secondary index in a non-root block. In case |
| * of success, positions the provided buffer at the entry of interest, where |
| * the file offset and the on-disk-size can be read. |
| * |
| * @param nonRootBlock |
| * a non-root block without header. Initial position does not |
| * matter. |
| * @param key |
| * the key (cell) to search for |
| * @return the index position where the given key was found, otherwise |
| * return -1 in the case the given key is before the first key. |
| * |
| */ |
| static int locateNonRootIndexEntry(ByteBuff nonRootBlock, Cell key, |
| CellComparator comparator) { |
| int entryIndex = binarySearchNonRootIndex(key, nonRootBlock, comparator); |
| |
| if (entryIndex != -1) { |
| int numEntries = nonRootBlock.getIntAfterPosition(0); |
| |
| // The end of secondary index and the beginning of entries themselves. |
| int entriesOffset = Bytes.SIZEOF_INT * (numEntries + 2); |
| |
| // The offset of the entry we are interested in relative to the end of |
| // the secondary index. |
| int entryRelOffset = nonRootBlock |
| .getIntAfterPosition(Bytes.SIZEOF_INT * (1 + entryIndex)); |
| |
| nonRootBlock.position(entriesOffset + entryRelOffset); |
| } |
| |
| return entryIndex; |
| } |
| |
| /** |
| * Read in the root-level index from the given input stream. Must match |
| * what was written into the root level by |
| * {@link BlockIndexWriter#writeIndexBlocks(FSDataOutputStream)} at the |
| * offset that function returned. |
| * |
| * @param in the buffered input stream or wrapped byte input stream |
| * @param numEntries the number of root-level index entries |
| * @throws IOException |
| */ |
| public void readRootIndex(DataInput in, final int numEntries) throws IOException { |
| blockOffsets = new long[numEntries]; |
| initialize(numEntries); |
| blockDataSizes = new int[numEntries]; |
| |
| // If index size is zero, no index was written. |
| if (numEntries > 0) { |
| for (int i = 0; i < numEntries; ++i) { |
| long offset = in.readLong(); |
| int dataSize = in.readInt(); |
| byte[] key = Bytes.readByteArray(in); |
| add(key, offset, dataSize); |
| } |
| } |
| } |
| |
| protected abstract void initialize(int numEntries); |
| |
| protected abstract void add(final byte[] key, final long offset, final int dataSize); |
| |
| /** |
| * Read in the root-level index from the given input stream. Must match |
| * what was written into the root level by |
| * {@link BlockIndexWriter#writeIndexBlocks(FSDataOutputStream)} at the |
| * offset that function returned. |
| * |
| * @param blk the HFile block |
| * @param numEntries the number of root-level index entries |
| * @return the input stream, positioned right after the root-level index |
| * @throws IOException |
| */ |
| public DataInputStream readRootIndex(HFileBlock blk, final int numEntries) throws IOException { |
| DataInputStream in = blk.getByteStream(); |
| readRootIndex(in, numEntries); |
| return in; |
| } |
| |
| /** |
| * Read the root-level metadata of a multi-level block index. Based on |
| * {@link #readRootIndex(DataInput, int)}, but also reads metadata |
| * necessary to compute the mid-key in a multi-level index. |
| * |
| * @param blk the HFile block |
| * @param numEntries the number of root-level index entries |
| * @throws IOException |
| */ |
| public void readMultiLevelIndexRoot(HFileBlock blk, |
| final int numEntries) throws IOException { |
| DataInputStream in = readRootIndex(blk, numEntries); |
| // After reading the root index, subtract the trailing checksum bytes |
| // from what remains to determine whether mid-key metadata is present. |
| int checkSumBytes = blk.totalChecksumBytes(); |
| if ((in.available() - checkSumBytes) < MID_KEY_METADATA_SIZE) { |
| // No mid-key metadata available. |
| return; |
| } |
| midLeafBlockOffset = in.readLong(); |
| midLeafBlockOnDiskSize = in.readInt(); |
| midKeyEntry = in.readInt(); |
| } |
| |
| @Override |
| public long heapSize() { |
| // Object header, the three references (blockOffsets, blockDataSizes, |
| // cachingBlockReader) and the two ints (rootCount, searchTreeLevel). |
| // Subclass fields are accounted for in calculateHeapSizeForBlockKeys. |
| long heapSize = ClassSize.align(3 * ClassSize.REFERENCE + |
| 2 * Bytes.SIZEOF_INT + ClassSize.OBJECT); |
| |
| // Mid-key metadata. |
| heapSize += MID_KEY_METADATA_SIZE; |
| |
| heapSize = calculateHeapSizeForBlockKeys(heapSize); |
| |
| if (blockOffsets != null) { |
| heapSize += ClassSize.align(ClassSize.ARRAY + blockOffsets.length |
| * Bytes.SIZEOF_LONG); |
| } |
| |
| if (blockDataSizes != null) { |
| heapSize += ClassSize.align(ClassSize.ARRAY + blockDataSizes.length |
| * Bytes.SIZEOF_INT); |
| } |
| |
| return ClassSize.align(heapSize); |
| } |
| |
| protected abstract long calculateHeapSizeForBlockKeys(long heapSize); |
| } |
| |
| /** |
| * Writes the block index into the output stream. Generate the tree from |
| * bottom up. The leaf level is written to disk as a sequence of inline |
| * blocks, if it is larger than a certain number of bytes. If the leaf level |
| * is not large enough, we write all entries to the root level instead. |
| * |
| * After all leaf blocks have been written, we end up with an index |
| * referencing the resulting leaf index blocks. If that index is larger than |
| * the allowed root index size, the writer will break it up into |
| * reasonable-size intermediate-level index block chunks, write those chunks |
| * out, and create another index referencing those chunks. This will be |
| * repeated until the remaining index is small enough to become the root |
| * index. However, in most practical cases we will only have leaf-level |
| * blocks and the root index, or just the root index. |
| */ |
| public static class BlockIndexWriter implements InlineBlockWriter { |
| /** |
| * While the index is being written, this represents the current block |
| * index referencing all leaf blocks, with one exception. If the file is |
| * being closed and there are not enough blocks to complete even a single |
| * leaf block, no leaf blocks get written and this contains the entire |
| * block index. After all levels of the index have been written by |
| * {@link #writeIndexBlocks(FSDataOutputStream)}, this contains the final |
| * root-level index. |
| */ |
| private BlockIndexChunk rootChunk = new BlockIndexChunk(); |
| |
| /** |
| * Current leaf-level chunk. New entries referencing data blocks get added |
| * to this chunk until it grows large enough to be written to disk. |
| */ |
| private BlockIndexChunk curInlineChunk = new BlockIndexChunk(); |
| |
| /** |
| * The number of block index levels. This is one if there is only the root |
| * level (even empty), two if there are a leaf level and a root level, and |
| * higher if there are intermediate levels. This is only final after |
| * {@link #writeIndexBlocks(FSDataOutputStream)} has been called. The |
| * initial value accounts for the root level, and will be increased to two |
| * as soon as we find out there is a leaf-level in |
| * {@link #blockWritten(long, int, int)}. |
| */ |
| private int numLevels = 1; |
| |
| private HFileBlock.Writer blockWriter; |
| private byte[] firstKey = null; |
| |
| /** |
| * The total number of leaf-level entries, i.e. entries referenced by |
| * leaf-level blocks. For the data block index this is equal to the number |
| * of data blocks. |
| */ |
| private long totalNumEntries; |
| |
| /** Total compressed size of all index blocks. */ |
| private long totalBlockOnDiskSize; |
| |
| /** Total uncompressed size of all index blocks. */ |
| private long totalBlockUncompressedSize; |
| |
| /** The maximum size guideline of all multi-level index blocks. */ |
| private int maxChunkSize; |
| |
| /** The minimum number of entries in a single index block */ |
| private int minIndexNumEntries; |
| |
| /** Whether we require this block index to always be single-level. */ |
| private boolean singleLevelOnly; |
| |
| /** CacheConfig, or null if cache-on-write is disabled */ |
| private CacheConfig cacheConf; |
| |
| /** Name to use for computing cache keys */ |
| private String nameForCaching; |
| |
| /** Creates a single-level block index writer */ |
| public BlockIndexWriter() { |
| this(null, null, null); |
| singleLevelOnly = true; |
| } |
| |
| /** |
| * Creates a multi-level block index writer. |
| * |
| * @param blockWriter the block writer to use to write index blocks |
| * @param cacheConf used to determine when and how a block should be cached-on-write. |
| */ |
| public BlockIndexWriter(HFileBlock.Writer blockWriter, |
| CacheConfig cacheConf, String nameForCaching) { |
| if ((cacheConf == null) != (nameForCaching == null)) { |
| throw new IllegalArgumentException("Block cache and file name for " + |
| "caching must be both specified or both null"); |
| } |
| |
| this.blockWriter = blockWriter; |
| this.cacheConf = cacheConf; |
| this.nameForCaching = nameForCaching; |
| this.maxChunkSize = HFileBlockIndex.DEFAULT_MAX_CHUNK_SIZE; |
| this.minIndexNumEntries = HFileBlockIndex.DEFAULT_MIN_INDEX_NUM_ENTRIES; |
| } |
| |
| public void setMaxChunkSize(int maxChunkSize) { |
| if (maxChunkSize <= 0) { |
| throw new IllegalArgumentException("Invalid maximum index block size"); |
| } |
| this.maxChunkSize = maxChunkSize; |
| } |
| |
| public void setMinIndexNumEntries(int minIndexNumEntries) { |
| if (minIndexNumEntries <= 1) { |
| throw new IllegalArgumentException("Invalid minimum index block entries, should be >= 2"); |
| } |
| this.minIndexNumEntries = minIndexNumEntries; |
| } |
| |
| /** |
| * Writes the root level and intermediate levels of the block index into |
| * the output stream, generating the tree from bottom up. Assumes that the |
| * leaf level has been inline-written to the disk if there is enough data |
| * for more than one leaf block. We iterate by breaking the current level |
| * of the block index, starting with the index of all leaf-level blocks, |
| * into chunks small enough to be written to disk, and generating its parent |
| * level, until we end up with a level small enough to become the root |
| * level. |
| * |
| * If the leaf level is not large enough, there is no inline block index |
| * anymore, so we only write that level of block index to disk as the root |
| * level. |
| * |
| * @param out FSDataOutputStream |
| * @return the offset at which the root-level index block starts in the output stream |
| * @throws IOException |
| */ |
| public long writeIndexBlocks(FSDataOutputStream out) throws IOException { |
| if (curInlineChunk != null && curInlineChunk.getNumEntries() != 0) { |
| throw new IOException("Trying to write a multi-level block index, " + |
| "but are " + curInlineChunk.getNumEntries() + " entries in the " + |
| "last inline chunk."); |
| } |
| |
| // We need to get mid-key metadata before we create intermediate |
| // indexes and overwrite the root chunk. |
| byte[] midKeyMetadata = numLevels > 1 ? rootChunk.getMidKeyMetadata() |
| : null; |
| |
| if (curInlineChunk != null) { |
| while (rootChunk.getRootSize() > maxChunkSize |
| // HBASE-16288: if firstKey is larger than maxChunkSize we will loop indefinitely |
| && rootChunk.getNumEntries() > minIndexNumEntries |
| // Sanity check. We will not hit this: with 16 levels, (minIndexNumEntries ^ 16) blocks can be addressed |
| && numLevels < 16) { |
| rootChunk = writeIntermediateLevel(out, rootChunk); |
| numLevels += 1; |
| } |
| } |
| |
| // write the root level |
| long rootLevelIndexPos = out.getPos(); |
| |
| { |
| DataOutput blockStream = |
| blockWriter.startWriting(BlockType.ROOT_INDEX); |
| rootChunk.writeRoot(blockStream); |
| if (midKeyMetadata != null) |
| blockStream.write(midKeyMetadata); |
| blockWriter.writeHeaderAndData(out); |
| if (cacheConf != null) { |
| HFileBlock blockForCaching = blockWriter.getBlockForCaching(cacheConf); |
| cacheConf.getBlockCache().cacheBlock(new BlockCacheKey(nameForCaching, |
| rootLevelIndexPos, true, blockForCaching.getBlockType()), blockForCaching); |
| } |
| } |
| |
| // Add root index block size |
| totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader(); |
| totalBlockUncompressedSize += |
| blockWriter.getUncompressedSizeWithoutHeader(); |
| |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("Wrote a " + numLevels + "-level index with root level at pos " |
| + rootLevelIndexPos + ", " + rootChunk.getNumEntries() |
| + " root-level entries, " + totalNumEntries + " total entries, " |
| + StringUtils.humanReadableInt(this.totalBlockOnDiskSize) + |
| " on-disk size, " |
| + StringUtils.humanReadableInt(totalBlockUncompressedSize) + |
| " total uncompressed size."); |
| } |
| return rootLevelIndexPos; |
| } |
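| |
| // A rough sketch of the write-side protocol (variable names are illustrative; |
| // see HFileWriterImpl for the real call sites). Each data block contributes |
| // one leaf-level entry; the inline-block machinery spills full leaf chunks, |
| // and writeIndexBlocks() finishes the tree when the file is closed: |
| // |
| // BlockIndexWriter iw = new BlockIndexWriter(hfileBlockWriter, cacheConf, name); |
| // for (/* each data block written */) { |
| // iw.addEntry(firstKeyOfBlock, blockOffset, blockOnDiskSize); |
| // // the HFile writer periodically asks iw.shouldWriteBlock(false) and, |
| // // if true, calls iw.writeInlineBlock(out) then iw.blockWritten(...) |
| // } |
| // long rootIndexOffset = iw.writeIndexBlocks(out); // recorded in the trailer |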
| |
| /** |
| * Writes the block index data as a single level only. Does not do any |
| * block framing. |
| * |
| * @param out the buffered output stream to write the index to. Typically a |
| * stream writing into an {@link HFile} block. |
| * @param description a short description of the index being written. Used |
| * in a log message. |
| * @throws IOException |
| */ |
| public void writeSingleLevelIndex(DataOutput out, String description) |
| throws IOException { |
| expectNumLevels(1); |
| |
| if (!singleLevelOnly) |
| throw new IOException("Single-level mode is turned off"); |
| |
| if (rootChunk.getNumEntries() > 0) |
| throw new IOException("Root-level entries already added in " + |
| "single-level mode"); |
| |
| rootChunk = curInlineChunk; |
| curInlineChunk = new BlockIndexChunk(); |
| |
| if (LOG.isTraceEnabled()) { |
| LOG.trace("Wrote a single-level " + description + " index with " |
| + rootChunk.getNumEntries() + " entries, " + rootChunk.getRootSize() |
| + " bytes"); |
| } |
| rootChunk.writeRoot(out); |
| } |
| |
| /** |
| * Split the current level of the block index into intermediate index |
| * blocks of permitted size and write those blocks to disk. Return the next |
| * level of the block index referencing those intermediate-level blocks. |
| * |
| * @param out |
| * @param currentLevel the current level of the block index, such as a |
| * chunk referencing all leaf-level index blocks |
| * @return the parent level block index, which becomes the root index after |
| * a few (usually zero) iterations |
| * @throws IOException |
| */ |
| private BlockIndexChunk writeIntermediateLevel(FSDataOutputStream out, |
| BlockIndexChunk currentLevel) throws IOException { |
| // Entries referencing intermediate-level blocks we are about to create. |
| BlockIndexChunk parent = new BlockIndexChunk(); |
| |
| // The current intermediate-level block index chunk. |
| BlockIndexChunk curChunk = new BlockIndexChunk(); |
| |
| for (int i = 0; i < currentLevel.getNumEntries(); ++i) { |
| curChunk.add(currentLevel.getBlockKey(i), |
| currentLevel.getBlockOffset(i), currentLevel.getOnDiskDataSize(i)); |
| |
| // HBASE-16288: We have to have at least minIndexNumEntries(16) items in the index so that |
| // we won't end up with too many levels for an index with very large rowKeys. Also, if the |
| // first key is larger than maxChunkSize this will cause infinite recursion. |
| if (i >= minIndexNumEntries && curChunk.getRootSize() >= maxChunkSize) { |
| writeIntermediateBlock(out, parent, curChunk); |
| } |
| } |
| |
| if (curChunk.getNumEntries() > 0) { |
| writeIntermediateBlock(out, parent, curChunk); |
| } |
| |
| return parent; |
| } |
| |
| private void writeIntermediateBlock(FSDataOutputStream out, |
| BlockIndexChunk parent, BlockIndexChunk curChunk) throws IOException { |
| long beginOffset = out.getPos(); |
| DataOutputStream dos = blockWriter.startWriting( |
| BlockType.INTERMEDIATE_INDEX); |
| curChunk.writeNonRoot(dos); |
| byte[] curFirstKey = curChunk.getBlockKey(0); |
| blockWriter.writeHeaderAndData(out); |
| |
| if (getCacheOnWrite()) { |
| HFileBlock blockForCaching = blockWriter.getBlockForCaching(cacheConf); |
| cacheConf.getBlockCache().cacheBlock(new BlockCacheKey(nameForCaching, |
| beginOffset, true, blockForCaching.getBlockType()), blockForCaching); |
| } |
| |
| // Add intermediate index block size |
| totalBlockOnDiskSize += blockWriter.getOnDiskSizeWithoutHeader(); |
| totalBlockUncompressedSize += |
| blockWriter.getUncompressedSizeWithoutHeader(); |
| |
| // OFFSET is the beginning offset of the chunk of block index entries. |
| // SIZE is the total byte size of the chunk of block index entries |
| // + the secondary index size |
| // FIRST_KEY is the first key in the chunk of block index |
| // entries. |
| parent.add(curFirstKey, beginOffset, |
| blockWriter.getOnDiskSizeWithHeader()); |
| |
| // clear current block index chunk |
| curChunk.clear(); |
| curFirstKey = null; |
| } |
| |
| /** |
| * @return how many block index entries there are in the root level |
| */ |
| public final int getNumRootEntries() { |
| return rootChunk.getNumEntries(); |
| } |
| |
| /** |
| * @return the number of levels in this block index. |
| */ |
| public int getNumLevels() { |
| return numLevels; |
| } |
| |
| private void expectNumLevels(int expectedNumLevels) { |
| if (numLevels != expectedNumLevels) { |
| throw new IllegalStateException("Number of block index levels is " |
| + numLevels + " but is expected to be " + expectedNumLevels); |
| } |
| } |
| |
| /** |
| * Whether there is an inline block ready to be written. In general, we |
| * write a leaf-level index block as an inline block as soon as its size |
| * as serialized in the non-root format reaches a certain threshold. |
| */ |
| @Override |
| public boolean shouldWriteBlock(boolean closing) { |
| if (singleLevelOnly) { |
| throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED); |
| } |
| |
| if (curInlineChunk == null) { |
| throw new IllegalStateException("curInlineChunk is null; has shouldWriteBlock been " + |
| "called with closing=true and then called again?"); |
| } |
| |
| if (curInlineChunk.getNumEntries() == 0) { |
| return false; |
| } |
| |
| // We do have some entries in the current inline chunk. |
| if (closing) { |
| if (rootChunk.getNumEntries() == 0) { |
| // We did not add any leaf-level blocks yet. Instead of creating a |
| // leaf level with one block, move these entries to the root level. |
| |
| expectNumLevels(1); |
| rootChunk = curInlineChunk; |
| curInlineChunk = null; // Disallow adding any more index entries. |
| return false; |
| } |
| |
| return true; |
| } else { |
| return curInlineChunk.getNonRootSize() >= maxChunkSize; |
| } |
| } |
| |
| /** |
| * Write out the current inline index block. Inline blocks are non-root |
| * blocks, so the non-root index format is used. |
| * |
| * @param out |
| */ |
| @Override |
| public void writeInlineBlock(DataOutput out) throws IOException { |
| if (singleLevelOnly) |
| throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED); |
| |
| // Write the inline block index to the output stream in the non-root |
| // index block format. |
| curInlineChunk.writeNonRoot(out); |
| |
| // Save the first key of the inline block so that we can add it to the |
| // parent-level index. |
| firstKey = curInlineChunk.getBlockKey(0); |
| |
| // Start a new inline index block |
| curInlineChunk.clear(); |
| } |
| |
| /** |
| * Called after an inline block has been written so that we can add an |
| * entry referring to that block to the parent-level index. |
| */ |
| @Override |
| public void blockWritten(long offset, int onDiskSize, int uncompressedSize) { |
| // Add leaf index block size |
| totalBlockOnDiskSize += onDiskSize; |
| totalBlockUncompressedSize += uncompressedSize; |
| |
| if (singleLevelOnly) |
| throw new UnsupportedOperationException(INLINE_BLOCKS_NOT_ALLOWED); |
| |
| if (firstKey == null) { |
| throw new IllegalStateException("Trying to add second-level index " + |
| "entry with offset=" + offset + " and onDiskSize=" + onDiskSize + |
| "but the first key was not set in writeInlineBlock"); |
| } |
| |
| if (rootChunk.getNumEntries() == 0) { |
| // We are writing the first leaf block, so increase index level. |
| expectNumLevels(1); |
| numLevels = 2; |
| } |
| |
| // Add another entry to the second-level index. Include the number of |
| // entries in all previous leaf-level chunks for mid-key calculation. |
| rootChunk.add(firstKey, offset, onDiskSize, totalNumEntries); |
| firstKey = null; |
| } |
| |
| @Override |
| public BlockType getInlineBlockType() { |
| return BlockType.LEAF_INDEX; |
| } |
| |
| /** |
| * Add one index entry to the current leaf-level block. When the leaf-level |
| * block gets large enough, it will be flushed to disk as an inline block. |
| * |
| * @param firstKey the first key of the data block |
| * @param blockOffset the offset of the data block |
| * @param blockDataSize the on-disk size of the data block ({@link HFile} |
| * format version 2), or the uncompressed size of the data block ( |
| * {@link HFile} format version 1). |
| */ |
| public void addEntry(byte[] firstKey, long blockOffset, int blockDataSize) { |
| curInlineChunk.add(firstKey, blockOffset, blockDataSize); |
| ++totalNumEntries; |
| } |
| |
| /** |
| * @throws IOException if we happened to write a multi-level index. |
| */ |
| public void ensureSingleLevel() throws IOException { |
| if (numLevels > 1) { |
| throw new IOException ("Wrote a " + numLevels + "-level index with " + |
| rootChunk.getNumEntries() + " root-level entries, but " + |
| "this is expected to be a single-level block index."); |
| } |
| } |
| |
| /** |
| * @return true if we are using cache-on-write. This is configured by the |
| * caller of the constructor by either passing a valid block cache |
| * or null. |
| */ |
| @Override |
| public boolean getCacheOnWrite() { |
| return cacheConf != null && cacheConf.shouldCacheIndexesOnWrite(); |
| } |
| |
| /** |
| * The total uncompressed size of the root index block, intermediate-level |
| * index blocks, and leaf-level index blocks. |
| * |
| * @return the total uncompressed size of all index blocks |
| */ |
| public long getTotalUncompressedSize() { |
| return totalBlockUncompressedSize; |
| } |
| |
| } |
| |
| /** |
| * A single chunk of the block index in the process of writing. The data in |
| * this chunk can become a leaf-level, intermediate-level, or root index |
| * block. |
| */ |
| static class BlockIndexChunk { |
| |
| /** First keys of the key range corresponding to each index entry. */ |
| private final List<byte[]> blockKeys = new ArrayList<byte[]>(); |
| |
| /** Block offset in backing stream. */ |
| private final List<Long> blockOffsets = new ArrayList<Long>(); |
| |
| /** On-disk data sizes of lower-level data or index blocks. */ |
| private final List<Integer> onDiskDataSizes = new ArrayList<Integer>(); |
| |
| /** |
| * The cumulative number of sub-entries, i.e. entries referenced by |
| * deeper-level blocks. numSubEntriesAt[i] is the number of sub-entries in the |
| * blocks corresponding to this chunk's entries #0 through #i inclusively. |
| */ |
| private final List<Long> numSubEntriesAt = new ArrayList<Long>(); |
| |
| /** |
| * The offset of the next entry to be added, relative to the end of the |
| * "secondary index" in the "non-root" format representation of this index |
| * chunk. This is the next value to be added to the secondary index. |
| */ |
| private int curTotalNonRootEntrySize = 0; |
| |
| /** |
| * The accumulated size of this chunk if stored in the root index format. |
| */ |
| private int curTotalRootSize = 0; |
| |
| /** |
| * The "secondary index" used for binary search over variable-length |
| * records in a "non-root" format block. These offsets are relative to the |
| * end of this secondary index. |
| */ |
| private final List<Integer> secondaryIndexOffsetMarks = |
| new ArrayList<Integer>(); |
| |
| /** |
| * Adds a new entry to this block index chunk. |
| * |
| * @param firstKey the first key in the block pointed to by this entry |
| * @param blockOffset the offset of the next-level block pointed to by this |
| * entry |
| * @param onDiskDataSize the on-disk data size of the block pointed to by this |
| * entry, including header size |
| * @param curTotalNumSubEntries if this chunk is the root index chunk under |
| * construction, this specifies the current total number of |
| * sub-entries in all leaf-level chunks, including the one |
| * corresponding to the second-level entry being added. |
| */ |
| void add(byte[] firstKey, long blockOffset, int onDiskDataSize, |
| long curTotalNumSubEntries) { |
| // Record the offset for the secondary index |
| secondaryIndexOffsetMarks.add(curTotalNonRootEntrySize); |
| curTotalNonRootEntrySize += SECONDARY_INDEX_ENTRY_OVERHEAD |
| + firstKey.length; |
| |
| curTotalRootSize += Bytes.SIZEOF_LONG + Bytes.SIZEOF_INT |
| + WritableUtils.getVIntSize(firstKey.length) + firstKey.length; |
| |
| blockKeys.add(firstKey); |
| blockOffsets.add(blockOffset); |
| onDiskDataSizes.add(onDiskDataSize); |
| |
| if (curTotalNumSubEntries != -1) { |
| numSubEntriesAt.add(curTotalNumSubEntries); |
| |
| // Make sure the parallel arrays are in sync. |
| if (numSubEntriesAt.size() != blockKeys.size()) { |
| throw new IllegalStateException("Only have key/value count " + |
| "stats for " + numSubEntriesAt.size() + " block index " + |
| "entries out of " + blockKeys.size()); |
| } |
| } |
| } |
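| |
| // Size accounting example: adding an entry with a 5-byte firstKey grows |
| // curTotalNonRootEntrySize by SECONDARY_INDEX_ENTRY_OVERHEAD + 5 = 17 bytes |
| // (offset + on-disk size + key) and curTotalRootSize by 8 + 4 + 1 + 5 = 18 |
| // bytes (long offset, int size, one vint byte for the key length, key bytes). |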
| |
| /** |
| * The same as {@link #add(byte[], long, int, long)} but does not take the |
| * number of sub-entries into account. Used for single-level indexes. |
| * |
| * @see #add(byte[], long, int, long) |
| */ |
| public void add(byte[] firstKey, long blockOffset, int onDiskDataSize) { |
| add(firstKey, blockOffset, onDiskDataSize, -1); |
| } |
| |
| public void clear() { |
| blockKeys.clear(); |
| blockOffsets.clear(); |
| onDiskDataSizes.clear(); |
| secondaryIndexOffsetMarks.clear(); |
| numSubEntriesAt.clear(); |
| curTotalNonRootEntrySize = 0; |
| curTotalRootSize = 0; |
| } |
| |
| /** |
| * Finds the entry corresponding to the deeper-level index block containing |
| * the given deeper-level entry (a "sub-entry"), assuming a global 0-based |
| * ordering of sub-entries. |
| * |
| * <p> |
| * <i> Implementation note. </i> We are looking for i such that |
| * numSubEntriesAt[i - 1] <= k < numSubEntriesAt[i], because a deeper-level |
| * block #i (0-based) contains sub-entries # numSubEntriesAt[i - 1]'th |
| * through numSubEntriesAt[i] - 1, assuming a global 0-based ordering of |
| * sub-entries. i is by definition the insertion point of k in |
| * numSubEntriesAt. |
| * |
| * @param k sub-entry index, from 0 to the total number of sub-entries - 1 |
| * @return the 0-based index of the entry corresponding to the given |
| * sub-entry |
| */ |
| public int getEntryBySubEntry(long k) { |
| // We define mid-key as the key corresponding to k'th sub-entry |
| // (0-based). |
| |
| int i = Collections.binarySearch(numSubEntriesAt, k); |
| |
| // Exact match: cumulativeWeight[i] = k. This means chunks #0 through |
| // #i contain exactly k sub-entries, and the sub-entry #k (0-based) |
| // is in the (i + 1)'th chunk. |
| if (i >= 0) |
| return i + 1; |
| |
| // Inexact match. Return the insertion point. |
| return -i - 1; |
| } |
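| |
| // Worked example: with numSubEntriesAt = {3, 7, 10}, sub-entry k = 4 lives in |
| // chunk #1 (which holds sub-entries 3..6): binarySearch returns -2, and |
| // -(-2) - 1 = 1. For k = 7 the search matches index 1 exactly, so we return |
| // 1 + 1 = 2: chunks #0..#1 hold exactly 7 sub-entries, and #7 opens chunk #2. |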
| |
| /** |
| * Used when writing the root block index of a multi-level block index. |
| * Serializes additional information that allows the mid-key to be |
| * identified efficiently. |
| * |
| * @return a few serialized fields for finding the mid-key |
| * @throws IOException if could not create metadata for computing mid-key |
| */ |
| public byte[] getMidKeyMetadata() throws IOException { |
| ByteArrayOutputStream baos = new ByteArrayOutputStream( |
| MID_KEY_METADATA_SIZE); |
| DataOutputStream baosDos = new DataOutputStream(baos); |
| long totalNumSubEntries = numSubEntriesAt.get(blockKeys.size() - 1); |
| if (totalNumSubEntries == 0) { |
| throw new IOException("No leaf-level entries, mid-key unavailable"); |
| } |
| long midKeySubEntry = (totalNumSubEntries - 1) / 2; |
| int midKeyEntry = getEntryBySubEntry(midKeySubEntry); |
| |
| baosDos.writeLong(blockOffsets.get(midKeyEntry)); |
| baosDos.writeInt(onDiskDataSizes.get(midKeyEntry)); |
| |
| long numSubEntriesBefore = midKeyEntry > 0 |
| ? numSubEntriesAt.get(midKeyEntry - 1) : 0; |
| long subEntryWithinEntry = midKeySubEntry - numSubEntriesBefore; |
| if (subEntryWithinEntry < 0 || subEntryWithinEntry > Integer.MAX_VALUE) |
| { |
| throw new IOException("Could not identify mid-key index within the " |
| + "leaf-level block containing mid-key: out of range (" |
| + subEntryWithinEntry + ", numSubEntriesBefore=" |
| + numSubEntriesBefore + ", midKeySubEntry=" + midKeySubEntry |
| + ")"); |
| } |
| |
| baosDos.writeInt((int) subEntryWithinEntry); |
| |
| if (baosDos.size() != MID_KEY_METADATA_SIZE) { |
| throw new IOException("Could not write mid-key metadata: size=" + |
| baosDos.size() + ", correct size: " + MID_KEY_METADATA_SIZE); |
| } |
| |
| // Close just to be good citizens, although this has no effect. |
| baos.close(); |
| |
| return baos.toByteArray(); |
| } |
| |
| /** |
| * Writes the block index chunk in the non-root index block format. This |
| * format contains the number of entries, an index of integer offsets |
| * for quick binary search on variable-length records, and tuples of |
| * block offset, on-disk block size, and the first key for each entry. |
| * |
| * @param out |
| * @throws IOException |
| */ |
| void writeNonRoot(DataOutput out) throws IOException { |
| // The number of entries in the block. |
| out.writeInt(blockKeys.size()); |
| |
| if (secondaryIndexOffsetMarks.size() != blockKeys.size()) { |
| throw new IOException("Corrupted block index chunk writer: " + |
| blockKeys.size() + " entries but " + |
| secondaryIndexOffsetMarks.size() + " secondary index items"); |
| } |
| |
| // For each entry, write a "secondary index" of relative offsets to the |
| // entries from the end of the secondary index. This works, because at |
| // read time we read the number of entries and know where the secondary |
| // index ends. |
| for (int currentSecondaryIndex : secondaryIndexOffsetMarks) |
| out.writeInt(currentSecondaryIndex); |
| |
| // We include one other element in the secondary index to calculate the |
| // size of each entry more easily by subtracting secondary index elements. |
| out.writeInt(curTotalNonRootEntrySize); |
| |
| for (int i = 0; i < blockKeys.size(); ++i) { |
| out.writeLong(blockOffsets.get(i)); |
| out.writeInt(onDiskDataSizes.get(i)); |
| out.write(blockKeys.get(i)); |
| } |
| } |
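| |
| // Resulting non-root block layout (sizes in bytes): |
| // int numEntries = N (4) |
| // int secIdx[0 .. N] N + 1 relative entry offsets (4 each) |
| // N entries, each: long blockOffset (8), int onDiskSize (4), raw key bytes |
| // Entry i therefore starts at 4 * (N + 2) + secIdx[i] from the block start, |
| // which is exactly the entriesOffset arithmetic used by the readers above. |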
| |
| /** |
| * @return the size of this chunk if stored in the non-root index block |
| * format |
| */ |
| int getNonRootSize() { |
| return Bytes.SIZEOF_INT // Number of entries |
| + Bytes.SIZEOF_INT * (blockKeys.size() + 1) // Secondary index |
| + curTotalNonRootEntrySize; // All entries |
| } |
| |
| /** |
| * Writes this chunk into the given output stream in the root block index |
| * format. This format is similar to the {@link HFile} version 1 block |
| * index format, except that we store on-disk size of the block instead of |
| * its uncompressed size. |
| * |
| * @param out the data output stream to write the block index to. Typically |
| * a stream writing into an {@link HFile} block. |
| * @throws IOException |
| */ |
| void writeRoot(DataOutput out) throws IOException { |
| for (int i = 0; i < blockKeys.size(); ++i) { |
| out.writeLong(blockOffsets.get(i)); |
| out.writeInt(onDiskDataSizes.get(i)); |
| Bytes.writeByteArray(out, blockKeys.get(i)); |
| } |
| } |
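| |
| // Root format, per entry: long blockOffset (8), int onDiskDataSize (4), then |
| // the key as written by Bytes.writeByteArray, i.e. a vint length followed by |
| // the raw key bytes. No entry count is serialized here; the reader is handed |
| // numEntries separately (see readRootIndex(DataInput, int)). |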
| |
| /** |
| * @return the size of this chunk if stored in the root index block format |
| */ |
| int getRootSize() { |
| return curTotalRootSize; |
| } |
| |
| /** |
| * @return the number of entries in this block index chunk |
| */ |
| public int getNumEntries() { |
| return blockKeys.size(); |
| } |
| |
| public byte[] getBlockKey(int i) { |
| return blockKeys.get(i); |
| } |
| |
| public long getBlockOffset(int i) { |
| return blockOffsets.get(i); |
| } |
| |
| public int getOnDiskDataSize(int i) { |
| return onDiskDataSizes.get(i); |
| } |
| |
| public long getCumulativeNumKV(int i) { |
| if (i < 0) |
| return 0; |
| return numSubEntriesAt.get(i); |
| } |
| |
| } |
| |
| public static int getMaxChunkSize(Configuration conf) { |
| return conf.getInt(MAX_CHUNK_SIZE_KEY, DEFAULT_MAX_CHUNK_SIZE); |
| } |
| |
| public static int getMinIndexNumEntries(Configuration conf) { |
| return conf.getInt(MIN_INDEX_NUM_ENTRIES_KEY, DEFAULT_MIN_INDEX_NUM_ENTRIES); |
| } |
| } |