| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.pig.piggybank.storage; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.Arrays; |
| import java.util.Comparator; |
| import java.util.Iterator; |
| import java.util.List; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.FSDataInputStream; |
| import org.apache.hadoop.fs.FSDataOutputStream; |
| import org.apache.hadoop.fs.FileStatus; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.Path; |
| import org.apache.hadoop.fs.PositionedReadable; |
| import org.apache.hadoop.fs.Seekable; |
| import org.apache.hadoop.io.LongWritable; |
| import org.apache.hadoop.io.Text; |
| import org.apache.hadoop.io.WritableComparable; |
| import org.apache.hadoop.mapreduce.InputFormat; |
| import org.apache.hadoop.mapreduce.InputSplit; |
| import org.apache.hadoop.mapreduce.JobContext; |
| import org.apache.hadoop.mapreduce.OutputFormat; |
| import org.apache.hadoop.mapreduce.RecordReader; |
| import org.apache.hadoop.mapreduce.RecordWriter; |
| import org.apache.hadoop.mapreduce.TaskAttemptContext; |
| import org.apache.hadoop.mapreduce.TaskAttemptID; |
| import org.apache.hadoop.mapreduce.lib.input.FileSplit; |
| import org.apache.pig.IndexableLoadFunc; |
| import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration; |
| import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextInputFormat; |
| import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigTextOutputFormat; |
| import org.apache.pig.backend.hadoop.executionengine.shims.HadoopShims; |
| import org.apache.pig.piggybank.storage.IndexedStorage.IndexedStorageInputFormat.IndexedStorageRecordReader; |
| import org.apache.pig.piggybank.storage.IndexedStorage.IndexedStorageInputFormat.IndexedStorageRecordReader.IndexedStorageRecordReaderComparator; |
| import org.apache.pig.builtin.PigStorage; |
| import org.apache.pig.data.DataReaderWriter; |
| import org.apache.pig.data.Tuple; |
| import org.apache.pig.data.TupleFactory; |
| import org.apache.pig.impl.util.StorageUtil; |
| import org.apache.pig.data.DataType; |
| import org.apache.pig.data.DataByteArray; |
| import org.apache.pig.backend.hadoop.executionengine.shims.HadoopShims; |
| |
| /** |
| * <code>IndexedStorage</code> is a form of <code>PigStorage</code> that supports a |
| * per record seek. <code>IndexedStorage</code> creates a separate (hidden) index file for |
| * every data file that is written. The format of the index file is: |
| * <pre> |
| * | Header | |
| * | Index Body | |
| * | Footer | |
| * </pre> |
| * The Header contains the list of record indices (field numbers) that represent index keys. |
| * The Index Body contains a <code>Tuple</code> for each record in the data. |
| * The fields of the <code>Tuple</code> are: |
| * <ul> |
| * <li> The index key(s) <code>Tuple</code> </li> |
| * <li> The number of records that share this index key. </li> |
| * <li> Offset into the data file to read the first matching record. </li> |
| * </ul> |
| * The Footer contains sequentially: |
| * <ul> |
| * <li> The smallest key(s) <code>Tuple</code> in the index. </li> |
| * <li> The largest key(s) <code>Tuple</code> in the index. </li> |
| * <li> The offset in bytes to the start of the footer </li> |
| * </ul> |
| * |
| * <code>IndexStorage</code> implements <code>IndexableLoadFunc</code> and |
| * can be used as the 'right table' in a PIG 'merge' or 'merge-sparse' join. |
| * |
| * <code>IndexStorage</code> does not require the data to be globally partitioned & sorted |
| * by index keys. Each partition (separate index) must be locally sorted. |
| * |
| * Also note IndexStorage is a loader to demonstrate "merge-sparse" join. |
| */ |
| public class IndexedStorage extends PigStorage implements IndexableLoadFunc { |
| |
| /** |
| * Constructs a Pig Storer that uses specified regex as a field delimiter. |
| * @param delimiter - field delimiter to use |
| * @param offsetsToIndexKeys - list of offset into Tuple for index keys (comma separated) |
| */ |
| public IndexedStorage(String delimiter, String offsetsToIndexKeys) { |
| super(delimiter); |
| |
| this.fieldDelimiter = StorageUtil.parseFieldDel(delimiter); |
| |
| String[] stroffsetsToIndexKeys = offsetsToIndexKeys.split(","); |
| this.offsetsToIndexKeys = new int[stroffsetsToIndexKeys.length]; |
| for (int i = 0; i < stroffsetsToIndexKeys.length; ++i) { |
| this.offsetsToIndexKeys[i] = Integer.parseInt(stroffsetsToIndexKeys[i]); |
| } |
| } |
| |
| @Override |
| public OutputFormat getOutputFormat() { |
| return new IndexedStorageOutputFormat(fieldDelimiter, offsetsToIndexKeys); |
| } |
| |
| /** |
| * Assumes this list of readers is already sorted except for the provided element. |
| * This element is bubbled up the array to its appropriate sort location |
| * (faster than doing a Utils sort). |
| */ |
| private void sortReader(int startIndex) { |
| int idx = startIndex; |
| while (idx < this.readers.length - 1) { |
| IndexedStorageRecordReader reader1 = this.readers[idx]; |
| IndexedStorageRecordReader reader2 = this.readers[idx+1]; |
| if (this.readerComparator.compare(reader1, reader2) <= 0) { |
| return; |
| } |
| this.readers[idx] = reader2; |
| this.readers[idx+1] = reader1; |
| idx++; |
| } |
| } |
| |
| /** |
| * Internal OutputFormat class |
| */ |
| public static class IndexedStorageOutputFormat extends PigTextOutputFormat { |
| |
| public IndexedStorageOutputFormat(byte delimiter, int[] offsetsToIndexKeys) { |
| /* Call the base class constructor */ |
| super(delimiter); |
| |
| this.fieldDelimiter = delimiter; |
| this.offsetsToIndexKeys = offsetsToIndexKeys; |
| } |
| |
| @Override |
| public RecordWriter<WritableComparable, Tuple> getRecordWriter( |
| TaskAttemptContext context) throws IOException, |
| InterruptedException { |
| |
| Configuration conf = context.getConfiguration(); |
| |
| FileSystem fs = FileSystem.get(conf); |
| Path file = this.getDefaultWorkFile(context, ""); |
| FSDataOutputStream fileOut = fs.create(file, false); |
| |
| IndexManager indexManager = new IndexManager(offsetsToIndexKeys); |
| indexManager.createIndexFile(fs, file); |
| return new IndexedStorageRecordWriter(fileOut, this.fieldDelimiter, indexManager); |
| } |
| |
| /** |
| * Internal class to do the actual record writing and index generation |
| * |
| */ |
| public static class IndexedStorageRecordWriter extends PigLineRecordWriter { |
| |
| public IndexedStorageRecordWriter(FSDataOutputStream fileOut, byte fieldDel, IndexManager indexManager) throws IOException { |
| super(fileOut, fieldDel); |
| |
| this.fileOut = fileOut; |
| this.indexManager = indexManager; |
| |
| /* Write the index header first */ |
| this.indexManager.WriteIndexHeader(); |
| } |
| |
| @Override |
| public void write(WritableComparable key, Tuple value) throws IOException { |
| /* Write the data */ |
| long offset = this.fileOut.getPos(); |
| super.write(key, value); |
| |
| /* Build index */ |
| this.indexManager.BuildIndex(value, offset); |
| } |
| |
| @Override |
| public void close(TaskAttemptContext context) |
| throws IOException { |
| this.indexManager.WriterIndexFooter(); |
| this.indexManager.Close(); |
| super.close(context); |
| } |
| |
| /** |
| * Output stream for data |
| */ |
| private FSDataOutputStream fileOut; |
| |
| /** |
| * Index builder |
| */ |
| private IndexManager indexManager = null; |
| } |
| |
| /** |
| * Delimiter to use between fields |
| */ |
| final private byte fieldDelimiter; |
| |
| /** |
| * Offsets to index keys in given tuple |
| */ |
| final protected int[] offsetsToIndexKeys; |
| } |
| |
| @Override |
| public InputFormat getInputFormat() { |
| return new IndexedStorageInputFormat(); |
| } |
| |
    @Override
    public Tuple getNext() throws IOException {
        // No readers means we are being used as a plain (non-merge-join)
        // loader; delegate straight to PigStorage.
        if (this.readers == null) {
            return super.getNext();
        }

        // Readers before currentReaderIndexStart are exhausted; the array is
        // kept sorted by each reader's current index key.
        while (currentReaderIndexStart < this.readers.length) {
            IndexedStorageRecordReader r = this.readers[currentReaderIndexStart];

            // Point the PigStorage machinery at this reader and pull a tuple.
            this.prepareToRead(r, null);
            Tuple tuple = super.getNext();
            if (tuple == null) {
                currentReaderIndexStart++;
                r.close();
                continue; //next Reader
            }

            //if we haven't yet initialized the indexManager (by reading the first index key)
            if (r.indexManager.lastIndexKeyTuple == null) {

                //initialize the indexManager
                if (r.indexManager.ReadIndex() == null) {
                    //There should never be a case where there is a non-null record - but no corresponding index.
                    throw new IOException("Missing Index for Tuple: " + tuple);
                }
            }

            // One tuple consumed against the current index entry's count.
            r.indexManager.numberOfTuples--;

            if (r.indexManager.numberOfTuples == 0) {
                // Advance to the next index entry; if none remain this reader
                // is exhausted, otherwise re-sort it into position.
                if (r.indexManager.ReadIndex() == null) {
                    r.close();
                    currentReaderIndexStart++;
                } else {
                    //Since the index of the current reader was increased, we may need to push the
                    //current reader back in the sorted list of readers.
                    sortReader(currentReaderIndexStart);
                }
            }
            return tuple;
        }

        return null;
    }
| |
| /** |
| * IndexableLoadFunc interface implementation |
| */ |
| @Override |
| public void initialize(Configuration conf) throws IOException { |
| try { |
| InputFormat inputFormat = this.getInputFormat(); |
| TaskAttemptID id = HadoopShims.getNewTaskAttemptID(); |
| |
| if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) { |
| conf.set(MRConfiguration.JOB_CREDENTIALS_BINARY, System.getenv("HADOOP_TOKEN_FILE_LOCATION")); |
| } |
| List<FileSplit> fileSplits = inputFormat.getSplits(HadoopShims.createJobContext(conf, null)); |
| this.readers = new IndexedStorageRecordReader[fileSplits.size()]; |
| |
| int idx = 0; |
| Iterator<FileSplit> it = fileSplits.iterator(); |
| while (it.hasNext()) { |
| FileSplit fileSplit = it.next(); |
| TaskAttemptContext context = HadoopShims.createTaskAttemptContext(conf, id); |
| IndexedStorageRecordReader r = (IndexedStorageRecordReader) inputFormat.createRecordReader(fileSplit, context); |
| r.initialize(fileSplit, context); |
| this.readers[idx] = r; |
| idx++; |
| } |
| |
| Arrays.sort(this.readers, this.readerComparator); |
| } catch (InterruptedException e) { |
| throw new IOException(e); |
| } |
| } |
| |
    @Override
    /* The list of readers is always sorted before and after this call. */
    public void seekNear(Tuple keys) throws IOException {

        /* Keeps track of the last (if any) reader where seekNear was called */
        int lastIndexModified = -1;

        int idx = currentReaderIndexStart;
        while (idx < this.readers.length) {
            IndexedStorageRecordReader r = this.readers[idx];

            /* The key falls within the range of the reader index */
            if (keys.compareTo(r.indexManager.maxIndexKeyTuple) <= 0 && keys.compareTo(r.indexManager.minIndexKeyTuple) >= 0) {
                r.seekNear(keys);
                lastIndexModified = idx;

            /* The key is greater than the current range of the reader index */
            } else if (keys.compareTo(r.indexManager.maxIndexKeyTuple) > 0) {
                /* This reader can never match this or any later (larger) key:
                 * permanently retire it from the scan window. */
                currentReaderIndexStart++;
            /* DO NOTHING - The key is less than the current range of the reader index */
            } else {
                /* Readers are sorted, so no later reader can contain the key
                 * either: stop scanning. */
                break;
            }
            idx++;
        }

        /*
         * There is something to sort.
         * We can rely on the following invariants that make the following check accurate:
         * - currentReaderIndexStart is always >= 0.
         * - lastIndexModified is only positive if seekNear was called.
         * - lastIndexModified >= currentReaderIndexStart if lastIndexModifed >= 0. This is true because the list
         *   is already sorted.
         */
        if (lastIndexModified - currentReaderIndexStart >= 0) {

            /*
             * The following logic is optimized for the (common) case where there are a tiny number of readers that
             * need to be repositioned relative to the other readers in the much larger sorted list.
             */

            /* First, just sort the readers that were updated relative to one another. */
            Arrays.sort(this.readers, currentReaderIndexStart, lastIndexModified+1, this.readerComparator);

            /* In descending order, push the updated readers back in the the sorted list. */
            for (idx = lastIndexModified; idx >= currentReaderIndexStart; idx--) {
                sortReader(idx);
            }
        }
    }
| |
| @Override |
| public void close() throws IOException { |
| for (IndexedStorageRecordReader reader : this.readers) { |
| reader.close(); |
| } |
| } |
| |
| /** |
| * <code>IndexManager</code> manages the index file (both writing and reading) |
| * It keeps track of the last index read during reading. |
| */ |
| public static class IndexManager { |
| |
        /**
         * Constructor (called during reading)
         * @param ifile index file to read
         */
        public IndexManager(FileStatus ifile) {
            this.indexFile = ifile;
            this.offsetToFooter = -1;  // footer offset discovered lazily on first seek
        }

        /**
         * Constructor (called during writing)
         * @param offsetsToIndexKeys field positions within each tuple that form the index key
         */
        public IndexManager(int[] offsetsToIndexKeys) {
            this.offsetsToIndexKeys = offsetsToIndexKeys;
            this.offsetToFooter = -1;  // footer offset recorded when the footer is written
        }
| |
| /** |
| * Construct index file path for a given a data file |
| * @param file - Data file |
| * @return - Index file path for given data file |
| */ |
| private static Path getIndexFileName(Path file) { |
| return new Path(file.getParent(), "." + file.getName() + ".index"); |
| } |
| |
        /**
         * Open the index file for writing for given data file
         * (fails if an index file already exists, since overwrite is false).
         * @param fs filesystem to create the index on
         * @param file the data file the index belongs to
         * @throws IOException if the index file cannot be created
         */
        public void createIndexFile(FileSystem fs, Path file) throws IOException {
            this.indexOut = fs.create(IndexManager.getIndexFileName(file), false);
        }

        /**
         * Opens the index file (supplied to the reading constructor) for reading.
         * @throws IOException if the index file cannot be opened
         */
        public void openIndexFile(FileSystem fs) throws IOException {
            this.indexIn = fs.open(this.indexFile.getPath());
        }
| |
| /** |
| * Close the index file |
| * @throws IOException |
| */ |
| public void Close() throws IOException { |
| this.indexOut.close(); |
| } |
| |
        /**
         * Folds one written record into the index.  Consecutive records that
         * share the same key are collapsed into a single index entry carrying
         * a count, so an entry is only flushed when the key changes (or when
         * the footer is written).
         *
         * @param t the record just written to the data file
         * @param offset byte offset of the record in the data file
         * @throws IOException on write failure or invalid key offsets
         */
        private void BuildIndex(Tuple t, long offset) throws IOException {
            /* Build index key tuple */
            Tuple indexKeyTuple = tupleFactory.newTuple(this.offsetsToIndexKeys.length);
            for (int i = 0; i < this.offsetsToIndexKeys.length; ++i) {
                indexKeyTuple.set(i, t.get(this.offsetsToIndexKeys[i]));
            }

            /* Check if we have already seen Tuple(s) with same index keys */
            if (indexKeyTuple.compareTo(this.lastIndexKeyTuple) == 0) {
                /* We have seen Tuple(s) with given index keys, update the tuple count */
                this.numberOfTuples += 1;
            }
            else {
                /* Key changed: flush the previous pending entry (if any) first. */
                if (this.lastIndexKeyTuple != null)
                    this.WriteIndex();

                this.lastIndexKeyTuple = indexKeyTuple;
                /* Track the global min/max keys for the footer. */
                this.minIndexKeyTuple = ((this.minIndexKeyTuple == null) || (indexKeyTuple.compareTo(this.minIndexKeyTuple) < 0)) ? indexKeyTuple : this.minIndexKeyTuple;
                this.maxIndexKeyTuple = ((this.maxIndexKeyTuple == null) || (indexKeyTuple.compareTo(this.maxIndexKeyTuple) > 0)) ? indexKeyTuple : this.maxIndexKeyTuple;

                /* New index tuple for newly seen index key */
                this.indexTuple = tupleFactory.newTuple(3);

                /* Add index keys to index Tuple */
                this.indexTuple.set(0, indexKeyTuple);

                /* Reset Tuple count for index key */
                this.numberOfTuples = 1;

                /* Remember offset to Tuple with new index keys */
                this.indexTuple.set(2, offset);
            }
        }
| |
| /** |
| * Write index header |
| * @param indexOut - Stream to write to |
| * @param ih - Index header to write |
| * @throws IOException |
| */ |
| public void WriteIndexHeader() throws IOException { |
| /* Number of index keys */ |
| indexOut.writeInt(this.offsetsToIndexKeys.length); |
| |
| /* Offset to index keys */ |
| for (int i = 0; i < this.offsetsToIndexKeys.length; ++i) { |
| indexOut.writeInt(this.offsetsToIndexKeys[i]); |
| } |
| } |
| |
| /** |
| * Read index header |
| * @param indexIn - Stream to read from |
| * @return Index header |
| * @throws IOException |
| */ |
| public void ReadIndexHeader() throws IOException { |
| /* Number of index keys */ |
| int nkeys = this.indexIn.readInt(); |
| |
| /* Offset to index keys */ |
| this.offsetsToIndexKeys = new int[nkeys]; |
| for (int i = 0; i < nkeys; ++i) { |
| offsetsToIndexKeys[i] = this.indexIn.readInt(); |
| } |
| } |
| |
        /**
         * Writes the index footer: the min and max key tuples followed by a
         * trailing long holding the byte offset of the footer itself.
         * (The "Writer..." method name is a historical typo kept for
         * compatibility with existing callers.)
         * @throws IOException on write failure
         */
        public void WriterIndexFooter() throws IOException {
            /* Flush indexes for remaining records */
            this.WriteIndex();

            /* record the offset to footer */
            this.offsetToFooter = this.indexOut.getPos();

            /* Write index footer */
            DataReaderWriter.writeDatum(indexOut, this.minIndexKeyTuple);
            DataReaderWriter.writeDatum(indexOut, this.maxIndexKeyTuple);

            /* Offset to footer */
            indexOut.writeLong(this.offsetToFooter);
        }
| |
| /** |
| * Reads the index footer |
| */ |
| public void ReadIndexFooter() throws IOException { |
| long currentOffset = this.indexIn.getPos(); |
| |
| this.SeekToIndexFooter(); |
| this.minIndexKeyTuple = (Tuple)DataReaderWriter.readDatum(this.indexIn); |
| this.maxIndexKeyTuple = (Tuple)DataReaderWriter.readDatum(this.indexIn); |
| |
| this.indexIn.seek(currentOffset); |
| } |
| |
        /**
         * Seeks to the index footer, lazily discovering its offset from the
         * trailing long of the index file on first call.
         */
        public void SeekToIndexFooter() throws IOException {
            if (this.offsetToFooter < 0) {
                /* offset to footer is at last long (8 bytes) in the file */
                this.indexIn.seek(this.indexFile.getLen()-8);
                this.offsetToFooter = this.indexIn.readLong();
            }
            this.indexIn.seek(this.offsetToFooter);
        }

        /**
         * Writes the current (pending) index entry, stamping in the count of
         * tuples accumulated for its key.
         */
        public void WriteIndex() throws IOException {
            this.indexTuple.set(1, this.numberOfTuples);
            DataReaderWriter.writeDatum(this.indexOut, this.indexTuple);
        }
| |
| /** |
| * Extracts the index key from the index tuple |
| */ |
| public Tuple getIndexKeyTuple(Tuple indexTuple) throws IOException { |
| if (indexTuple.size() == 3) |
| return (Tuple)indexTuple.get(0); |
| else |
| throw new IOException("Invalid index record with size " + indexTuple.size()); |
| } |
| |
| /** |
| * Extracts the number of records that share the current key from the index tuple. |
| */ |
| public long getIndexKeyTupleCount(Tuple indexTuple) throws IOException { |
| if (indexTuple.size() == 3) |
| return (Long)indexTuple.get(1); |
| else |
| throw new IOException("Invalid index record with size " + indexTuple.size()); |
| } |
| |
| /** |
| * Extracts the offset into the data file from the index tuple. |
| */ |
| public long getOffset(Tuple indexTuple) throws IOException { |
| if (indexTuple.size() == 3) |
| return (Long)indexTuple.get(2); |
| else |
| throw new IOException("Invalid index record with size " + indexTuple.size()); |
| } |
| |
        /**
         * Reads the next index entry from the index file, or null once the
         * index body is exhausted (i.e. the footer has been reached), and
         * caches the entry's key and tuple count.
         */
        public Tuple ReadIndex() throws IOException {
            if (this.indexIn.getPos() < this.offsetToFooter) {
                indexTuple = (Tuple)DataReaderWriter.readDatum(this.indexIn);
                if (indexTuple != null) {
                    this.lastIndexKeyTuple = this.getIndexKeyTuple(indexTuple);
                    this.numberOfTuples = this.getIndexKeyTupleCount(indexTuple);
                }
                return indexTuple;
            }
            return null;
        }

        /**
         * Scans the index looking for a given key.  The index is locally
         * sorted, so the scan only ever moves forward from the current
         * position.
         * @return the matching index tuple OR the first index tuple with a
         * key greater than the requested key if no exact match is found
         * (null at end of index).
         */
        public Tuple ScanIndex(Tuple keys) throws IOException {
            /* Already positioned at or past the requested key: nothing to do. */
            if (lastIndexKeyTuple != null && keys.compareTo(this.lastIndexKeyTuple) <= 0) {
                return indexTuple;
            }

            /* Scan the index looking for given key */
            while ((indexTuple = this.ReadIndex()) != null) {
                if (keys.compareTo(this.lastIndexKeyTuple) > 0)
                    continue;
                else
                    break;
            }

            return indexTuple;
        }
| |
        /**
         * Stores the list of record indices (field positions) that form the index key.
         */
        private int[] offsetsToIndexKeys = null;

        /**
         * Offset in bytes to the start of the footer of the index (-1 until known).
         */
        private long offsetToFooter = -1;

        /**
         * Output stream when writing the index (write path only; null otherwise).
         */
        FSDataOutputStream indexOut;

        /**
         * Input stream when reading the index (read path only; null otherwise).
         */
        FSDataInputStream indexIn;

        /**
         * Tuple factory to create index tuples
         */
        private TupleFactory tupleFactory = TupleFactory.getInstance();

        /**
         * Current index entry, of the form
         * ((Tuple of index keys), count of tuples with index keys, offset to first tuple with index keys)
         */
        private Tuple indexTuple = tupleFactory.newTuple(3);

        /**
         * "Smallest" index key tuple seen
         */
        private Tuple minIndexKeyTuple = null;

        /**
         * "Biggest" index key tuple seen
         */
        private Tuple maxIndexKeyTuple = null;

        /**
         * Last seen index key tuple (writer: last key written; reader: key of
         * the current entry)
         */
        private Tuple lastIndexKeyTuple = null;

        /**
         * Number of tuples recorded for the current index key (counts down
         * while reading)
         */
        private long numberOfTuples = 0;

        /**
         * The index file (read path only).
         */
        private FileStatus indexFile;
| } |
| |
| /** |
| * Internal InputFormat class |
| */ |
| public static class IndexedStorageInputFormat extends PigTextInputFormat { |
| |
| @Override |
| public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) { |
| IndexManager im = null; |
| try { |
| FileSystem fs = FileSystem.get(context.getConfiguration()); |
| Path indexFile = IndexManager.getIndexFileName(((FileSplit)split).getPath()); |
| im = new IndexManager(fs.getFileStatus(indexFile)); |
| im.openIndexFile(fs); |
| im.ReadIndexHeader(); |
| im.ReadIndexFooter(); |
| } catch (IOException e) { |
| // TODO Auto-generated catch block |
| e.printStackTrace(); |
| } |
| |
| return new IndexedStorageRecordReader(im); |
| } |
| |
        @Override
        public boolean isSplitable(JobContext context, Path filename) {
            /* Never split: index offsets are absolute positions within a whole
             * data file, so each file must be read by a single reader. */
            return false;
        }
| |
| /** |
| * Internal RecordReader class |
| */ |
| public static class IndexedStorageRecordReader extends RecordReader<LongWritable, Text> { |
            private long start;             // first byte of the split in the data file
            private long pos;               // current read position (byte offset)
            private long end;               // one past the last byte of the split
            private IndexedStorageLineReader in;  // buffered, seekable line reader over the data
            private int maxLineLength;      // longest line accepted before truncation
            private LongWritable key = null;      // byte offset of the current record
            private Text value = null;            // text of the current record
            /** Index for this reader's data file (opened by the input format). */
            private IndexManager indexManager = null;
| |
| @Override |
| public String toString() { |
| return indexManager.minIndexKeyTuple + "|" + indexManager.lastIndexKeyTuple + "|" + indexManager.maxIndexKeyTuple; |
| } |
| |
            /** Creates a reader bound to an already-opened index manager. */
            public IndexedStorageRecordReader(IndexManager im) {
                this.indexManager = im;
            }
| |
| /** |
| * Class to compare record readers using underlying indexes |
| * |
| */ |
| public static class IndexedStorageRecordReaderComparator implements Comparator<IndexedStorageRecordReader> { |
| @Override |
| public int compare(IndexedStorageRecordReader o1, IndexedStorageRecordReader o2) { |
| Tuple t1 = (o1.indexManager.lastIndexKeyTuple == null) ? o1.indexManager.minIndexKeyTuple : o1.indexManager.lastIndexKeyTuple; |
| Tuple t2 = (o2.indexManager.lastIndexKeyTuple == null) ? o2.indexManager.minIndexKeyTuple : o2.indexManager.lastIndexKeyTuple; |
| return t1.compareTo(t2); |
| } |
| } |
| |
| public static class IndexedStorageLineReader { |
                private static final int DEFAULT_BUFFER_SIZE = 64 * 1024;
                private int bufferSize = DEFAULT_BUFFER_SIZE;
                /** Underlying stream; guaranteed Seekable + PositionedReadable by the constructor. */
                private InputStream in;
                private byte[] buffer;
                // the number of bytes of real data in the buffer
                private int bufferLength = 0;
                // the current position in the buffer
                private int bufferPosn = 0;
                // file offset corresponding to buffer[0]; lets seek() stay in-buffer
                private long bufferOffset = 0;

                private static final byte CR = '\r';
                private static final byte LF = '\n';
| |
                /**
                 * Create a line reader that reads from the given stream using the
                 * default buffer-size (64k).
                 * @param in The input stream (must be Seekable and PositionedReadable)
                 * @throws IllegalArgumentException via the delegated constructor if
                 * the stream is not seekable
                 */
                public IndexedStorageLineReader(InputStream in) {
                    this(in, DEFAULT_BUFFER_SIZE);
                }
| |
| /** |
| * Create a line reader that reads from the given stream using the |
| * given buffer-size. |
| * @param in The input stream |
| * @param bufferSize Size of the read buffer |
| * @throws IOException |
| */ |
| public IndexedStorageLineReader(InputStream in, int bufferSize) { |
| if( !(in instanceof Seekable) || !(in instanceof PositionedReadable) ) { |
| throw new IllegalArgumentException( |
| "In is not an instance of Seekable or PositionedReadable"); |
| } |
| |
| this.in = in; |
| this.bufferSize = bufferSize; |
| this.buffer = new byte[this.bufferSize]; |
| } |
| |
                /**
                 * Create a line reader that reads from the given stream using the
                 * <code>io.file.buffer.size</code> specified in the given
                 * <code>Configuration</code>.
                 * @param in input stream
                 * @param conf configuration
                 * @throws IOException declared for API compatibility
                 */
                public IndexedStorageLineReader(InputStream in, Configuration conf) throws IOException {
                    this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE));
                }
| |
                /**
                 * Close the underlying stream.
                 * @throws IOException if the underlying close fails
                 */
                public void close() throws IOException {
                    in.close();
                }
| |
                /**
                 * Read one line from the InputStream into the given Text. A line
                 * can be terminated by one of the following: '\n' (LF) , '\r' (CR),
                 * or '\r\n' (CR+LF). EOF also terminates an otherwise unterminated
                 * line.
                 *
                 * @param str the object to store the given line (without newline)
                 * @param maxLineLength the maximum number of bytes to store into str;
                 *  the rest of the line is silently discarded.
                 * @param maxBytesToConsume the maximum number of bytes to consume
                 *  in this call. This is only a hint, because if the line cross
                 *  this threshold, we allow it to happen. It can overshoot
                 *  potentially by as much as one buffer length.
                 *
                 * @return the number of bytes read including the (longest) newline
                 * found.
                 *
                 * @throws IOException if the underlying stream throws
                 */
                public int readLine(Text str, int maxLineLength,
                                    int maxBytesToConsume) throws IOException {
                    /* We're reading data from in, but the head of the stream may be
                     * already buffered in buffer, so we have several cases:
                     * 1. No newline characters are in the buffer, so we need to copy
                     *    everything and read another buffer from the stream.
                     * 2. An unambiguously terminated line is in buffer, so we just
                     *    copy to str.
                     * 3. Ambiguously terminated line is in buffer, i.e. buffer ends
                     *    in CR. In this case we copy everything up to CR to str, but
                     *    we also need to see what follows CR: if it's LF, then we
                     *    need consume LF as well, so next call to readLine will read
                     *    from after that.
                     * We use a flag prevCharCR to signal if previous character was CR
                     * and, if it happens to be at the end of the buffer, delay
                     * consuming it until we have a chance to look at the char that
                     * follows.
                     */
                    str.clear();
                    int txtLength = 0; //tracks str.getLength(), as an optimization
                    int newlineLength = 0; //length of terminating newline
                    boolean prevCharCR = false; //true of prev char was CR
                    long bytesConsumed = 0;
                    do {
                        int startPosn = bufferPosn; //starting from where we left off the last time
                        if (bufferPosn >= bufferLength) {
                            startPosn = bufferPosn = 0;
                            if (prevCharCR)
                                ++bytesConsumed; //account for CR from previous read

                            // Record the file offset of the refilled buffer so that
                            // seek() can later hit the buffer instead of the stream.
                            bufferOffset = ((Seekable)in).getPos();
                            bufferLength = in.read(buffer);

                            if (bufferLength <= 0)
                                break; // EOF
                        }
                        for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
                            if (buffer[bufferPosn] == LF) {
                                newlineLength = (prevCharCR) ? 2 : 1;
                                ++bufferPosn; // at next invocation proceed from following byte
                                break;
                            }
                            if (prevCharCR) { //CR + notLF, we are at notLF
                                newlineLength = 1;
                                break;
                            }
                            prevCharCR = (buffer[bufferPosn] == CR);
                        }
                        int readLength = bufferPosn - startPosn;
                        if (prevCharCR && newlineLength == 0)
                            --readLength; //CR at the end of the buffer
                        bytesConsumed += readLength;
                        int appendLength = readLength - newlineLength;
                        if (appendLength > maxLineLength - txtLength) {
                            appendLength = maxLineLength - txtLength;
                        }
                        if (appendLength > 0) {
                            str.append(buffer, startPosn, appendLength);
                            txtLength += appendLength;
                        }
                    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

                    if (bytesConsumed > (long)Integer.MAX_VALUE)
                        throw new IOException("Too many bytes before newline: " + bytesConsumed);
                    return (int)bytesConsumed;
                }
| |
                /**
                 * Read from the InputStream into the given Text.
                 * @param str the object to store the given line
                 * @param maxLineLength the maximum number of bytes to store into str.
                 * @return the number of bytes read including the newline
                 * @throws IOException if the underlying stream throws
                 */
                public int readLine(Text str, int maxLineLength) throws IOException {
                    return readLine(str, maxLineLength, Integer.MAX_VALUE);
                }

                /**
                 * Read from the InputStream into the given Text.
                 * @param str the object to store the given line
                 * @return the number of bytes read including the newline
                 * @throws IOException if the underlying stream throws
                 */
                public int readLine(Text str) throws IOException {
                    return readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE);
                }

                /**
                 * If given offset is within the buffer, adjust the buffer position to read from
                 * otherwise seek to the given offset from start of the file.
                 * @param offset absolute byte offset in the underlying file
                 * @throws IOException if the underlying seek fails
                 */
                public void seek(long offset) throws IOException {
                    if ((offset >= bufferOffset) && (offset < (bufferOffset + bufferLength)))
                        // Target byte is already buffered: just move the cursor.
                        bufferPosn = (int) (offset - bufferOffset);
                    else {
                        // Invalidate the buffer and reposition the raw stream.
                        bufferPosn = bufferLength;
                        ((Seekable)in).seek(offset);
                    }
                }
| } |
| |
            /**
             * Standard line-record-reader initialization: opens the data file
             * and positions the stream at the start of the split.  Since
             * isSplitable() returns false, each split is a whole file, so the
             * skip-first-line path below is effectively dormant.
             */
            @Override
            public void initialize(InputSplit genericSplit, TaskAttemptContext context)
                    throws IOException, InterruptedException {

                FileSplit split = (FileSplit) genericSplit;
                Configuration job = context.getConfiguration();
                this.maxLineLength = job.getInt(MRConfiguration.LINERECORDREADER_MAXLENGTH, Integer.MAX_VALUE);
                start = split.getStart();
                end = start + split.getLength();
                final Path file = split.getPath();

                FileSystem fs = file.getFileSystem(job);
                FSDataInputStream fileIn = fs.open(split.getPath());
                boolean skipFirstLine = false;
                if (start != 0) {
                    // Mid-file split: back up one byte and discard the partial
                    // first line (it belongs to the previous split).
                    skipFirstLine = true;
                    --start;
                    fileIn.seek(start);
                }
                in = new IndexedStorageLineReader(fileIn, job);
                if (skipFirstLine) {
                    start += in.readLine(new Text(), 0, (int)Math.min((long)Integer.MAX_VALUE, end - start));
                }
                this.pos = start;
            }
| |
            /** Seeks the underlying line reader and updates the reported position. */
            public void seek(long offset) throws IOException {
                in.seek(offset);
                pos = offset;
            }

            /**
             * Scan the index for given key and seek to appropriate offset in the data
             * @param keys to look for
             * @return true if the given key was found, false otherwise (the
             * stream is then positioned at the first key greater than the
             * requested one, if any)
             * @throws IOException
             */
            public boolean seekNear(Tuple keys) throws IOException {
                boolean ret = false;
                Tuple indexTuple = this.indexManager.ScanIndex(keys);
                if (indexTuple != null) {
                    long offset = this.indexManager.getOffset(indexTuple);
                    // NOTE(review): unlike seek(), 'pos' is not updated here, so
                    // offsets reported via getCurrentKey may lag — confirm intended.
                    in.seek(offset);

                    if (keys.compareTo(this.indexManager.getIndexKeyTuple(indexTuple)) == 0) {
                        ret = true;
                    }
                }

                return ret;
            }
| |
            /**
             * Reads the next line into (key = byte offset, value = line text).
             * @return false at end of split / end of file
             */
            @Override
            public boolean nextKeyValue() throws IOException,
                    InterruptedException {
                if (key == null) {
                    key = new LongWritable();
                }
                key.set(pos);
                if (value == null) {
                    value = new Text();
                }
                int newSize = 0;
                while (pos < end) {
                    newSize = in.readLine(value, maxLineLength,
                                          Math.max((int)Math.min(Integer.MAX_VALUE, end-pos),
                                                   maxLineLength));
                    if (newSize == 0) {
                        break;
                    }
                    pos += newSize;
                    if (newSize < maxLineLength) {
                        break;
                    }
                    // Line may exceed maxLineLength: keep looping to consume the
                    // remainder before returning.
                }
                if (newSize == 0) {
                    key = null;
                    value = null;
                    return false;
                } else {
                    return true;
                }
            }
| |
            /** @return the byte offset of the current record (null after EOF). */
            @Override
            public LongWritable getCurrentKey() throws IOException,
                    InterruptedException {
                return key;
            }

            /** @return the text of the current record (null after EOF). */
            @Override
            public Text getCurrentValue() throws IOException,
                    InterruptedException {
                return value;
            }
| |
| @Override |
| public float getProgress() throws IOException, InterruptedException { |
| if (start == end) { |
| return 0.0f; |
| } else { |
| return Math.min(1.0f, (pos - start) / (float)(end - start)); |
| } |
| } |
| |
| @Override |
| public void close() throws IOException { |
| if (in != null) { |
| in.close(); |
| } |
| } |
| } |
| } |
| |
    /**
     * List of record readers (one per underlying file), kept sorted by each
     * reader's current index key; null when used as a plain loader.
     */
    protected IndexedStorageRecordReader[] readers = null;

    /**
     * Index into the list of readers to the current reader.
     * Readers before this index have been fully scanned for keys.
     */
    protected int currentReaderIndexStart = 0;

    /**
     * Delimiter to use between fields
     */
    protected byte fieldDelimiter = '\t';

    /**
     * Offsets to index keys in tuple
     */
    final protected int[] offsetsToIndexKeys;

    /**
     * Comparator used to order readers by their current index key tuples.
     */
    protected Comparator<IndexedStorageRecordReader> readerComparator = new IndexedStorageInputFormat.IndexedStorageRecordReader.IndexedStorageRecordReaderComparator();
| } |