/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.update;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.util.DataInputInputStream;
import org.apache.solr.common.util.FastInputStream;
import org.apache.solr.common.util.FastOutputStream;
import org.apache.solr.common.util.JavaBinCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Log Format: List{Operation, Version, ...}
* ADD, VERSION, DOC
* DELETE, VERSION, ID_BYTES
* DELETE_BY_QUERY, VERSION, String
*
* TODO: keep two files, one for [operation, version, id] and the other for the actual
* document data. That way we could throw away document log files more readily
* while retaining the smaller operation log files longer (and we could retrieve
* the stored fields of the latest documents from the index).
*
* This would require keeping all source fields stored, of course.
*
* This would also let us skip logging document data for requests with commit=true
* in them (since we know that if the request succeeds, all docs will be committed).
*
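* A rough usage sketch for reading the log back (hypothetical caller code: it assumes
* an existing HdfsTransactionLog instance named tlog, and omits reference counting
* and IOException handling):
* <pre>
*   TransactionLog.ReverseReader rr = tlog.getReverseReader();
*   try {
*     Object record;
*     while ((record = rr.next()) != null) {
*       List entry = (List) record;               // [operation | flags, version, payload...]
*       int opAndFlags = (Integer) entry.get(0);  // low bits = operation, high bits = flags
*       long version = (Long) entry.get(1);
*       // ... dispatch on the operation ...
*     }
*   } finally {
*     rr.close();
*   }
* </pre>
* Each record is followed by its length as a 4-byte int, which is what the reverse
* reader walks to move backwards through the file.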
*/
public class HdfsTransactionLog extends TransactionLog {
public static Logger log = LoggerFactory.getLogger(HdfsTransactionLog.class);
Path tlogFile;
private FSDataOutputStream tlogOutStream;
private FileSystem fs;
HdfsTransactionLog(FileSystem fs, Path tlogFile, Collection<String> globalStrings) {
this(fs, tlogFile, globalStrings, false);
}
HdfsTransactionLog(FileSystem fs, Path tlogFile, Collection<String> globalStrings, boolean openExisting) {
super();
boolean success = false;
this.fs = fs;
try {
if (debug) {
//log.debug("New TransactionLog file=" + tlogFile + ", exists=" + tlogFile.exists() + ", size=" + tlogFile.length() + ", openExisting=" + openExisting);
}
this.tlogFile = tlogFile;
// TODO: look into forcefully taking over any lease
if (fs.exists(tlogFile) && openExisting) {
tlogOutStream = fs.append(tlogFile);
} else {
fs.delete(tlogFile, false);
tlogOutStream = fs.create(tlogFile, (short)1);
tlogOutStream.hsync();
}
fos = new FastOutputStream(tlogOutStream, new byte[65536], 0);
long start = tlogOutStream.getPos();
if (openExisting) {
if (start > 0) {
readHeader(null);
// we should already be at the end
// raf.seek(start);
// assert channel.position() == start;
fos.setWritten(start); // reflect that we aren't starting at the beginning
//assert fos.size() == channel.size();
} else {
addGlobalStrings(globalStrings);
}
} else {
if (start > 0) {
log.error("New transaction log already exists:" + tlogFile + " size=" + tlogOutStream.size());
}
addGlobalStrings(globalStrings);
}
success = true;
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
} finally {
if (!success && tlogOutStream != null) {
try {
tlogOutStream.close();
} catch (Exception e) {
log.error("Error closing tlog file (after error opening)", e);
}
}
}
}
@Override
public boolean endsWithCommit() throws IOException {
long size;
synchronized (this) {
fos.flush();
tlogOutStream.hflush();
size = fos.size();
}
// the end of the file should have the end message (added during a commit) plus a 4 byte size
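// i.e. the tail of a committed log is: ...<commit record whose last bytes are END_MESSAGE><4-byte record length>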
byte[] buf = new byte[ END_MESSAGE.length() ];
long pos = size - END_MESSAGE.length() - 4;
if (pos < 0) return false;
FSDataFastInputStream dis = new FSDataFastInputStream(fs.open(tlogFile), pos);
try {
//ChannelFastInputStream is = new ChannelFastInputStream(channel, pos);
dis.readFully(buf); // read exactly END_MESSAGE.length() bytes (a plain read could return short)
for (int i=0; i<buf.length; i++) {
if (buf[i] != END_MESSAGE.charAt(i)) return false;
}
} finally {
dis.close();
}
return true;
}
// This could mess with any readers or reverse readers that are open, or anything that might try to do a log lookup.
// This should only be used to roll back buffered updates, not actually applied updates.
@Override
public void rollback(long pos) throws IOException {
synchronized (this) {
assert snapshot_size == pos;
fos.flush();
tlogOutStream.hflush();
// TODO: how do we rollback with hdfs?? We need HDFS-3107
//raf.setLength(pos);
fos.setWritten(pos);
assert fos.size() == pos;
numRecords = snapshot_numRecords;
}
}
private void readHeader(FastInputStream fis) throws IOException {
// read existing header
boolean closeFis = false;
if (fis == null) closeFis = true;
fis = fis != null ? fis : new FSDataFastInputStream(fs.open(tlogFile), 0);
Map header = null;
try {
LogCodec codec = new LogCodec(resolver);
header = (Map) codec.unmarshal(fis);
fis.readInt(); // skip size
} finally {
if (fis != null && closeFis) {
fis.close();
}
}
// needed to read other records
synchronized (this) {
globalStringList = (List<String>)header.get("strings");
globalStringMap = new HashMap<String, Integer>(globalStringList.size());
for (int i=0; i<globalStringList.size(); i++) {
globalStringMap.put( globalStringList.get(i), i+1);
}
}
}
@Override
public long writeCommit(CommitUpdateCommand cmd, int flags) {
LogCodec codec = new LogCodec(resolver);
synchronized (this) {
try {
long pos = fos.size(); // if we had flushed, this should be equal to channel.position()
if (pos == 0) {
writeLogHeader(codec);
pos = fos.size();
}
codec.init(fos);
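// a commit record is a three-element JavaBin array: [COMMIT | flags, version, END_MESSAGE]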
codec.writeTag(JavaBinCodec.ARR, 3);
codec.writeInt(UpdateLog.COMMIT | flags); // should just take one byte
codec.writeLong(cmd.getVersion());
codec.writeStr(END_MESSAGE); // ensure these bytes are (almost) last in the file
endRecord(pos);
fos.flush(); // flush since this will be the last record in a log file
tlogOutStream.hflush();
//assert fos.size() == channel.size();
return pos;
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
}
}
}
/* This method is thread safe */
@Override
public Object lookup(long pos) {
// A negative position can result from a log replay (which does not re-log, but does
// update the version map). This is OK since the node won't be ACTIVE when this happens.
if (pos < 0) return null;
try {
// make sure any unflushed buffer has been flushed
synchronized (this) {
// TODO: optimize this by keeping track of what we have flushed up to
fos.flushBuffer();
// flush to hdfs
tlogOutStream.hflush();
/***
System.out.println("###flushBuffer to " + fos.size() + " raf.length()=" + raf.length() + " pos="+pos);
if (fos.size() != raf.length() || pos >= fos.size() ) {
throw new RuntimeException("ERROR" + "###flushBuffer to " + fos.size() + " raf.length()=" + raf.length() + " pos="+pos);
}
***/
}
FSDataFastInputStream dis = new FSDataFastInputStream(fs.open(tlogFile),
pos);
try {
dis.seek(pos);
LogCodec codec = new LogCodec(resolver);
return codec.readVal(new FastInputStream(dis));
} finally {
dis.close();
}
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "pos=" + pos, e);
}
}
@Override
public void finish(UpdateLog.SyncLevel syncLevel) {
if (syncLevel == UpdateLog.SyncLevel.NONE) return;
try {
synchronized (this) {
fos.flushBuffer();
// we must flush to hdfs
// TODO: we probably don't need to
// hsync below if we do this - I
// think they are equivalent.
tlogOutStream.hflush();
}
if (syncLevel == UpdateLog.SyncLevel.FSYNC) {
// Since fsync is outside of synchronized block, we can end up with a partial
// last record on power failure (which is OK, and does not represent an error...
// we just need to be aware of it when reading).
//raf.getFD().sync();
tlogOutStream.hsync();
}
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
}
}
@Override
protected void close() {
try {
if (debug) {
log.debug("Closing tlog" + this);
}
synchronized (this) {
fos.flush();
tlogOutStream.hflush();
fos.close();
tlogOutStream.close();
}
if (deleteOnClose) {
fs.delete(tlogFile, true);
}
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
}
}
@Override
public String toString() {
return "hdfs tlog{file=" + tlogFile.toString() + " refcount=" + refcount.get() + "}";
}
/** Returns a reader that can be used while a log is still in use.
* Currently only *one* LogReader may be outstanding, and that reader may only
* be used from a single thread. */
@Override
public LogReader getReader(long startingPos) {
return new HDFSLogReader(startingPos);
}
/** Returns a single threaded reverse reader */
@Override
public ReverseReader getReverseReader() throws IOException {
return new HDFSReverseReader();
}
public class HDFSLogReader extends LogReader{
FSDataFastInputStream fis;
private LogCodec codec = new LogCodec(resolver);
public HDFSLogReader(long startingPos) {
super();
incref();
try {
FSDataInputStream fdis = fs.open(tlogFile);
fis = new FSDataFastInputStream(fdis, startingPos);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/** Returns the next object from the log, or null if none available.
*
* @return The log record, or null if EOF
* @throws IOException If there is a low-level I/O error.
*/
public Object next() throws IOException, InterruptedException {
long pos = fis.position();
synchronized (HdfsTransactionLog.this) {
if (trace) {
log.trace("Reading log record. pos="+pos+" currentSize="+fos.size());
}
if (pos >= fos.size()) {
return null;
}
fos.flushBuffer();
tlogOutStream.hflush();
// we actually need a new reader
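// (a stream opened before the latest hflush is not guaranteed to see the newly
// flushed bytes, so reopen the file at the current position to pick them up)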
fis.close();
try {
FSDataInputStream fdis = fs.open(tlogFile);
fis = new FSDataFastInputStream(fdis, pos);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
if (pos == 0) {
readHeader(fis);
// shouldn't currently happen - header and first record are currently written at the same time
synchronized (HdfsTransactionLog.this) {
if (fis.position() >= fos.size()) {
return null;
}
pos = fis.position();
}
}
tlogOutStream.hflush();
Object o = codec.readVal(fis);
// skip over record size
int size = fis.readInt();
assert size == fis.position() - pos - 4;
return o;
}
public void close() {
try {
fis.close();
} catch (IOException e) {
throw new RuntimeException(e);
}
decref();
}
@Override
public String toString() {
synchronized (HdfsTransactionLog.this) {
return "LogReader{" + "file=" + tlogFile + ", position=" + fis.position() + ", end=" + fos.size() + "}";
}
}
}
public class HDFSReverseReader extends ReverseReader {
FSDataFastInputStream fis;
private LogCodec codec = new LogCodec(resolver) {
@Override
public SolrInputDocument readSolrInputDocument(DataInputInputStream dis) {
// Given that the SolrInputDocument is last in an add record, it's OK to just skip
// reading it completely.
return null;
}
};
int nextLength; // length of the next record (the next one closer to the start of the log file)
long prevPos; // where we started reading from last time (so prevPos - nextLength == start of next record)
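// Reverse reading walks the trailing lengths: the file is laid out as
// [record][4-byte len][record][4-byte len]..., so the int just before a record's
// start is the length of the record that precedes it.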
public HDFSReverseReader() throws IOException {
incref();
long sz;
synchronized (HdfsTransactionLog.this) {
fos.flushBuffer();
// this must be an hflush
tlogOutStream.hflush();
sz = fos.size();
//assert sz == channel.size();
}
fis = new FSDataFastInputStream(fs.open(tlogFile), 0);
if (sz >=4) {
// readHeader(fis); // should not be needed
prevPos = sz - 4;
fis.seek(prevPos);
nextLength = fis.readInt();
}
}
/** Returns the next object from the log, or null if none available.
*
* @return The log record, or null if EOF
* @throws IOException If there is a low-level I/O error.
*/
public Object next() throws IOException {
if (prevPos <= 0) return null;
long endOfThisRecord = prevPos;
int thisLength = nextLength;
long recordStart = prevPos - thisLength; // back up to the beginning of the next record
prevPos = recordStart - 4; // back up 4 more to read the length of the next record
if (prevPos <= 0) return null; // this record is the header
long bufferPos = fis.getBufferPos();
if (prevPos >= bufferPos) {
// nothing to do... we're within the current buffer
} else {
// Position buffer so that this record is at the end.
// For small records, this will cause subsequent calls to next() to be within the buffer.
long seekPos = endOfThisRecord - fis.getBufferSize();
seekPos = Math.min(seekPos, prevPos); // seek to the start of the record if it's larger than the buffer size.
seekPos = Math.max(seekPos, 0);
fis.seek(seekPos);
fis.peek(); // cause buffer to be filled
}
fis.seek(prevPos);
nextLength = fis.readInt(); // this is the length of the *next* record (i.e. closer to the beginning)
// TODO: optionally skip document data
Object o = codec.readVal(fis);
// assert fis.position() == prevPos + 4 + thisLength; // this is only true if we read all the data (and we currently skip reading the SolrInputDocument)
return o;
}
/* returns the position in the log file of the last record returned by next() */
public long position() {
return prevPos + 4; // skip the length
}
public void close() {
try {
fis.close();
} catch (IOException e) {
throw new RuntimeException(e);
}
decref();
}
@Override
public String toString() {
synchronized (HdfsTransactionLog.this) {
return "LogReader{" + "file=" + tlogFile + ", position=" + fis.position() + ", end=" + fos.size() + "}";
}
}
}
}
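/**
* A FastInputStream over an HDFS FSDataInputStream. The buffer is filled using
* positional reads at readFromStream, and seek() is satisfied from the current
* buffer when the target position falls within it.
*/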
class FSDataFastInputStream extends FastInputStream {
private FSDataInputStream fis;
public FSDataFastInputStream(FSDataInputStream fis, long chPosition) {
// super(null, new byte[10],0,0); // a small buffer size for testing purposes
super(null);
this.fis = fis;
super.readFromStream = chPosition;
}
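// Fill the buffer with a positional read at readFromStream (maintained by
// FastInputStream), so the wrapped stream's own file pointer is never moved.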
@Override
public int readWrappedStream(byte[] target, int offset, int len) throws IOException {
return fis.read(readFromStream, target, offset, len);
}
public void seek(long position) throws IOException {
if (position <= readFromStream && position >= getBufferPos()) {
// seek within buffer
pos = (int)(position - getBufferPos());
} else {
// long currSize = ch.size(); // not needed - underlying read should handle (unless read never done)
// if (position > currSize) throw new EOFException("Read past EOF: seeking to " + position + " on file of size " + currSize + " file=" + ch);
readFromStream = position;
end = pos = 0;
}
assert position() == position;
}
/** The start of the buffer as an absolute position in the whole file. */
public long getBufferPos() {
return readFromStream - end;
}
public int getBufferSize() {
return buf.length;
}
@Override
public void close() throws IOException {
fis.close();
}
@Override
public String toString() {
return "readFromStream="+readFromStream +" pos="+pos +" end="+end + " bufferPos="+getBufferPos() + " position="+position() ;
}
}