hbase-server/src/main/java/org/apache/hadoop/hbase/wal/WALSplitter.java - hbase - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.hadoop.hbase.wal;

 import static org.apache.hadoop.hbase.wal.BoundedRecoveredHFilesOutputSink.DEFAULT_WAL_SPLIT_TO_HFILE;
 import static org.apache.hadoop.hbase.wal.BoundedRecoveredHFilesOutputSink.WAL_SPLIT_TO_HFILE;
 import static org.apache.hadoop.hbase.wal.WALSplitUtil.finishSplitLogFile;

 import java.io.EOFException;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InterruptedIOException;
 import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.TreeMap;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicReference;
 import org.apache.commons.lang3.ArrayUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.TableDescriptors;
 import org.apache.hadoop.hbase.coordination.SplitLogWorkerCoordination;
 import org.apache.hadoop.hbase.master.SplitLogManager;
 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
 import org.apache.hadoop.hbase.monitoring.TaskMonitor;
 import org.apache.hadoop.hbase.procedure2.util.StringUtils;
 import org.apache.hadoop.hbase.regionserver.LastSequenceId;
 import org.apache.hadoop.hbase.regionserver.RegionServerServices;
 import org.apache.hadoop.hbase.regionserver.wal.WALCellCodec;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.CancelableProgressable;
 import org.apache.hadoop.hbase.util.CommonFSUtils;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.hadoop.hbase.util.FSTableDescriptors;
 import org.apache.hadoop.hbase.util.RecoverLeaseFSUtils;
 import org.apache.hadoop.hbase.wal.WAL.Entry;
 import org.apache.hadoop.hbase.wal.WAL.Reader;
 import org.apache.hadoop.hbase.zookeeper.ZKSplitLog;
 import org.apache.hadoop.ipc.RemoteException;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
 import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
 import org.apache.hbase.thirdparty.com.google.protobuf.TextFormat;

 import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.RegionStoreSequenceIds;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.StoreSequenceId;

 /**
  * This class is responsible for splitting up a bunch of regionserver commit log
  * files that are no longer being written to, into new files, one per region, for
  * recovering data on startup. Delete the old log files when finished.
  */
 @InterfaceAudience.Private
 public class WALSplitter {
   private static final Logger LOG = LoggerFactory.getLogger(WALSplitter.class);

   /** By default we retry errors in splitting, rather than skipping. */
   public static final boolean SPLIT_SKIP_ERRORS_DEFAULT = false;

   // Parameters for split process
   protected final Path walDir;
   protected final FileSystem walFS;
   protected final Configuration conf;
   final Path rootDir;
   final FileSystem rootFS;
   final RegionServerServices rsServices;
   final TableDescriptors tableDescriptors;

   // Major subcomponents of the split process.
   // These are separated into inner classes to make testing easier.
   OutputSink outputSink;
   private EntryBuffers entryBuffers;

   private SplitLogWorkerCoordination splitLogWorkerCoordination;
   private final WALFactory walFactory;

   private MonitoredTask status;

   // For checking the latest flushed sequence id
   protected final LastSequenceId sequenceIdChecker;

   // Map encodedRegionName -> lastFlushedSequenceId
   protected Map<String, Long> lastFlushedSequenceIds = new ConcurrentHashMap<>();

   // Map encodedRegionName -> maxSeqIdInStores
   protected Map<String, Map<byte[], Long>> regionMaxSeqIdInStores = new ConcurrentHashMap<>();

   // the file being split currently
   private FileStatus fileBeingSplit;

   private final String tmpDirName;

   public final static String SPLIT_WRITER_CREATION_BOUNDED = "hbase.split.writer.creation.bounded";
   public final static String SPLIT_WAL_BUFFER_SIZE = "hbase.regionserver.hlog.splitlog.buffersize";
   public final static String SPLIT_WAL_WRITER_THREADS =
       "hbase.regionserver.hlog.splitlog.writer.threads";

   @VisibleForTesting
   WALSplitter(final WALFactory factory, Configuration conf, Path walDir, FileSystem walFS,
       Path rootDir, FileSystem rootFS, LastSequenceId idChecker,
       SplitLogWorkerCoordination splitLogWorkerCoordination, RegionServerServices rsServices) {
     this.conf = HBaseConfiguration.create(conf);
     String codecClassName =
         conf.get(WALCellCodec.WAL_CELL_CODEC_CLASS_KEY, WALCellCodec.class.getName());
     this.conf.set(HConstants.RPC_CODEC_CONF_KEY, codecClassName);
     this.walDir = walDir;
     this.walFS = walFS;
     this.rootDir = rootDir;
     this.rootFS = rootFS;
     this.sequenceIdChecker = idChecker;
     this.splitLogWorkerCoordination = splitLogWorkerCoordination;
     this.rsServices = rsServices;
     if (rsServices != null) {
       this.tableDescriptors = rsServices.getTableDescriptors();
     } else {
       this.tableDescriptors = new FSTableDescriptors(rootFS, rootDir, true, true);
     }

     this.walFactory = factory;
     PipelineController controller = new PipelineController();
     this.tmpDirName =
       conf.get(HConstants.TEMPORARY_FS_DIRECTORY_KEY, HConstants.DEFAULT_TEMPORARY_HDFS_DIRECTORY);


     // if we limit the number of writers opened for sinking recovered edits
     boolean splitWriterCreationBounded = conf.getBoolean(SPLIT_WRITER_CREATION_BOUNDED, false);
     boolean splitToHFile = conf.getBoolean(WAL_SPLIT_TO_HFILE, DEFAULT_WAL_SPLIT_TO_HFILE);
     long bufferSize = this.conf.getLong(SPLIT_WAL_BUFFER_SIZE, 128 * 1024 * 1024);
     int numWriterThreads = this.conf.getInt(SPLIT_WAL_WRITER_THREADS, 3);

     if (splitToHFile) {
       entryBuffers = new BoundedEntryBuffers(controller, bufferSize);
       outputSink =
           new BoundedRecoveredHFilesOutputSink(this, controller, entryBuffers, numWriterThreads);
     } else if (splitWriterCreationBounded) {
       entryBuffers = new BoundedEntryBuffers(controller, bufferSize);
       outputSink =
           new BoundedRecoveredEditsOutputSink(this, controller, entryBuffers, numWriterThreads);
     } else {
       entryBuffers = new EntryBuffers(controller, bufferSize);
       outputSink = new RecoveredEditsOutputSink(this, controller, entryBuffers, numWriterThreads);
     }
   }

   WALFactory getWalFactory(){
     return this.walFactory;
   }

   FileStatus getFileBeingSplit() {
     return fileBeingSplit;
   }

   String getTmpDirName() {
     return this.tmpDirName;
   }

   Map<String, Map<byte[], Long>> getRegionMaxSeqIdInStores() {
     return regionMaxSeqIdInStores;
   }

   /**
    * Splits a WAL file into region's recovered-edits directory.
    * This is the main entry point for distributed log splitting from SplitLogWorker.
    * <p>
    * If the log file has N regions then N recovered.edits files will be produced.
    * <p>
    * @return false if it is interrupted by the progress-able.
    */
   public static boolean splitLogFile(Path walDir, FileStatus logfile, FileSystem walFS,
       Configuration conf, CancelableProgressable reporter, LastSequenceId idChecker,
       SplitLogWorkerCoordination splitLogWorkerCoordination, WALFactory factory,
       RegionServerServices rsServices) throws IOException {
     Path rootDir = CommonFSUtils.getRootDir(conf);
     FileSystem rootFS = rootDir.getFileSystem(conf);
     WALSplitter s = new WALSplitter(factory, conf, walDir, walFS, rootDir, rootFS, idChecker,
         splitLogWorkerCoordination, rsServices);
     return s.splitLogFile(logfile, reporter);
   }

   // A wrapper to split one log folder using the method used by distributed
   // log splitting. Used by tools and unit tests. It should be package private.
   // It is public only because TestWALObserver is in a different package,
   // which uses this method to do log splitting.
   @VisibleForTesting
   public static List<Path> split(Path walDir, Path logDir, Path oldLogDir, FileSystem walFS,
       Configuration conf, final WALFactory factory) throws IOException {
     Path rootDir = CommonFSUtils.getRootDir(conf);
     FileSystem rootFS = rootDir.getFileSystem(conf);
     final FileStatus[] logfiles =
         SplitLogManager.getFileList(conf, Collections.singletonList(logDir), null);
     List<Path> splits = new ArrayList<>();
     if (ArrayUtils.isNotEmpty(logfiles)) {
       for (FileStatus logfile : logfiles) {
         WALSplitter s =
             new WALSplitter(factory, conf, walDir, walFS, rootDir, rootFS, null, null, null);
         if (s.splitLogFile(logfile, null)) {
           finishSplitLogFile(walDir, oldLogDir, logfile.getPath(), conf);
           if (s.outputSink.splits != null) {
             splits.addAll(s.outputSink.splits);
           }
         }
       }
     }
     if (!walFS.delete(logDir, true)) {
       throw new IOException("Unable to delete src dir: " + logDir);
     }
     return splits;
   }

   /**
    * log splitting implementation, splits one log file.
    * @param logfile should be an actual log file.
    */
   @VisibleForTesting
   boolean splitLogFile(FileStatus logfile, CancelableProgressable reporter) throws IOException {
     Preconditions.checkState(status == null);
     Preconditions.checkArgument(logfile.isFile(),
         "passed in file status is for something other than a regular file.");
     boolean isCorrupted = false;
     boolean skipErrors = conf.getBoolean("hbase.hlog.split.skip.errors",
       SPLIT_SKIP_ERRORS_DEFAULT);
     int interval = conf.getInt("hbase.splitlog.report.interval.loglines", 1024);
     Path logPath = logfile.getPath();
     boolean outputSinkStarted = false;
     boolean progressFailed = false;
     int editsCount = 0;
     int editsSkipped = 0;

     status = TaskMonitor.get().createStatus(
           "Splitting log file " + logfile.getPath() + "into a temporary staging area.");
     Reader logFileReader = null;
     this.fileBeingSplit = logfile;
     long startTS = EnvironmentEdgeManager.currentTime();
     try {
       long logLength = logfile.getLen();
       LOG.info("Splitting WAL={}, size={} ({} bytes)", logPath, StringUtils.humanSize(logLength),
           logLength);
       status.setStatus("Opening log file");
       if (reporter != null && !reporter.progress()) {
         progressFailed = true;
         return false;
       }
       logFileReader = getReader(logfile, skipErrors, reporter);
       if (logFileReader == null) {
         LOG.warn("Nothing to split in WAL={}", logPath);
         return true;
       }
       long openCost = EnvironmentEdgeManager.currentTime() - startTS;
       LOG.info("Open WAL={} cost {} ms", logPath, openCost);
       int numOpenedFilesBeforeReporting = conf.getInt("hbase.splitlog.report.openedfiles", 3);
       int numOpenedFilesLastCheck = 0;
       outputSink.setReporter(reporter);
       outputSink.startWriterThreads();
       outputSinkStarted = true;
       Entry entry;
       Long lastFlushedSequenceId = -1L;
       startTS = EnvironmentEdgeManager.currentTime();
       while ((entry = getNextLogLine(logFileReader, logPath, skipErrors)) != null) {
         byte[] region = entry.getKey().getEncodedRegionName();
         String encodedRegionNameAsStr = Bytes.toString(region);
         lastFlushedSequenceId = lastFlushedSequenceIds.get(encodedRegionNameAsStr);
         if (lastFlushedSequenceId == null) {
           if (sequenceIdChecker != null) {
             RegionStoreSequenceIds ids = sequenceIdChecker.getLastSequenceId(region);
             Map<byte[], Long> maxSeqIdInStores = new TreeMap<>(Bytes.BYTES_COMPARATOR);
             for (StoreSequenceId storeSeqId : ids.getStoreSequenceIdList()) {
               maxSeqIdInStores.put(storeSeqId.getFamilyName().toByteArray(),
                 storeSeqId.getSequenceId());
             }
             regionMaxSeqIdInStores.put(encodedRegionNameAsStr, maxSeqIdInStores);
             lastFlushedSequenceId = ids.getLastFlushedSequenceId();
             if (LOG.isDebugEnabled()) {
               LOG.debug("DLS Last flushed sequenceid for " + encodedRegionNameAsStr + ": " +
                   TextFormat.shortDebugString(ids));
             }
           }
           if (lastFlushedSequenceId == null) {
             lastFlushedSequenceId = -1L;
           }
           lastFlushedSequenceIds.put(encodedRegionNameAsStr, lastFlushedSequenceId);
         }
         if (lastFlushedSequenceId >= entry.getKey().getSequenceId()) {
           editsSkipped++;
           continue;
         }
         // Don't send Compaction/Close/Open region events to recovered edit type sinks.
         if (entry.getEdit().isMetaEdit() && !outputSink.keepRegionEvent(entry)) {
           editsSkipped++;
           continue;
         }
         entryBuffers.appendEntry(entry);
         editsCount++;
         int moreWritersFromLastCheck = this.getNumOpenWriters() - numOpenedFilesLastCheck;
         // If sufficient edits have passed, check if we should report progress.
         if (editsCount % interval == 0
             || moreWritersFromLastCheck > numOpenedFilesBeforeReporting) {
           numOpenedFilesLastCheck = this.getNumOpenWriters();
           String countsStr = (editsCount - (editsSkipped + outputSink.getTotalSkippedEdits()))
               + " edits, skipped " + editsSkipped + " edits.";
           status.setStatus("Split " + countsStr);
           if (reporter != null && !reporter.progress()) {
             progressFailed = true;
             return false;
           }
         }
       }
     } catch (InterruptedException ie) {
       IOException iie = new InterruptedIOException();
       iie.initCause(ie);
       throw iie;
     } catch (CorruptedLogFileException e) {
       LOG.warn("Could not parse, corrupted WAL={}", logPath, e);
       if (splitLogWorkerCoordination != null) {
         // Some tests pass in a csm of null.
         splitLogWorkerCoordination.markCorrupted(walDir, logfile.getPath().getName(), walFS);
       } else {
         // for tests only
         ZKSplitLog.markCorrupted(walDir, logfile.getPath().getName(), walFS);
       }
       isCorrupted = true;
     } catch (IOException e) {
       e = e instanceof RemoteException ? ((RemoteException) e).unwrapRemoteException() : e;
       throw e;
     } finally {
       LOG.debug("Finishing writing output logs and closing down");
       try {
         if (null != logFileReader) {
           logFileReader.close();
         }
       } catch (IOException exception) {
         LOG.warn("Could not close WAL reader", exception);
       }
       try {
         if (outputSinkStarted) {
           // Set progress_failed to true as the immediate following statement will reset its value
           // when close() throws exception, progress_failed has the right value
           progressFailed = true;
           progressFailed = outputSink.close() == null;
         }
       } finally {
         long processCost = EnvironmentEdgeManager.currentTime() - startTS;
         // See if length got updated post lease recovery
         String msg = "Processed " + editsCount + " edits across " +
             outputSink.getNumberOfRecoveredRegions() + " regions cost " + processCost +
             " ms; edits skipped=" + editsSkipped + "; WAL=" + logPath + ", size=" +
             StringUtils.humanSize(logfile.getLen()) + ", length=" + logfile.getLen() +
             ", corrupted=" + isCorrupted + ", progress failed=" + progressFailed;
         LOG.info(msg);
         status.markComplete(msg);
       }
     }
     return !progressFailed;
   }

   /**
    * Create a new {@link Reader} for reading logs to split.
    */
   private Reader getReader(FileStatus file, boolean skipErrors, CancelableProgressable reporter)
       throws IOException, CorruptedLogFileException {
     Path path = file.getPath();
     long length = file.getLen();
     Reader in;

     // Check for possibly empty file. With appends, currently Hadoop reports a
     // zero length even if the file has been sync'd. Revisit if HDFS-376 or
     // HDFS-878 is committed.
     if (length <= 0) {
       LOG.warn("File {} might be still open, length is 0", path);
     }

     try {
       RecoverLeaseFSUtils.recoverFileLease(walFS, path, conf, reporter);
       try {
         in = getReader(path, reporter);
       } catch (EOFException e) {
         if (length <= 0) {
           // TODO should we ignore an empty, not-last log file if skip.errors
           // is false? Either way, the caller should decide what to do. E.g.
           // ignore if this is the last log in sequence.
           // TODO is this scenario still possible if the log has been
           // recovered (i.e. closed)
           LOG.warn("Could not open {} for reading. File is empty", path, e);
         }
         // EOFException being ignored
         return null;
       }
     } catch (IOException e) {
       if (e instanceof FileNotFoundException) {
         // A wal file may not exist anymore. Nothing can be recovered so move on
         LOG.warn("File {} does not exist anymore", path, e);
         return null;
       }
       if (!skipErrors || e instanceof InterruptedIOException) {
         throw e; // Don't mark the file corrupted if interrupted, or not skipErrors
       }
       throw new CorruptedLogFileException("skipErrors=true Could not open wal "
         + path + " ignoring", e);
     }
     return in;
   }

   private Entry getNextLogLine(Reader in, Path path, boolean skipErrors)
       throws CorruptedLogFileException, IOException {
     try {
       return in.next();
     } catch (EOFException eof) {
       // truncated files are expected if a RS crashes (see HBASE-2643)
       LOG.info("EOF from wal {}. Continuing.", path);
       return null;
     } catch (IOException e) {
       // If the IOE resulted from bad file format,
       // then this problem is idempotent and retrying won't help
       if (e.getCause() != null && (e.getCause() instanceof ParseException
           || e.getCause() instanceof org.apache.hadoop.fs.ChecksumException)) {
         LOG.warn("Parse exception from wal {}. Continuing", path, e);
         return null;
       }
       if (!skipErrors) {
         throw e;
       }
       throw new CorruptedLogFileException("skipErrors=true Ignoring exception"
         + " while parsing wal " + path + ". Marking as corrupted", e);
     }
   }

   /**
    * Create a new {@link WALProvider.Writer} for writing log splits.
    * @return a new Writer instance, caller should close
    */
   protected WALProvider.Writer createWriter(Path logfile) throws IOException {
     return walFactory.createRecoveredEditsWriter(walFS, logfile);
   }

   /**
    * Create a new {@link Reader} for reading logs to split.
    * @return new Reader instance, caller should close
    */
   protected Reader getReader(Path curLogFile, CancelableProgressable reporter) throws IOException {
     return walFactory.createReader(walFS, curLogFile, reporter);
   }

   /**
    * Get current open writers
    */
   private int getNumOpenWriters() {
     int result = 0;
     if (this.outputSink != null) {
       result += this.outputSink.getNumOpenWriters();
     }
     return result;
   }

   /**
    * Contains some methods to control WAL-entries producer / consumer interactions
    */
   public static class PipelineController {
     // If an exception is thrown by one of the other threads, it will be
     // stored here.
     AtomicReference<Throwable> thrown = new AtomicReference<>();

     // Wait/notify for when data has been produced by the writer thread,
     // consumed by the reader thread, or an exception occurred
     final Object dataAvailable = new Object();

     void writerThreadError(Throwable t) {
       thrown.compareAndSet(null, t);
     }

     /**
      * Check for errors in the writer threads. If any is found, rethrow it.
      */
     void checkForErrors() throws IOException {
       Throwable thrown = this.thrown.get();
       if (thrown == null) {
         return;
       }
       if (thrown instanceof IOException) {
         throw new IOException(thrown);
       } else {
         throw new RuntimeException(thrown);
       }
     }
   }

   static class CorruptedLogFileException extends Exception {
     private static final long serialVersionUID = 1L;

     CorruptedLogFileException(String s) {
       super(s);
     }

     /**
      * CorruptedLogFileException with cause
      *
      * @param message the message for this exception
      * @param cause the cause for this exception
      */
     CorruptedLogFileException(String message, Throwable cause) {
       super(message, cause);
     }
   }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.hadoop.hbase.wal;

	import static org.apache.hadoop.hbase.wal.BoundedRecoveredHFilesOutputSink.DEFAULT_WAL_SPLIT_TO_HFILE;
	import static org.apache.hadoop.hbase.wal.BoundedRecoveredHFilesOutputSink.WAL_SPLIT_TO_HFILE;
	import static org.apache.hadoop.hbase.wal.WALSplitUtil.finishSplitLogFile;

	import java.io.EOFException;
	import java.io.FileNotFoundException;
	import java.io.IOException;
	import java.io.InterruptedIOException;
	import java.text.ParseException;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.List;
	import java.util.Map;
	import java.util.TreeMap;
	import java.util.concurrent.ConcurrentHashMap;
	import java.util.concurrent.atomic.AtomicReference;
	import org.apache.commons.lang3.ArrayUtils;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.fs.FileStatus;
	import org.apache.hadoop.fs.FileSystem;
	import org.apache.hadoop.fs.Path;
	import org.apache.hadoop.hbase.HBaseConfiguration;
	import org.apache.hadoop.hbase.HConstants;
	import org.apache.hadoop.hbase.TableDescriptors;
	import org.apache.hadoop.hbase.coordination.SplitLogWorkerCoordination;
	import org.apache.hadoop.hbase.master.SplitLogManager;
	import org.apache.hadoop.hbase.monitoring.MonitoredTask;
	import org.apache.hadoop.hbase.monitoring.TaskMonitor;
	import org.apache.hadoop.hbase.procedure2.util.StringUtils;
	import org.apache.hadoop.hbase.regionserver.LastSequenceId;
	import org.apache.hadoop.hbase.regionserver.RegionServerServices;
	import org.apache.hadoop.hbase.regionserver.wal.WALCellCodec;
	import org.apache.hadoop.hbase.util.Bytes;
	import org.apache.hadoop.hbase.util.CancelableProgressable;
	import org.apache.hadoop.hbase.util.CommonFSUtils;
	import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
	import org.apache.hadoop.hbase.util.FSTableDescriptors;
	import org.apache.hadoop.hbase.util.RecoverLeaseFSUtils;
	import org.apache.hadoop.hbase.wal.WAL.Entry;
	import org.apache.hadoop.hbase.wal.WAL.Reader;
	import org.apache.hadoop.hbase.zookeeper.ZKSplitLog;
	import org.apache.hadoop.ipc.RemoteException;
	import org.apache.yetus.audience.InterfaceAudience;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
	import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
	import org.apache.hbase.thirdparty.com.google.protobuf.TextFormat;

	import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.RegionStoreSequenceIds;
	import org.apache.hadoop.hbase.shaded.protobuf.generated.ClusterStatusProtos.StoreSequenceId;

	/**
	* This class is responsible for splitting up a bunch of regionserver commit log
	* files that are no longer being written to, into new files, one per region, for
	* recovering data on startup. Delete the old log files when finished.
	*/
	@InterfaceAudience.Private
	public class WALSplitter {
	private static final Logger LOG = LoggerFactory.getLogger(WALSplitter.class);

	/** By default we retry errors in splitting, rather than skipping. */
	public static final boolean SPLIT_SKIP_ERRORS_DEFAULT = false;

	// Parameters for split process
	protected final Path walDir;
	protected final FileSystem walFS;
	protected final Configuration conf;
	final Path rootDir;
	final FileSystem rootFS;
	final RegionServerServices rsServices;
	final TableDescriptors tableDescriptors;

	// Major subcomponents of the split process.
	// These are separated into inner classes to make testing easier.
	OutputSink outputSink;
	private EntryBuffers entryBuffers;

	private SplitLogWorkerCoordination splitLogWorkerCoordination;
	private final WALFactory walFactory;

	private MonitoredTask status;

	// For checking the latest flushed sequence id
	protected final LastSequenceId sequenceIdChecker;

	// Map encodedRegionName -> lastFlushedSequenceId
	protected Map<String, Long> lastFlushedSequenceIds = new ConcurrentHashMap<>();

	// Map encodedRegionName -> maxSeqIdInStores
	protected Map<String, Map<byte[], Long>> regionMaxSeqIdInStores = new ConcurrentHashMap<>();

	// the file being split currently
	private FileStatus fileBeingSplit;

	private final String tmpDirName;

	public final static String SPLIT_WRITER_CREATION_BOUNDED = "hbase.split.writer.creation.bounded";
	public final static String SPLIT_WAL_BUFFER_SIZE = "hbase.regionserver.hlog.splitlog.buffersize";
	public final static String SPLIT_WAL_WRITER_THREADS =
	"hbase.regionserver.hlog.splitlog.writer.threads";

	@VisibleForTesting
	WALSplitter(final WALFactory factory, Configuration conf, Path walDir, FileSystem walFS,
	Path rootDir, FileSystem rootFS, LastSequenceId idChecker,
	SplitLogWorkerCoordination splitLogWorkerCoordination, RegionServerServices rsServices) {
	this.conf = HBaseConfiguration.create(conf);
	String codecClassName =
	conf.get(WALCellCodec.WAL_CELL_CODEC_CLASS_KEY, WALCellCodec.class.getName());
	this.conf.set(HConstants.RPC_CODEC_CONF_KEY, codecClassName);
	this.walDir = walDir;
	this.walFS = walFS;
	this.rootDir = rootDir;
	this.rootFS = rootFS;
	this.sequenceIdChecker = idChecker;
	this.splitLogWorkerCoordination = splitLogWorkerCoordination;
	this.rsServices = rsServices;
	if (rsServices != null) {
	this.tableDescriptors = rsServices.getTableDescriptors();
	} else {
	this.tableDescriptors = new FSTableDescriptors(rootFS, rootDir, true, true);
	}

	this.walFactory = factory;
	PipelineController controller = new PipelineController();
	this.tmpDirName =
	conf.get(HConstants.TEMPORARY_FS_DIRECTORY_KEY, HConstants.DEFAULT_TEMPORARY_HDFS_DIRECTORY);


	// if we limit the number of writers opened for sinking recovered edits
	boolean splitWriterCreationBounded = conf.getBoolean(SPLIT_WRITER_CREATION_BOUNDED, false);
	boolean splitToHFile = conf.getBoolean(WAL_SPLIT_TO_HFILE, DEFAULT_WAL_SPLIT_TO_HFILE);
	long bufferSize = this.conf.getLong(SPLIT_WAL_BUFFER_SIZE, 128 * 1024 * 1024);
	int numWriterThreads = this.conf.getInt(SPLIT_WAL_WRITER_THREADS, 3);

	if (splitToHFile) {
	entryBuffers = new BoundedEntryBuffers(controller, bufferSize);
	outputSink =
	new BoundedRecoveredHFilesOutputSink(this, controller, entryBuffers, numWriterThreads);
	} else if (splitWriterCreationBounded) {
	entryBuffers = new BoundedEntryBuffers(controller, bufferSize);
	outputSink =
	new BoundedRecoveredEditsOutputSink(this, controller, entryBuffers, numWriterThreads);
	} else {
	entryBuffers = new EntryBuffers(controller, bufferSize);
	outputSink = new RecoveredEditsOutputSink(this, controller, entryBuffers, numWriterThreads);
	}
	}

	WALFactory getWalFactory(){
	return this.walFactory;
	}

	FileStatus getFileBeingSplit() {
	return fileBeingSplit;
	}

	String getTmpDirName() {
	return this.tmpDirName;
	}

	Map<String, Map<byte[], Long>> getRegionMaxSeqIdInStores() {
	return regionMaxSeqIdInStores;
	}

	/**
	* Splits a WAL file into region's recovered-edits directory.
	* This is the main entry point for distributed log splitting from SplitLogWorker.
	* <p>
	* If the log file has N regions then N recovered.edits files will be produced.
	* <p>
	* @return false if it is interrupted by the progress-able.
	*/
	public static boolean splitLogFile(Path walDir, FileStatus logfile, FileSystem walFS,
	Configuration conf, CancelableProgressable reporter, LastSequenceId idChecker,
	SplitLogWorkerCoordination splitLogWorkerCoordination, WALFactory factory,
	RegionServerServices rsServices) throws IOException {
	Path rootDir = CommonFSUtils.getRootDir(conf);
	FileSystem rootFS = rootDir.getFileSystem(conf);
	WALSplitter s = new WALSplitter(factory, conf, walDir, walFS, rootDir, rootFS, idChecker,
	splitLogWorkerCoordination, rsServices);
	return s.splitLogFile(logfile, reporter);
	}

	// A wrapper to split one log folder using the method used by distributed
	// log splitting. Used by tools and unit tests. It should be package private.
	// It is public only because TestWALObserver is in a different package,
	// which uses this method to do log splitting.
	@VisibleForTesting
	public static List<Path> split(Path walDir, Path logDir, Path oldLogDir, FileSystem walFS,
	Configuration conf, final WALFactory factory) throws IOException {
	Path rootDir = CommonFSUtils.getRootDir(conf);
	FileSystem rootFS = rootDir.getFileSystem(conf);
	final FileStatus[] logfiles =
	SplitLogManager.getFileList(conf, Collections.singletonList(logDir), null);
	List<Path> splits = new ArrayList<>();
	if (ArrayUtils.isNotEmpty(logfiles)) {
	for (FileStatus logfile : logfiles) {
	WALSplitter s =
	new WALSplitter(factory, conf, walDir, walFS, rootDir, rootFS, null, null, null);
	if (s.splitLogFile(logfile, null)) {
	finishSplitLogFile(walDir, oldLogDir, logfile.getPath(), conf);
	if (s.outputSink.splits != null) {
	splits.addAll(s.outputSink.splits);
	}
	}
	}
	}
	if (!walFS.delete(logDir, true)) {
	throw new IOException("Unable to delete src dir: " + logDir);
	}
	return splits;
	}

	/**
	* log splitting implementation, splits one log file.
	* @param logfile should be an actual log file.
	*/
	@VisibleForTesting
	boolean splitLogFile(FileStatus logfile, CancelableProgressable reporter) throws IOException {
	Preconditions.checkState(status == null);
	Preconditions.checkArgument(logfile.isFile(),
	"passed in file status is for something other than a regular file.");
	boolean isCorrupted = false;
	boolean skipErrors = conf.getBoolean("hbase.hlog.split.skip.errors",
	SPLIT_SKIP_ERRORS_DEFAULT);
	int interval = conf.getInt("hbase.splitlog.report.interval.loglines", 1024);
	Path logPath = logfile.getPath();
	boolean outputSinkStarted = false;
	boolean progressFailed = false;
	int editsCount = 0;
	int editsSkipped = 0;

	status = TaskMonitor.get().createStatus(
	"Splitting log file " + logfile.getPath() + "into a temporary staging area.");
	Reader logFileReader = null;
	this.fileBeingSplit = logfile;
	long startTS = EnvironmentEdgeManager.currentTime();
	try {
	long logLength = logfile.getLen();
	LOG.info("Splitting WAL={}, size={} ({} bytes)", logPath, StringUtils.humanSize(logLength),
	logLength);
	status.setStatus("Opening log file");
	if (reporter != null && !reporter.progress()) {
	progressFailed = true;
	return false;
	}
	logFileReader = getReader(logfile, skipErrors, reporter);
	if (logFileReader == null) {
	LOG.warn("Nothing to split in WAL={}", logPath);
	return true;
	}
	long openCost = EnvironmentEdgeManager.currentTime() - startTS;
	LOG.info("Open WAL={} cost {} ms", logPath, openCost);
	int numOpenedFilesBeforeReporting = conf.getInt("hbase.splitlog.report.openedfiles", 3);
	int numOpenedFilesLastCheck = 0;
	outputSink.setReporter(reporter);
	outputSink.startWriterThreads();
	outputSinkStarted = true;
	Entry entry;
	Long lastFlushedSequenceId = -1L;
	startTS = EnvironmentEdgeManager.currentTime();
	while ((entry = getNextLogLine(logFileReader, logPath, skipErrors)) != null) {
	byte[] region = entry.getKey().getEncodedRegionName();
	String encodedRegionNameAsStr = Bytes.toString(region);
	lastFlushedSequenceId = lastFlushedSequenceIds.get(encodedRegionNameAsStr);
	if (lastFlushedSequenceId == null) {
	if (sequenceIdChecker != null) {
	RegionStoreSequenceIds ids = sequenceIdChecker.getLastSequenceId(region);
	Map<byte[], Long> maxSeqIdInStores = new TreeMap<>(Bytes.BYTES_COMPARATOR);
	for (StoreSequenceId storeSeqId : ids.getStoreSequenceIdList()) {
	maxSeqIdInStores.put(storeSeqId.getFamilyName().toByteArray(),
	storeSeqId.getSequenceId());
	}
	regionMaxSeqIdInStores.put(encodedRegionNameAsStr, maxSeqIdInStores);
	lastFlushedSequenceId = ids.getLastFlushedSequenceId();
	if (LOG.isDebugEnabled()) {
	LOG.debug("DLS Last flushed sequenceid for " + encodedRegionNameAsStr + ": " +
	TextFormat.shortDebugString(ids));
	}
	}
	if (lastFlushedSequenceId == null) {
	lastFlushedSequenceId = -1L;
	}
	lastFlushedSequenceIds.put(encodedRegionNameAsStr, lastFlushedSequenceId);
	}
	if (lastFlushedSequenceId >= entry.getKey().getSequenceId()) {
	editsSkipped++;
	continue;
	}
	// Don't send Compaction/Close/Open region events to recovered edit type sinks.
	if (entry.getEdit().isMetaEdit() && !outputSink.keepRegionEvent(entry)) {
	editsSkipped++;
	continue;
	}
	entryBuffers.appendEntry(entry);
	editsCount++;
	int moreWritersFromLastCheck = this.getNumOpenWriters() - numOpenedFilesLastCheck;
	// If sufficient edits have passed, check if we should report progress.
	if (editsCount % interval == 0
	\|\| moreWritersFromLastCheck > numOpenedFilesBeforeReporting) {
	numOpenedFilesLastCheck = this.getNumOpenWriters();
	String countsStr = (editsCount - (editsSkipped + outputSink.getTotalSkippedEdits()))
	+ " edits, skipped " + editsSkipped + " edits.";
	status.setStatus("Split " + countsStr);
	if (reporter != null && !reporter.progress()) {
	progressFailed = true;
	return false;
	}
	}
	}
	} catch (InterruptedException ie) {
	IOException iie = new InterruptedIOException();
	iie.initCause(ie);
	throw iie;
	} catch (CorruptedLogFileException e) {
	LOG.warn("Could not parse, corrupted WAL={}", logPath, e);
	if (splitLogWorkerCoordination != null) {
	// Some tests pass in a csm of null.
	splitLogWorkerCoordination.markCorrupted(walDir, logfile.getPath().getName(), walFS);
	} else {
	// for tests only
	ZKSplitLog.markCorrupted(walDir, logfile.getPath().getName(), walFS);
	}
	isCorrupted = true;
	} catch (IOException e) {
	e = e instanceof RemoteException ? ((RemoteException) e).unwrapRemoteException() : e;
	throw e;
	} finally {
	LOG.debug("Finishing writing output logs and closing down");
	try {
	if (null != logFileReader) {
	logFileReader.close();
	}
	} catch (IOException exception) {
	LOG.warn("Could not close WAL reader", exception);
	}
	try {
	if (outputSinkStarted) {
	// Set progress_failed to true as the immediate following statement will reset its value
	// when close() throws exception, progress_failed has the right value
	progressFailed = true;
	progressFailed = outputSink.close() == null;
	}
	} finally {
	long processCost = EnvironmentEdgeManager.currentTime() - startTS;
	// See if length got updated post lease recovery
	String msg = "Processed " + editsCount + " edits across " +
	outputSink.getNumberOfRecoveredRegions() + " regions cost " + processCost +
	" ms; edits skipped=" + editsSkipped + "; WAL=" + logPath + ", size=" +
	StringUtils.humanSize(logfile.getLen()) + ", length=" + logfile.getLen() +
	", corrupted=" + isCorrupted + ", progress failed=" + progressFailed;
	LOG.info(msg);
	status.markComplete(msg);
	}
	}
	return !progressFailed;
	}

	/**
	* Create a new {@link Reader} for reading logs to split.
	*/
	private Reader getReader(FileStatus file, boolean skipErrors, CancelableProgressable reporter)
	throws IOException, CorruptedLogFileException {
	Path path = file.getPath();
	long length = file.getLen();
	Reader in;

	// Check for possibly empty file. With appends, currently Hadoop reports a
	// zero length even if the file has been sync'd. Revisit if HDFS-376 or
	// HDFS-878 is committed.
	if (length <= 0) {
	LOG.warn("File {} might be still open, length is 0", path);
	}

	try {
	RecoverLeaseFSUtils.recoverFileLease(walFS, path, conf, reporter);
	try {
	in = getReader(path, reporter);
	} catch (EOFException e) {
	if (length <= 0) {
	// TODO should we ignore an empty, not-last log file if skip.errors
	// is false? Either way, the caller should decide what to do. E.g.
	// ignore if this is the last log in sequence.
	// TODO is this scenario still possible if the log has been
	// recovered (i.e. closed)
	LOG.warn("Could not open {} for reading. File is empty", path, e);
	}
	// EOFException being ignored
	return null;
	}
	} catch (IOException e) {
	if (e instanceof FileNotFoundException) {
	// A wal file may not exist anymore. Nothing can be recovered so move on
	LOG.warn("File {} does not exist anymore", path, e);
	return null;
	}
	if (!skipErrors \|\| e instanceof InterruptedIOException) {
	throw e; // Don't mark the file corrupted if interrupted, or not skipErrors
	}
	throw new CorruptedLogFileException("skipErrors=true Could not open wal "
	+ path + " ignoring", e);
	}
	return in;
	}

	private Entry getNextLogLine(Reader in, Path path, boolean skipErrors)
	throws CorruptedLogFileException, IOException {
	try {
	return in.next();
	} catch (EOFException eof) {
	// truncated files are expected if a RS crashes (see HBASE-2643)
	LOG.info("EOF from wal {}. Continuing.", path);
	return null;
	} catch (IOException e) {
	// If the IOE resulted from bad file format,
	// then this problem is idempotent and retrying won't help
	if (e.getCause() != null && (e.getCause() instanceof ParseException
	\|\| e.getCause() instanceof org.apache.hadoop.fs.ChecksumException)) {
	LOG.warn("Parse exception from wal {}. Continuing", path, e);
	return null;
	}
	if (!skipErrors) {
	throw e;
	}
	throw new CorruptedLogFileException("skipErrors=true Ignoring exception"
	+ " while parsing wal " + path + ". Marking as corrupted", e);
	}
	}

	/**
	* Create a new {@link WALProvider.Writer} for writing log splits.
	* @return a new Writer instance, caller should close
	*/
	protected WALProvider.Writer createWriter(Path logfile) throws IOException {
	return walFactory.createRecoveredEditsWriter(walFS, logfile);
	}

	/**
	* Create a new {@link Reader} for reading logs to split.
	* @return new Reader instance, caller should close
	*/
	protected Reader getReader(Path curLogFile, CancelableProgressable reporter) throws IOException {
	return walFactory.createReader(walFS, curLogFile, reporter);
	}

	/**
	* Get current open writers
	*/
	private int getNumOpenWriters() {
	int result = 0;
	if (this.outputSink != null) {
	result += this.outputSink.getNumOpenWriters();
	}
	return result;
	}

	/**
	* Contains some methods to control WAL-entries producer / consumer interactions
	*/
	public static class PipelineController {
	// If an exception is thrown by one of the other threads, it will be
	// stored here.
	AtomicReference<Throwable> thrown = new AtomicReference<>();

	// Wait/notify for when data has been produced by the writer thread,
	// consumed by the reader thread, or an exception occurred
	final Object dataAvailable = new Object();

	void writerThreadError(Throwable t) {
	thrown.compareAndSet(null, t);
	}

	/**
	* Check for errors in the writer threads. If any is found, rethrow it.
	*/
	void checkForErrors() throws IOException {
	Throwable thrown = this.thrown.get();
	if (thrown == null) {
	return;
	}
	if (thrown instanceof IOException) {
	throw new IOException(thrown);
	} else {
	throw new RuntimeException(thrown);
	}
	}
	}

	static class CorruptedLogFileException extends Exception {
	private static final long serialVersionUID = 1L;

	CorruptedLogFileException(String s) {
	super(s);
	}

	/**
	* CorruptedLogFileException with cause
	*
	* @param message the message for this exception
	* @param cause the cause for this exception
	*/
	CorruptedLogFileException(String message, Throwable cause) {
	super(message, cause);
	}
	}
	}