blob: b24f79494045c25dba5902800cbd9f22557d1e63 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master.snapshot;
import static org.apache.hadoop.hbase.HConstants.DEFAULT_HBASE_RPC_TIMEOUT;
import static org.apache.hadoop.hbase.HConstants.HBASE_RPC_TIMEOUT_KEY;
import java.io.IOException;
import java.util.List;
import java.util.concurrent.CancellationException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.errorhandling.ForeignException;
import org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher;
import org.apache.hadoop.hbase.errorhandling.ForeignExceptionSnare;
import org.apache.hadoop.hbase.executor.EventHandler;
import org.apache.hadoop.hbase.executor.EventType;
import org.apache.hadoop.hbase.master.MasterServices;
import org.apache.hadoop.hbase.master.MetricsSnapshot;
import org.apache.hadoop.hbase.master.SnapshotSentinel;
import org.apache.hadoop.hbase.master.locking.LockManager;
import org.apache.hadoop.hbase.master.locking.LockManager.MasterLock;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.procedure2.LockType;
import org.apache.hadoop.hbase.snapshot.ClientSnapshotDescriptionUtils;
import org.apache.hadoop.hbase.snapshot.SnapshotDescriptionUtils;
import org.apache.hadoop.hbase.snapshot.SnapshotManifest;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.yetus.audience.InterfaceAudience;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.SnapshotProtos.SnapshotDescription;
/**
* A handler for taking snapshots from the master. This is not a subclass of TableEventHandler
* because using that would incur an extra hbase:meta scan. The {@link #snapshotRegions(List)} call
* should get implemented for each snapshot flavor.
*/
@InterfaceAudience.Private
public abstract class TakeSnapshotHandler extends EventHandler
implements SnapshotSentinel, ForeignExceptionSnare {
private static final Logger LOG = LoggerFactory.getLogger(TakeSnapshotHandler.class);
public static final String HBASE_SNAPSHOT_MASTER_LOCK_ACQUIRE_TIMEOUT =
"hbase.snapshot.master.lock.acquire.timeout";
private volatile boolean finished;
// none of these should ever be null
protected final MasterServices master;
protected final MetricsSnapshot metricsSnapshot = new MetricsSnapshot();
protected final SnapshotDescription snapshot;
protected final Configuration conf;
protected final FileSystem rootFs;
protected final FileSystem workingDirFs;
protected final Path rootDir;
private final Path snapshotDir;
protected final Path workingDir;
private final MasterSnapshotVerifier verifier;
protected final ForeignExceptionDispatcher monitor;
private final LockManager.MasterLock tableLock;
protected final MonitoredTask status;
protected final TableName snapshotTable;
protected final SnapshotManifest snapshotManifest;
protected final SnapshotManager snapshotManager;
/**
* Snapshot creation requires table lock. If any region of the table is in transition, table lock
* cannot be acquired by LockProcedure and hence snapshot creation could hang for potentially very
* long time. This timeout will ensure snapshot creation fails-fast by waiting for only given
* timeout.
*/
private final long lockAcquireTimeoutMs;
protected TableDescriptor htd;
/**
* @param snapshot descriptor of the snapshot to take
* @param masterServices master services provider
* @throws IllegalArgumentException if the working snapshot directory set from the configuration
* is the same as the completed snapshot directory
* @throws IOException if the file system of the working snapshot directory cannot be
* determined
*/
public TakeSnapshotHandler(SnapshotDescription snapshot, final MasterServices masterServices,
final SnapshotManager snapshotManager) throws IOException {
super(masterServices, EventType.C_M_SNAPSHOT_TABLE);
assert snapshot != null : "SnapshotDescription must not be nul1";
assert masterServices != null : "MasterServices must not be nul1";
this.master = masterServices;
this.conf = this.master.getConfiguration();
this.rootDir = this.master.getMasterFileSystem().getRootDir();
this.workingDir = SnapshotDescriptionUtils.getWorkingSnapshotDir(snapshot, rootDir, conf);
Preconditions.checkArgument(
!SnapshotDescriptionUtils.isSubDirectoryOf(workingDir, rootDir)
|| SnapshotDescriptionUtils.isWithinDefaultWorkingDir(workingDir, conf),
"The working directory " + workingDir + " cannot be in the root directory unless it is "
+ "within the default working directory");
this.snapshot = snapshot;
this.snapshotManager = snapshotManager;
this.snapshotTable = TableName.valueOf(snapshot.getTable());
this.rootFs = this.master.getMasterFileSystem().getFileSystem();
this.snapshotDir = SnapshotDescriptionUtils.getCompletedSnapshotDir(snapshot, rootDir);
this.workingDirFs = this.workingDir.getFileSystem(this.conf);
this.monitor = new ForeignExceptionDispatcher(snapshot.getName());
this.tableLock = master.getLockManager().createMasterLock(snapshotTable, LockType.EXCLUSIVE,
this.getClass().getName() + ": take snapshot " + snapshot.getName());
// prepare the verify
this.verifier = new MasterSnapshotVerifier(masterServices, snapshot, workingDirFs);
// update the running tasks
this.status = TaskMonitor.get().createStatus(
"Taking " + snapshot.getType() + " snapshot on table: " + snapshotTable, false, true);
this.snapshotManifest =
SnapshotManifest.create(conf, rootFs, workingDir, snapshot, monitor, status);
this.lockAcquireTimeoutMs = conf.getLong(HBASE_SNAPSHOT_MASTER_LOCK_ACQUIRE_TIMEOUT,
conf.getLong(HBASE_RPC_TIMEOUT_KEY, DEFAULT_HBASE_RPC_TIMEOUT));
}
private TableDescriptor loadTableDescriptor() throws IOException {
TableDescriptor htd = this.master.getTableDescriptors().get(snapshotTable);
if (htd == null) {
throw new IOException("TableDescriptor missing for " + snapshotTable);
}
if (htd.getMaxFileSize() == -1 && this.snapshot.getMaxFileSize() > 0) {
htd = TableDescriptorBuilder.newBuilder(htd).setValue(TableDescriptorBuilder.MAX_FILESIZE,
Long.toString(this.snapshot.getMaxFileSize())).build();
}
return htd;
}
@Override
public TakeSnapshotHandler prepare() throws Exception {
super.prepare();
// after this, you should ensure to release this lock in case of exceptions
if (this.tableLock.tryAcquire(this.lockAcquireTimeoutMs)) {
try {
this.htd = loadTableDescriptor(); // check that .tableinfo is present
} catch (Exception e) {
this.tableLock.release();
throw e;
}
} else {
LOG.error("Master lock could not be acquired in {} ms", lockAcquireTimeoutMs);
throw new DoNotRetryIOException("Master lock could not be acquired");
}
return this;
}
/**
* Execute the core common portions of taking a snapshot. The {@link #snapshotRegions(List)} call
* should get implemented for each snapshot flavor.
*/
@Override
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "REC_CATCH_EXCEPTION",
justification = "Intentional")
public void process() {
String msg = "Running " + snapshot.getType() + " table snapshot " + snapshot.getName() + " "
+ eventType + " on table " + snapshotTable;
LOG.info(msg);
MasterLock tableLockToRelease = this.tableLock;
status.setStatus(msg);
try {
if (downgradeToSharedTableLock()) {
// release the exclusive lock and hold the shared lock instead
tableLockToRelease = master.getLockManager().createMasterLock(snapshotTable,
LockType.SHARED, this.getClass().getName() + ": take snapshot " + snapshot.getName());
tableLock.release();
boolean isTableLockAcquired = tableLockToRelease.tryAcquire(this.lockAcquireTimeoutMs);
if (!isTableLockAcquired) {
LOG.error("Could not acquire shared lock on table {} in {} ms", snapshotTable,
lockAcquireTimeoutMs);
throw new IOException("Could not acquire shared lock on table " + snapshotTable);
}
}
// If regions move after this meta scan, the region specific snapshot should fail, triggering
// an external exception that gets captured here.
// write down the snapshot info in the working directory
SnapshotDescriptionUtils.writeSnapshotInfo(snapshot, workingDir, workingDirFs);
snapshotManifest.addTableDescriptor(this.htd);
monitor.rethrowException();
List<Pair<RegionInfo, ServerName>> regionsAndLocations =
master.getAssignmentManager().getTableRegionsAndLocations(snapshotTable, false);
// run the snapshot
snapshotRegions(regionsAndLocations);
monitor.rethrowException();
// flush the in-memory state, and write the single manifest
status.setStatus("Consolidate snapshot: " + snapshot.getName());
snapshotManifest.consolidate();
// verify the snapshot is valid
status.setStatus("Verifying snapshot: " + snapshot.getName());
verifier.verifySnapshot(workingDir, true);
// complete the snapshot, atomically moving from tmp to .snapshot dir.
SnapshotDescriptionUtils.completeSnapshot(this.snapshotDir, this.workingDir, this.rootFs,
this.workingDirFs, this.conf);
finished = true;
msg = "Snapshot " + snapshot.getName() + " of table " + snapshotTable + " completed";
status.markComplete(msg);
LOG.info(msg);
metricsSnapshot.addSnapshot(status.getCompletionTimestamp() - status.getStartTime());
if (master.getMasterCoprocessorHost() != null) {
master.getMasterCoprocessorHost()
.postCompletedSnapshotAction(ProtobufUtil.createSnapshotDesc(snapshot), this.htd);
}
} catch (Exception e) { // FindBugs: REC_CATCH_EXCEPTION
status.abort("Failed to complete snapshot " + snapshot.getName() + " on table "
+ snapshotTable + " because " + e.getMessage());
String reason = "Failed taking snapshot " + ClientSnapshotDescriptionUtils.toString(snapshot)
+ " due to exception:" + e.getMessage();
LOG.error(reason, e);
ForeignException ee = new ForeignException(reason, e);
monitor.receive(ee);
// need to mark this completed to close off and allow cleanup to happen.
cancel(reason);
} finally {
LOG.debug("Launching cleanup of working dir:" + workingDir);
try {
// if the working dir is still present, the snapshot has failed. it is present we delete
// it.
if (workingDirFs.exists(workingDir) && !workingDirFs.delete(workingDir, true)) {
LOG.error("Couldn't delete snapshot working directory: {}", workingDir);
}
} catch (IOException e) {
LOG.error("Couldn't get or delete snapshot working directory: {}", workingDir, e);
}
if (LOG.isDebugEnabled()) {
LOG.debug("Table snapshot journal : \n" + status.prettyPrintJournal());
}
tableLockToRelease.release();
}
}
/**
* When taking snapshot, first we must acquire the exclusive table lock to confirm that there are
* no ongoing merge/split procedures. But later, we should try our best to release the exclusive
* lock as this may hurt the availability, because we need to hold the shared lock when assigning
* regions.
* <p/>
* See HBASE-21480 for more details.
*/
protected abstract boolean downgradeToSharedTableLock();
/**
* Snapshot the specified regions
*/
protected abstract void snapshotRegions(List<Pair<RegionInfo, ServerName>> regions)
throws IOException, KeeperException;
/**
* Take a snapshot of the specified disabled region
*/
protected void snapshotDisabledRegion(final RegionInfo regionInfo) throws IOException {
snapshotManifest.addRegion(CommonFSUtils.getTableDir(rootDir, snapshotTable), regionInfo);
monitor.rethrowException();
status.setStatus("Completed referencing HFiles for offline region " + regionInfo.toString()
+ " of table: " + snapshotTable);
}
@Override
public void cancel(String why) {
if (finished) return;
this.finished = true;
LOG.info("Stop taking snapshot=" + ClientSnapshotDescriptionUtils.toString(snapshot)
+ " because: " + why);
CancellationException ce = new CancellationException(why);
monitor.receive(new ForeignException(master.getServerName().toString(), ce));
}
@Override
public boolean isFinished() {
return finished;
}
@Override
public long getCompletionTimestamp() {
return this.status.getCompletionTimestamp();
}
@Override
public SnapshotDescription getSnapshot() {
return snapshot;
}
@Override
public ForeignException getExceptionIfFailed() {
return monitor.getException();
}
@Override
public void rethrowExceptionIfFailed() throws ForeignException {
monitor.rethrowException();
}
@Override
public void rethrowException() throws ForeignException {
monitor.rethrowException();
}
@Override
public boolean hasException() {
return monitor.hasException();
}
@Override
public ForeignException getException() {
return monitor.getException();
}
}