blob: 61b432710d8de3c8bfc521f037e25874217e06a9 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.regionserver.wal.AbstractFSWAL;
import org.apache.hadoop.hbase.util.CommonFSUtils;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
import org.apache.hadoop.hbase.wal.WALSplitter;
import org.apache.yetus.audience.InterfaceAudience;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This class abstracts a bunch of operations the HMaster needs
* when splitting log files e.g. finding log files, dirs etc.
*/
@InterfaceAudience.Private
public class MasterWalManager {
private static final Logger LOG = LoggerFactory.getLogger(MasterWalManager.class);

/**
 * Accepts only those WAL files that {@link AbstractFSWALProvider#isMetaFile(Path)} reports as
 * belonging to the hbase:meta Region.
 */
final static PathFilter META_FILTER = p -> AbstractFSWALProvider.isMetaFile(p);

/**
 * Rejects WAL files that belong to the hbase:meta Region; i.e. accepts user-space WALs only.
 * This is the exact negation of {@link #META_FILTER}.
 */
public final static PathFilter NON_META_FILTER = p -> !AbstractFSWALProvider.isMetaFile(p);
// metrics for master
// TODO: Rename it, since those metrics are split-manager related
// Receives the elapsed time and size of every split performed through splitLog().
private final MetricsMasterFileSystem metricsMasterFilesystem = new MetricsMasterFileSystem();
// Keep around for convenience.
// Used in this class to abort the master, to check stopped/initialized state and to look up
// the online servers.
private final MasterServices services;
private final Configuration conf;
// Filesystem on which the WAL directories are listed, renamed and deleted.
private final FileSystem fs;
// The Path to the old logs dir
// Resolved in the constructor as rootDir/HConstants.HREGION_OLDLOGDIR_NAME.
private final Path oldLogDir;
// Directory under which per-server WAL dirs (and their '-splitting' variants) are resolved.
private final Path rootDir;
// create the split log lock
// Held by getLogDirs() while renaming WAL dirs, but only when the master is not yet initialized.
private final Lock splitLogLock = new ReentrantLock();
/**
* Superseded by {@link SplitWALManager}; i.e. procedure-based WAL splitting rather than
* 'classic' zk-coordinated WAL splitting.
* @deprecated since 2.3.0 and 3.0.0 to be removed in 4.0.0; replaced by {@link SplitWALManager}.
* @see SplitWALManager
*/
@Deprecated
private final SplitLogManager splitLogManager;
// Is the filesystem ok?
// Starts out true; checkFileSystem() flips it to false once a check fails and nothing in this
// class sets it back to true.
private volatile boolean fsOk = true;
/**
 * Creates a manager using the configuration held by {@code services} together with the WAL
 * filesystem and WAL root directory reported by its master filesystem.
 * @param services the master services this manager works on behalf of
 * @throws IOException declared by the delegated constructor
 */
public MasterWalManager(MasterServices services) throws IOException {
  this(services.getConfiguration(), services.getMasterFileSystem().getWALFileSystem(),
    services.getMasterFileSystem().getWALRootDir(), services);
}

/**
 * Creates a manager over an explicitly supplied filesystem and root directory.
 * @param conf configuration read when listing WAL dirs and when retrying failed scans
 * @param fs filesystem that holds the WAL directories
 * @param rootDir directory under which the per-server WAL dirs and the old-logs dir live
 * @param services the master services this manager works on behalf of
 * @throws IOException declared for callers; see {@link SplitLogManager} construction
 */
public MasterWalManager(Configuration conf, FileSystem fs, Path rootDir, MasterServices services)
    throws IOException {
  this.services = services;
  this.conf = conf;
  this.fs = fs;
  this.rootDir = rootDir;
  // Keep the split manager creation ahead of the old-log path, as in the original ordering.
  this.splitLogManager = new SplitLogManager(services, conf);
  this.oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME);
}
/**
 * Stops the underlying {@link SplitLogManager}, if there is one.
 */
public void stop() {
  if (splitLogManager == null) {
    return;
  }
  splitLogManager.stop();
}
/**
 * @return the (deprecated) zk-coordinated split log manager owned by this instance
 */
SplitLogManager getSplitLogManager() {
  return splitLogManager;
}
/**
 * Returns the directory that old (archived) logs are moved to.
 * @return the old-logs dir, resolved at construction time beneath the root dir
 */
Path getOldLogDir() {
  return oldLogDir;
}
/**
 * @return the filesystem this manager uses for all WAL directory operations
 */
public FileSystem getFileSystem() {
  return fs;
}
/**
 * Verifies that the file system is still usable. While the last known state is healthy, this
 * runs the availability and DFS safe-mode checks; if either throws, the master is asked to
 * abort and the failure is remembered so that subsequent calls skip the checks.
 * @return false if the file system is not available
 */
private boolean checkFileSystem() {
  if (!this.fsOk) {
    // A previous check already failed; do not probe again.
    return false;
  }
  try {
    FSUtils.checkFileSystemAvailable(this.fs);
    FSUtils.checkDfsSafeMode(this.conf);
  } catch (IOException e) {
    services.abort("Shutting down HBase cluster: file system not available", e);
    this.fsOk = false;
  }
  return this.fsOk;
}
/**
 * Lists the servers whose WAL directory name carries the '-splitting' suffix, i.e. servers that
 * are currently being split.
 * @return the ServerNames parsed from the matching WAL directories
 * @throws IOException if listing the WAL directory fails
 */
public Set<ServerName> getSplittingServersFromWALDir() throws IOException {
  PathFilter splittingOnly =
    path -> path.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT);
  return getServerNamesFromWALDirPath(splittingOnly);
}
/**
 * Lists the servers that COULD BE 'alive': every WAL directory whose name does not end with the
 * '-splitting' suffix. Directories with that suffix are already being split, so their servers
 * cannot be 'alive'.
 * @return the ServerNames parsed from the matching WAL directories
 * @throws IOException if listing the WAL directory fails
 */
public Set<ServerName> getLiveServersFromWALDir() throws IOException {
  PathFilter notSplitting =
    path -> !path.getName().endsWith(AbstractFSWALProvider.SPLITTING_EXT);
  return getServerNamesFromWALDirPath(notSplitting);
}
/**
 * Parses a ServerName out of every WAL directory accepted by {@code filter}. Directories whose
 * name does not yield a ServerName are logged and skipped.
 * @param filter selects which WAL directories are considered
 * @return listing of ServerNames found by parsing WAL directory paths in FS.
 * @throws IOException if listing the WAL directory fails
 */
public Set<ServerName> getServerNamesFromWALDirPath(final PathFilter filter) throws IOException {
  Set<ServerName> parsed = new HashSet<>();
  for (FileStatus dir : getWALDirPaths(filter)) {
    ServerName name = AbstractFSWALProvider.getServerNameFromWALDirectoryName(dir.getPath());
    if (name != null) {
      parsed.add(name);
      continue;
    }
    LOG.warn("Log folder {} doesn't look like its name includes a " +
      "region server name; leaving in place. If you see later errors about missing " +
      "write ahead logs they may be saved in this location.", dir.getPath());
  }
  return parsed;
}
/**
 * Lists the entries of the RegionServer WAL directory, i.e. the
 * {@code HConstants.HREGION_LOGDIR_NAME} child of the WAL root dir taken from the configuration.
 * @param filter selects which entries are returned
 * @return the accepted entries; an empty array (never null) when the listing yields nothing
 * @throws IOException if the listing fails
 */
public FileStatus[] getWALDirPaths(final PathFilter filter) throws IOException {
  Path walRoot = CommonFSUtils.getWALRootDir(conf);
  Path serverWalsDir = new Path(walRoot, HConstants.HREGION_LOGDIR_NAME);
  FileStatus[] listing = CommonFSUtils.listStatus(fs, serverWalsDir, filter);
  if (listing == null) {
    return new FileStatus[0];
  }
  return listing;
}
/**
 * Inspect the log directory to find dead servers which need recovery work.
 * <p>
 * Every non-empty WAL directory whose name parses to a ServerName that is not among the
 * currently online servers is reported. If the scan throws an IOException and skipping split
 * errors is disabled ({@code WALSplitter.SPLIT_SKIP_ERRORS_KEY} is false), the scan is retried
 * after sleeping {@code hbase.hlog.split.failure.retry.interval} milliseconds (default 30000).
 * The JVM is halted if the filesystem check fails or if the retry sleep is interrupted.
 * @return A set of ServerNames which aren't running but still have WAL files left in file system
 * @throws IOException declared for callers; IOExceptions raised while scanning are caught and
 *           handled inside the retry loop
 * @deprecated With proc-v2, we can record the crash server with procedure store, so do not need
 *             to scan the wal directory to find out the splitting wal directory any more. Leave
 *             it here only because {@code RecoverMetaProcedure}(which is also deprecated) uses
 *             it.
 */
@Deprecated
public Set<ServerName> getFailedServersFromLogFolders() throws IOException {
  boolean retrySplitting = !conf.getBoolean(WALSplitter.SPLIT_SKIP_ERRORS_KEY,
    WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT);
  Set<ServerName> serverNames = new HashSet<>();
  Path logsDirPath = new Path(CommonFSUtils.getWALRootDir(conf), HConstants.HREGION_LOGDIR_NAME);
  do {
    if (services.isStopped()) {
      LOG.warn("Master stopped while trying to get failed servers.");
      break;
    }
    try {
      if (!this.fs.exists(logsDirPath)) {
        return serverNames;
      }
      FileStatus[] logFolders = CommonFSUtils.listStatus(this.fs, logsDirPath, null);
      // Get online servers after getting log folders to avoid log folder deletion of newly
      // checked in region servers . see HBASE-5916
      Set<ServerName> onlineServers = services.getServerManager().getOnlineServers().keySet();
      if (logFolders == null || logFolders.length == 0) {
        LOG.debug("No log files to split, proceeding...");
        return serverNames;
      }
      for (FileStatus status : logFolders) {
        FileStatus[] curLogFiles = CommonFSUtils.listStatus(this.fs, status.getPath(), null);
        if (curLogFiles == null || curLogFiles.length == 0) {
          // Empty log folder. No recovery needed
          continue;
        }
        final ServerName serverName = AbstractFSWALProvider.getServerNameFromWALDirectoryName(
          status.getPath());
        if (null == serverName) {
          LOG.warn("Log folder {} doesn't look like its name includes a " +
            "region server name; leaving in place. If you see later errors about missing " +
            "write ahead logs they may be saved in this location.", status.getPath());
        } else if (!onlineServers.contains(serverName)) {
          LOG.info("Log folder {} doesn't belong to a known region server, splitting",
            status.getPath());
          serverNames.add(serverName);
        } else {
          LOG.info("Log folder {} belongs to an existing region server", status.getPath());
        }
      }
      // Scan completed without error; leave the retry loop.
      retrySplitting = false;
    } catch (IOException ioe) {
      LOG.warn("Failed getting failed servers to be recovered.", ioe);
      if (!checkFileSystem()) {
        LOG.warn("Bad Filesystem, exiting");
        Runtime.getRuntime().halt(1);
      }
      try {
        if (retrySplitting) {
          Thread.sleep(conf.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000));
        }
      } catch (InterruptedException e) {
        LOG.warn("Interrupted, aborting since cannot return w/o splitting");
        Thread.currentThread().interrupt();
        retrySplitting = false;
        Runtime.getRuntime().halt(1);
      }
    }
  } while (retrySplitting);
  return serverNames;
}
/**
 * Splits the non-meta WALs of a single server.
 * @param serverName logs belonging to this server will be split
 * @throws IOException if the split fails
 */
public void splitLog(final ServerName serverName) throws IOException {
  Set<ServerName> onlyThisServer = Collections.<ServerName>singleton(serverName);
  splitLog(onlyThisServer);
}
/**
 * Splits the hbase:meta WALs of a single server.
 * @param serverName logs belonging to this server will be split
 * @throws IOException if the split fails
 */
public void splitMetaLog(final ServerName serverName) throws IOException {
  Set<ServerName> onlyThisServer = Collections.<ServerName>singleton(serverName);
  splitMetaLog(onlyThisServer);
}
/**
 * Splits the hbase:meta WALs of the given servers by delegating to the filtered split with
 * {@link #META_FILTER}.
 * @param serverNames logs belonging to these servers will be split
 * @throws IOException if the split fails
 */
public void splitMetaLog(final Set<ServerName> serverNames) throws IOException {
  this.splitLog(serverNames, META_FILTER);
}
/**
 * Resolves, for each given server, the '-splitting' WAL directory that should be split. If the
 * server's plain WAL directory still exists it is first renamed to the '-splitting' name so a
 * rogue RS cannot create more WALs in it. Servers with neither directory are skipped.
 * <p>
 * While the master is not yet initialized the whole operation runs under the split log lock.
 * <p>
 * If an IOException occurs and the filesystem check still passes, the exception is logged and
 * the directories collected so far are returned; the exception is only propagated when the
 * filesystem is found to be unavailable.
 * @param serverNames servers whose WAL directories are wanted
 * @return the '-splitting' directories found or created; possibly partial, see above
 * @throws IOException if the operation failed and the filesystem is not available
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK", justification=
  "We only release this lock when we set it. Updates to code that uses it should verify use " +
  "of the guard boolean.")
List<Path> getLogDirs(final Set<ServerName> serverNames) throws IOException {
  List<Path> logDirs = new ArrayList<>();
  boolean needReleaseLock = false;
  if (!this.services.isInitialized()) {
    // during master initialization, we could have multiple places splitting a same wal
    // XXX: Does this still exist after we move to proc-v2?
    this.splitLogLock.lock();
    needReleaseLock = true;
  }
  try {
    for (ServerName serverName : serverNames) {
      Path logDir = new Path(this.rootDir,
        AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
      Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
      // Rename the directory so a rogue RS doesn't create more WALs
      if (fs.exists(logDir)) {
        if (!this.fs.rename(logDir, splitDir)) {
          throw new IOException("Failed fs.rename for log split: " + logDir);
        }
        LOG.debug("Renamed region directory: {}", splitDir);
      } else if (!fs.exists(splitDir)) {
        LOG.info("Log dir for server {} does not exist", serverName);
        continue;
      }
      logDirs.add(splitDir);
    }
  } catch (IOException ioe) {
    if (!checkFileSystem()) {
      this.services.abort("Aborting due to filesystem unavailable", ioe);
      throw ioe;
    }
    // The filesystem is reachable, so the failure is not propagated; record it so that a
    // partial result can be diagnosed instead of disappearing silently.
    LOG.warn("Failed preparing WAL dirs for splitting although the filesystem is available; "
      + "continuing with the {} dir(s) collected so far", logDirs.size(), ioe);
  } finally {
    if (needReleaseLock) {
      this.splitLogLock.unlock();
    }
  }
  return logDirs;
}
/**
 * Splits the non-meta WALs of the given servers by delegating to the filtered split with
 * {@link #NON_META_FILTER}.
 * @param serverNames logs belonging to these servers will be split
 * @throws IOException if the split fails
 */
public void splitLog(final Set<ServerName> serverNames) throws IOException {
  this.splitLog(serverNames, NON_META_FILTER);
}
/**
 * Base split method: splits the WAL files of the given servers that match {@code filter} and
 * records the elapsed time and the size reported by the split manager in the master metrics
 * (as a meta split when {@code filter} is {@link #META_FILTER}, as a regular split otherwise).
 * Callers should pass the appropriate filter for meta and non-meta WALs.
 * @param serverNames logs belonging to these servers will be split; this will rename the log
 *          directory out from under a soft-failed server
 * @param filter selects which WAL files are split
 * @throws IOException if resolving the log dirs or the distributed split fails
 */
public void splitLog(final Set<ServerName> serverNames, PathFilter filter) throws IOException {
  List<Path> walDirs = getLogDirs(serverNames);
  splitLogManager.handleDeadWorkers(serverNames);
  // Only the distributed split itself is timed.
  long startedAt = EnvironmentEdgeManager.currentTime();
  long splitSize = splitLogManager.splitLogDistributed(serverNames, walDirs, filter);
  long elapsed = EnvironmentEdgeManager.currentTime() - startedAt;
  if (this.metricsMasterFilesystem == null) {
    return;
  }
  if (filter == META_FILTER) {
    this.metricsMasterFilesystem.addMetaWALSplit(elapsed, splitSize);
  } else {
    this.metricsMasterFilesystem.addSplit(elapsed, splitSize);
  }
}
/**
 * The hbase:meta region may OPEN and CLOSE without issue on a server and then move elsewhere.
 * On CLOSE, the WAL for the hbase:meta table may not be archived yet (The WAL is only needed if
 * hbase:meta did not close cleanly). Since the meta region is no longer on this server,
 * the ServerCrashProcedure won't split these leftover hbase:meta WALs, just leaving them in
 * the WAL splitting dir. If we try to delete the WAL splitting dir for the server, it fails
 * since the dir is not totally empty. We can safely archive these hbase:meta logs; then the
 * WAL dir can be deleted.
 * <p>
 * This is best effort: a failed move or delete is logged and ignored, and any IOException is
 * caught and logged rather than propagated. Nothing is done when the server has no
 * '-splitting' directory.
 * @param serverName the server to archive meta log
 */
public void archiveMetaLog(final ServerName serverName) {
  try {
    Path logDir = new Path(this.rootDir,
      AbstractFSWALProvider.getWALDirectoryName(serverName.toString()));
    Path splitDir = logDir.suffix(AbstractFSWALProvider.SPLITTING_EXT);
    if (!fs.exists(splitDir)) {
      return;
    }
    FileStatus[] logfiles = CommonFSUtils.listStatus(fs, splitDir, META_FILTER);
    if (logfiles != null) {
      for (FileStatus status : logfiles) {
        // Only plain files are archived; isDirectory() replaces the deprecated isDir().
        if (status.isDirectory()) {
          continue;
        }
        Path newPath = AbstractFSWAL.getWALArchivePath(this.oldLogDir, status.getPath());
        if (!CommonFSUtils.renameAndSetModifyTime(fs, status.getPath(), newPath)) {
          LOG.warn("Unable to move {} to {}", status.getPath(), newPath);
        } else {
          LOG.debug("Archived meta log {} to {}", status.getPath(), newPath);
        }
      }
    }
    // Non-recursive delete: it is refused if anything is still left in the directory.
    if (!fs.delete(splitDir, false)) {
      LOG.warn("Unable to delete log dir. Ignoring. {}", splitDir);
    }
  } catch (IOException ie) {
    LOG.warn("Failed archiving meta log for server {}", serverName, ie);
  }
}
}