| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.hadoop.ha; |
| |
| import java.io.IOException; |
| import java.io.PrintStream; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Map; |
| |
| import org.apache.commons.cli.Options; |
| import org.apache.commons.cli.CommandLine; |
| import org.apache.commons.cli.GnuParser; |
| import org.apache.commons.cli.ParseException; |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| |
| import org.apache.hadoop.classification.InterfaceAudience; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.conf.Configured; |
| import org.apache.hadoop.fs.CommonConfigurationKeys; |
| import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState; |
| import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo; |
| import org.apache.hadoop.ha.HAServiceProtocol.RequestSource; |
| import org.apache.hadoop.util.Tool; |
| import org.apache.hadoop.util.ToolRunner; |
| |
| import com.google.common.base.Preconditions; |
| import com.google.common.collect.ImmutableMap; |
| |
| /** |
| * A command-line tool for making calls in the HAServiceProtocol. |
| * For example, this can be used to force a service to standby or active |
| * mode, or to trigger a health-check. |
| */ |
| @InterfaceAudience.Private |
| |
| public abstract class HAAdmin extends Configured implements Tool { |
| |
| private static final String FORCEFENCE = "forcefence"; |
| private static final String FORCEACTIVE = "forceactive"; |
| |
| /** |
| * Undocumented flag which allows an administrator to use manual failover |
| * state transitions even when auto-failover is enabled. This is an unsafe |
| * operation, which is why it is not documented in the usage below. |
| */ |
| private static final String FORCEMANUAL = "forcemanual"; |
| private static final Log LOG = LogFactory.getLog(HAAdmin.class); |
| |
| private int rpcTimeoutForChecks = -1; |
| |
| protected final static Map<String, UsageInfo> USAGE = |
| ImmutableMap.<String, UsageInfo>builder() |
| .put("-transitionToActive", |
| new UsageInfo("[--"+FORCEACTIVE+"] <serviceId>", "Transitions the service into Active state")) |
| .put("-transitionToStandby", |
| new UsageInfo("<serviceId>", "Transitions the service into Standby state")) |
| .put("-failover", |
| new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <serviceId> <serviceId>", |
| "Failover from the first service to the second.\n" + |
| "Unconditionally fence services if the --"+FORCEFENCE+" option is used.\n" + |
| "Try to failover to the target service even if it is not ready if the " + |
| "--" + FORCEACTIVE + " option is used.")) |
| .put("-getServiceState", |
| new UsageInfo("<serviceId>", "Returns the state of the service")) |
| .put("-checkHealth", |
| new UsageInfo("<serviceId>", |
| "Requests that the service perform a health check.\n" + |
| "The HAAdmin tool will exit with a non-zero exit code\n" + |
| "if the check fails.")) |
| .put("-help", |
| new UsageInfo("<command>", "Displays help on the specified command")) |
| .build(); |
| |
| /** Output stream for errors, for use in tests */ |
| protected PrintStream errOut = System.err; |
| protected PrintStream out = System.out; |
| private RequestSource requestSource = RequestSource.REQUEST_BY_USER; |
| |
/** Creates the tool through the no-argument superclass constructor. */
protected HAAdmin() {
}

/**
 * Creates the tool with an initial configuration.
 *
 * @param conf configuration passed on to the superclass constructor
 */
protected HAAdmin(Configuration conf) {
  super(conf);
}
| |
| protected abstract HAServiceTarget resolveTarget(String string); |
| |
| protected Collection<String> getTargetIds(String targetNodeToActivate) { |
| return new ArrayList<String>( |
| Arrays.asList(new String[]{targetNodeToActivate})); |
| } |
| |
| protected String getUsageString() { |
| return "Usage: HAAdmin"; |
| } |
| |
| protected void printUsage(PrintStream errOut) { |
| errOut.println(getUsageString()); |
| for (Map.Entry<String, UsageInfo> e : USAGE.entrySet()) { |
| String cmd = e.getKey(); |
| UsageInfo usage = e.getValue(); |
| |
| errOut.println(" [" + cmd + " " + usage.args + "]"); |
| } |
| errOut.println(); |
| ToolRunner.printGenericCommandUsage(errOut); |
| } |
| |
| private void printUsage(PrintStream errOut, String cmd) { |
| UsageInfo usage = USAGE.get(cmd); |
| if (usage == null) { |
| throw new RuntimeException("No usage for cmd " + cmd); |
| } |
| errOut.println(getUsageString() + " [" + cmd + " " + usage.args + "]"); |
| } |
| |
| private int transitionToActive(final CommandLine cmd) |
| throws IOException, ServiceFailedException { |
| String[] argv = cmd.getArgs(); |
| if (argv.length != 1) { |
| errOut.println("transitionToActive: incorrect number of arguments"); |
| printUsage(errOut, "-transitionToActive"); |
| return -1; |
| } |
| /* returns true if other target node is active or some exception occurred |
| and forceActive was not set */ |
| if(!cmd.hasOption(FORCEACTIVE)) { |
| if(isOtherTargetNodeActive(argv[0], cmd.hasOption(FORCEACTIVE))) { |
| return -1; |
| } |
| } |
| HAServiceTarget target = resolveTarget(argv[0]); |
| if (!checkManualStateManagementOK(target)) { |
| return -1; |
| } |
| HAServiceProtocol proto = target.getProxy( |
| getConf(), 0); |
| HAServiceProtocolHelper.transitionToActive(proto, createReqInfo()); |
| return 0; |
| } |
| |
| /** |
| * Checks whether other target node is active or not |
| * @param targetNodeToActivate |
| * @return true if other target node is active or some other exception |
| * occurred and forceActive was set otherwise false |
| * @throws IOException |
| */ |
| private boolean isOtherTargetNodeActive(String targetNodeToActivate, boolean forceActive) |
| throws IOException { |
| Collection<String> targetIds = getTargetIds(targetNodeToActivate); |
| targetIds.remove(targetNodeToActivate); |
| for(String targetId : targetIds) { |
| HAServiceTarget target = resolveTarget(targetId); |
| if (!checkManualStateManagementOK(target)) { |
| return true; |
| } |
| try { |
| HAServiceProtocol proto = target.getProxy(getConf(), 5000); |
| if(proto.getServiceStatus().getState() == HAServiceState.ACTIVE) { |
| errOut.println("transitionToActive: Node " + targetId +" is already active"); |
| printUsage(errOut, "-transitionToActive"); |
| return true; |
| } |
| } catch (Exception e) { |
| //If forceActive switch is false then return true |
| if(!forceActive) { |
| errOut.println("Unexpected error occurred " + e.getMessage()); |
| printUsage(errOut, "-transitionToActive"); |
| return true; |
| } |
| } |
| } |
| return false; |
| } |
| |
| private int transitionToStandby(final CommandLine cmd) |
| throws IOException, ServiceFailedException { |
| String[] argv = cmd.getArgs(); |
| if (argv.length != 1) { |
| errOut.println("transitionToStandby: incorrect number of arguments"); |
| printUsage(errOut, "-transitionToStandby"); |
| return -1; |
| } |
| |
| HAServiceTarget target = resolveTarget(argv[0]); |
| if (!checkManualStateManagementOK(target)) { |
| return -1; |
| } |
| HAServiceProtocol proto = target.getProxy( |
| getConf(), 0); |
| HAServiceProtocolHelper.transitionToStandby(proto, createReqInfo()); |
| return 0; |
| } |
| /** |
| * Ensure that we are allowed to manually manage the HA state of the target |
| * service. If automatic failover is configured, then the automatic |
| * failover controllers should be doing state management, and it is generally |
| * an error to use the HAAdmin command line to do so. |
| * |
| * @param target the target to check |
| * @return true if manual state management is allowed |
| */ |
| private boolean checkManualStateManagementOK(HAServiceTarget target) { |
| if (target.isAutoFailoverEnabled()) { |
| if (requestSource != RequestSource.REQUEST_BY_USER_FORCED) { |
| errOut.println( |
| "Automatic failover is enabled for " + target + "\n" + |
| "Refusing to manually manage HA state, since it may cause\n" + |
| "a split-brain scenario or other incorrect state.\n" + |
| "If you are very sure you know what you are doing, please \n" + |
| "specify the --" + FORCEMANUAL + " flag."); |
| return false; |
| } else { |
| LOG.warn("Proceeding with manual HA state management even though\n" + |
| "automatic failover is enabled for " + target); |
| return true; |
| } |
| } |
| return true; |
| } |
| |
| private StateChangeRequestInfo createReqInfo() { |
| return new StateChangeRequestInfo(requestSource); |
| } |
| |
| private int failover(CommandLine cmd) |
| throws IOException, ServiceFailedException { |
| boolean forceFence = cmd.hasOption(FORCEFENCE); |
| boolean forceActive = cmd.hasOption(FORCEACTIVE); |
| |
| int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length; |
| final String[] args = cmd.getArgs(); |
| |
| if (numOpts > 3 || args.length != 2) { |
| errOut.println("failover: incorrect arguments"); |
| printUsage(errOut, "-failover"); |
| return -1; |
| } |
| |
| HAServiceTarget fromNode = resolveTarget(args[0]); |
| HAServiceTarget toNode = resolveTarget(args[1]); |
| |
| // Check that auto-failover is consistently configured for both nodes. |
| Preconditions.checkState( |
| fromNode.isAutoFailoverEnabled() == |
| toNode.isAutoFailoverEnabled(), |
| "Inconsistent auto-failover configs between %s and %s!", |
| fromNode, toNode); |
| |
| if (fromNode.isAutoFailoverEnabled()) { |
| if (forceFence || forceActive) { |
| // -forceActive doesn't make sense with auto-HA, since, if the node |
| // is not healthy, then its ZKFC will immediately quit the election |
| // again the next time a health check runs. |
| // |
| // -forceFence doesn't seem to have any real use cases with auto-HA |
| // so it isn't implemented. |
| errOut.println(FORCEFENCE + " and " + FORCEACTIVE + " flags not " + |
| "supported with auto-failover enabled."); |
| return -1; |
| } |
| try { |
| return gracefulFailoverThroughZKFCs(toNode); |
| } catch (UnsupportedOperationException e){ |
| errOut.println("Failover command is not supported with " + |
| "auto-failover enabled: " + e.getLocalizedMessage()); |
| return -1; |
| } |
| } |
| |
| FailoverController fc = new FailoverController(getConf(), |
| requestSource); |
| |
| try { |
| fc.failover(fromNode, toNode, forceFence, forceActive); |
| out.println("Failover from "+args[0]+" to "+args[1]+" successful"); |
| } catch (FailoverFailedException ffe) { |
| errOut.println("Failover failed: " + ffe.getLocalizedMessage()); |
| return -1; |
| } |
| return 0; |
| } |
| |
| |
| /** |
| * Initiate a graceful failover by talking to the target node's ZKFC. |
| * This sends an RPC to the ZKFC, which coordinates the failover. |
| * |
| * @param toNode the node to fail to |
| * @return status code (0 for success) |
| * @throws IOException if failover does not succeed |
| */ |
| private int gracefulFailoverThroughZKFCs(HAServiceTarget toNode) |
| throws IOException { |
| |
| int timeout = FailoverController.getRpcTimeoutToNewActive(getConf()); |
| ZKFCProtocol proxy = toNode.getZKFCProxy(getConf(), timeout); |
| try { |
| proxy.gracefulFailover(); |
| out.println("Failover to " + toNode + " successful"); |
| } catch (ServiceFailedException sfe) { |
| errOut.println("Failover failed: " + sfe.getLocalizedMessage()); |
| return -1; |
| } |
| |
| return 0; |
| } |
| |
| private int checkHealth(final CommandLine cmd) |
| throws IOException, ServiceFailedException { |
| String[] argv = cmd.getArgs(); |
| if (argv.length != 1) { |
| errOut.println("checkHealth: incorrect number of arguments"); |
| printUsage(errOut, "-checkHealth"); |
| return -1; |
| } |
| HAServiceProtocol proto = resolveTarget(argv[0]).getProxy( |
| getConf(), rpcTimeoutForChecks); |
| try { |
| HAServiceProtocolHelper.monitorHealth(proto, createReqInfo()); |
| } catch (HealthCheckFailedException e) { |
| errOut.println("Health check failed: " + e.getLocalizedMessage()); |
| return -1; |
| } |
| return 0; |
| } |
| |
| private int getServiceState(final CommandLine cmd) |
| throws IOException, ServiceFailedException { |
| String[] argv = cmd.getArgs(); |
| if (argv.length != 1) { |
| errOut.println("getServiceState: incorrect number of arguments"); |
| printUsage(errOut, "-getServiceState"); |
| return -1; |
| } |
| |
| HAServiceProtocol proto = resolveTarget(argv[0]).getProxy( |
| getConf(), rpcTimeoutForChecks); |
| out.println(proto.getServiceStatus().getState()); |
| return 0; |
| } |
| |
| /** |
| * Return the serviceId as is, we are assuming it was |
| * given as a service address of form <host:ipcport>. |
| */ |
| protected String getServiceAddr(String serviceId) { |
| return serviceId; |
| } |
| |
| @Override |
| public void setConf(Configuration conf) { |
| super.setConf(conf); |
| if (conf != null) { |
| rpcTimeoutForChecks = conf.getInt( |
| CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY, |
| CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT); |
| } |
| } |
| |
| @Override |
| public int run(String[] argv) throws Exception { |
| try { |
| return runCmd(argv); |
| } catch (IllegalArgumentException iae) { |
| errOut.println("Illegal argument: " + iae.getLocalizedMessage()); |
| return -1; |
| } catch (IOException ioe) { |
| errOut.println("Operation failed: " + ioe.getLocalizedMessage()); |
| if (LOG.isDebugEnabled()) { |
| LOG.debug("Operation failed", ioe); |
| } |
| return -1; |
| } |
| } |
| |
| protected int runCmd(String[] argv) throws Exception { |
| if (argv.length < 1) { |
| printUsage(errOut); |
| return -1; |
| } |
| |
| String cmd = argv[0]; |
| |
| if (!cmd.startsWith("-")) { |
| errOut.println("Bad command '" + cmd + "': expected command starting with '-'"); |
| printUsage(errOut); |
| return -1; |
| } |
| |
| if (!USAGE.containsKey(cmd)) { |
| errOut.println(cmd.substring(1) + ": Unknown command"); |
| printUsage(errOut); |
| return -1; |
| } |
| |
| Options opts = new Options(); |
| |
| // Add command-specific options |
| if ("-failover".equals(cmd)) { |
| addFailoverCliOpts(opts); |
| } |
| if("-transitionToActive".equals(cmd)) { |
| addTransitionToActiveCliOpts(opts); |
| } |
| // Mutative commands take FORCEMANUAL option |
| if ("-transitionToActive".equals(cmd) || |
| "-transitionToStandby".equals(cmd) || |
| "-failover".equals(cmd)) { |
| opts.addOption(FORCEMANUAL, false, |
| "force manual control even if auto-failover is enabled"); |
| } |
| |
| CommandLine cmdLine = parseOpts(cmd, opts, argv); |
| if (cmdLine == null) { |
| // error already printed |
| return -1; |
| } |
| |
| if (cmdLine.hasOption(FORCEMANUAL)) { |
| if (!confirmForceManual()) { |
| LOG.fatal("Aborted"); |
| return -1; |
| } |
| // Instruct the NNs to honor this request even if they're |
| // configured for manual failover. |
| requestSource = RequestSource.REQUEST_BY_USER_FORCED; |
| } |
| |
| if ("-transitionToActive".equals(cmd)) { |
| return transitionToActive(cmdLine); |
| } else if ("-transitionToStandby".equals(cmd)) { |
| return transitionToStandby(cmdLine); |
| } else if ("-failover".equals(cmd)) { |
| return failover(cmdLine); |
| } else if ("-getServiceState".equals(cmd)) { |
| return getServiceState(cmdLine); |
| } else if ("-checkHealth".equals(cmd)) { |
| return checkHealth(cmdLine); |
| } else if ("-help".equals(cmd)) { |
| return help(argv); |
| } else { |
| // we already checked command validity above, so getting here |
| // would be a coding error |
| throw new AssertionError("Should not get here, command: " + cmd); |
| } |
| } |
| |
| private boolean confirmForceManual() throws IOException { |
| return ToolRunner.confirmPrompt( |
| "You have specified the --" + FORCEMANUAL + " flag. This flag is " + |
| "dangerous, as it can induce a split-brain scenario that WILL " + |
| "CORRUPT your HDFS namespace, possibly irrecoverably.\n" + |
| "\n" + |
| "It is recommended not to use this flag, but instead to shut down the " + |
| "cluster and disable automatic failover if you prefer to manually " + |
| "manage your HA state.\n" + |
| "\n" + |
| "You may abort safely by answering 'n' or hitting ^C now.\n" + |
| "\n" + |
| "Are you sure you want to continue?"); |
| } |
| |
| /** |
| * Add CLI options which are specific to the failover command and no |
| * others. |
| */ |
| private void addFailoverCliOpts(Options failoverOpts) { |
| failoverOpts.addOption(FORCEFENCE, false, "force fencing"); |
| failoverOpts.addOption(FORCEACTIVE, false, "force failover"); |
| // Don't add FORCEMANUAL, since that's added separately for all commands |
| // that change state. |
| } |
| |
| /** |
| * Add CLI options which are specific to the transitionToActive command and |
| * no others. |
| */ |
| private void addTransitionToActiveCliOpts(Options transitionToActiveCliOpts) { |
| transitionToActiveCliOpts.addOption(FORCEACTIVE, false, "force active"); |
| } |
| |
| private CommandLine parseOpts(String cmdName, Options opts, String[] argv) { |
| try { |
| // Strip off the first arg, since that's just the command name |
| argv = Arrays.copyOfRange(argv, 1, argv.length); |
| return new GnuParser().parse(opts, argv); |
| } catch (ParseException pe) { |
| errOut.println(cmdName.substring(1) + |
| ": incorrect arguments"); |
| printUsage(errOut, cmdName); |
| return null; |
| } |
| } |
| |
| private int help(String[] argv) { |
| if (argv.length == 1) { // only -help |
| printUsage(out); |
| return 0; |
| } else if (argv.length != 2) { |
| printUsage(errOut, "-help"); |
| return -1; |
| } |
| String cmd = argv[1]; |
| if (!cmd.startsWith("-")) { |
| cmd = "-" + cmd; |
| } |
| UsageInfo usageInfo = USAGE.get(cmd); |
| if (usageInfo == null) { |
| errOut.println(cmd + ": Unknown command"); |
| printUsage(errOut); |
| return -1; |
| } |
| |
| out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help); |
| return 0; |
| } |
| |
| protected static class UsageInfo { |
| public final String args; |
| public final String help; |
| |
| public UsageInfo(String args, String help) { |
| this.args = args; |
| this.help = help; |
| } |
| } |
| } |