/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.slider.providers.hbase.minicluster.failures

import groovy.transform.CompileStatic
import groovy.util.logging.Slf4j
import org.apache.hadoop.hbase.ClusterStatus
import org.apache.hadoop.yarn.api.records.ApplicationReport
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus
import org.apache.slider.api.ResourceKeys
import org.apache.slider.core.main.ServiceLauncher
import org.apache.slider.common.SliderExitCodes
import org.apache.slider.api.ClusterDescription
import org.apache.slider.core.exceptions.BadClusterStateException
import org.apache.slider.core.exceptions.ErrorStrings
import org.apache.slider.common.params.Arguments
import org.apache.slider.client.SliderClient
import org.apache.slider.providers.hbase.minicluster.HBaseMiniClusterTestBase
import org.junit.Test
import static org.apache.slider.providers.hbase.HBaseKeys.ROLE_WORKER

/**
 * Test that if a container is killed more often than the configured
 * failure threshold permits, the AM fails and stays down
 */
@CompileStatic
@Slf4j
class TestRegionServerFailureThreshold extends HBaseMiniClusterTestBase {
@Test
public void testRegionServerFailureThreshold() throws Throwable {
failureThresholdTestRun("", true, 2, 5)
}
  /**
   * Sets the failure threshold, then runs the given number of kill/stop attempts
   * @param testName name for the mini cluster
   * @param toKill true to kill the region servers, false to stop them cleanly
   * @param threshold container failure threshold for the worker role
   * @param killAttempts number of kill/stop attempts to make
   */
private void failureThresholdTestRun(
String testName,
boolean toKill,
int threshold,
int killAttempts) {
String action = toKill ? "kill" : "stop"
int regionServerCount = 1
String clustername = createMiniCluster(testName, configuration, 1, 1, 1, true, true)
    describe(
        "Create a single region server HBase instance, " +
        "then $action the RS $killAttempts times with a threshold of $threshold");
//now launch the cluster
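    // the cluster-wide threshold is set one below the worker threshold so the
    // role-specific override can be distinguished when the options are read back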
def globalThreshold = threshold - 1
ServiceLauncher<SliderClient> launcher = createHBaseCluster(
clustername,
regionServerCount,
[
Arguments.ARG_RES_COMP_OPT,
ROLE_WORKER,
ResourceKeys.CONTAINER_FAILURE_THRESHOLD,
Integer.toString(threshold),
Arguments.ARG_RESOURCE_OPT,
ResourceKeys.CONTAINER_FAILURE_THRESHOLD,
Integer.toString(globalThreshold)
],
true,
true)
SliderClient client = launcher.service
addToTeardown(client);
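    // read back the persisted cluster description and check that both
    // failure thresholds were recorded in the resource options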
def aggregateConf = client.loadPersistedClusterDescription(clustername)
log.info aggregateConf.toString()
def resourceOperations = aggregateConf.resourceOperations
def failureOptValue = resourceOperations.globalOptions.getMandatoryOptionInt(
ResourceKeys.CONTAINER_FAILURE_THRESHOLD)
assert globalThreshold == failureOptValue
def workerThreshold = resourceOperations.getComponentOptInt(ROLE_WORKER,
ResourceKeys.CONTAINER_FAILURE_THRESHOLD, 0)
assert threshold == workerThreshold
ClusterDescription status = client.getClusterDescription(clustername)
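    // start HBase itself and wait for the basic startup sequence to complete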
ClusterStatus clustat = basicHBaseClusterStartupSequence(client)
ClusterStatus hbaseStat
try {
for (restarts in 1..killAttempts) {
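        // wait for the region server container to be live before acting on it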
status = waitForWorkerInstanceCount(
client,
regionServerCount,
hbaseClusterStartupToLiveTime)
//get the hbase status
/*
hbaseStat = waitForHBaseRegionServerCount(
client,
clustername,
regionServerCount,
HBASE_CLUSTER_STARTUP_TO_LIVE_TIME)
log.info("Initial cluster status : ${hbaseStatusToString(hbaseStat)}");
*/
describe("running processes")
lsJavaProcesses()
describe("about to " + action + " servers")
if (toKill) {
killAllRegionServers()
} else {
stopAllRegionServers()
}
        // give the region servers time to die (kill) or shut down cleanly (stop)
        sleep(toKill ? 15000 : 25000);
describe("waiting for recovery")
        //and expect a recovery while still under the failure threshold
        if (restarts <= threshold) {
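          // timeout, in millis, when waiting for the replacement worker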
def restartTime = 1000
status = waitForWorkerInstanceCount(
client,
regionServerCount,
restartTime)
hbaseStat = waitForHBaseRegionServerCount(
client,
clustername,
regionServerCount,
restartTime)
} else {
//expect the cluster to have failed
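          // past the threshold the AM should be down, so reading the cluster
          // description should fail with "application not running"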
try {
            def finalCD = client.getClusterDescription(clustername)
            // reaching this point means the AM is still up: the threshold was ignored
            describe("failure threshold ignored")
            dumpClusterDescription("expected the cluster to have failed", finalCD)
            describe("stopping cluster")
maybeStopCluster(
client,
"",
"stopping cluster that isn't failing correctly")
fail("AM had not failed after $restarts worker kills")
} catch (BadClusterStateException e) {
assertExceptionDetails(e,
SliderExitCodes.EXIT_BAD_STATE,
ErrorStrings.E_APPLICATION_NOT_RUNNING)
            // success: the AM went down as expected
            break;
}
}
}
} catch (BadClusterStateException e) {
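      // the application may already have stopped when the loop probed for
      // workers; that is still the expected failure mode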
assertExceptionDetails(e,
SliderExitCodes.EXIT_BAD_STATE,
ErrorStrings.E_APPLICATION_NOT_RUNNING)
}
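    // the final application report should record the failure and cite
    // cluster instability in its diagnostics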
ApplicationReport report = client.applicationReport
log.info(report.diagnostics)
assert report.finalApplicationStatus == FinalApplicationStatus.FAILED
assert report.diagnostics.contains(ErrorStrings.E_UNSTABLE_CLUSTER)
}
}