blob: e43ad81a9f59f32e8db9fa611717844bf56e6685 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.slider.providers.hbase.minicluster.failures
import groovy.transform.CompileStatic
import groovy.util.logging.Slf4j
import org.apache.hadoop.hbase.ClusterStatus
import org.apache.hadoop.yarn.api.records.ApplicationReport
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus
import org.apache.slider.core.main.ServiceLauncher
import org.apache.slider.common.SliderExitCodes
import org.apache.slider.api.ClusterDescription
import org.apache.slider.api.OptionKeys
import org.apache.slider.core.exceptions.BadClusterStateException
import org.apache.slider.core.exceptions.ErrorStrings
import org.apache.slider.common.params.Arguments
import org.apache.slider.client.SliderClient
import org.apache.slider.providers.hbase.minicluster.HBaseMiniClusterTestBase
import org.junit.Test
/**
* test that if a container is killed too many times,
* the AM stays down
*/
@CompileStatic
@Slf4j
class TestFailureThreshold extends HBaseMiniClusterTestBase {
@Test
public void testFailedRegionService() throws Throwable {
failureThresholdTestRun("test_failure_threshold", true, 2, 5)
}
private void failureThresholdTestRun(
String testName,
boolean toKill,
int threshold,
int killAttempts) {
String clustername = testName
String action = toKill ? "kill" : "stop"
int regionServerCount = 2
createMiniCluster(clustername, configuration, 1, 1, 1, true, true)
describe(
"Create a single region service cluster then " + action + " the RS");
//now launch the cluster
ServiceLauncher<SliderClient> launcher = createHBaseCluster(
clustername,
regionServerCount,
[
Arguments.ARG_OPTION, OptionKeys.INTERNAL_CONTAINER_FAILURE_THRESHOLD,
Integer.toString(threshold)
],
true,
true)
SliderClient client = launcher.service
addToTeardown(client);
ClusterDescription status = client.getClusterDescription(clustername)
ClusterStatus clustat = basicHBaseClusterStartupSequence(client)
ClusterStatus hbaseStat
try {
for (restarts in 1..killAttempts) {
status = waitForWorkerInstanceCount(
client,
regionServerCount,
hbaseClusterStartupToLiveTime)
//get the hbase status
/*
hbaseStat = waitForHBaseRegionServerCount(
client,
clustername,
regionServerCount,
HBASE_CLUSTER_STARTUP_TO_LIVE_TIME)
log.info("Initial cluster status : ${hbaseStatusToString(hbaseStat)}");
*/
describe("running processes")
lsJavaProcesses()
describe("about to " + action + " servers")
if (toKill) {
killAllRegionServers()
} else {
stopAllRegionServers()
}
//sleep a bit
sleep(toKill ? 15000 : 25000);
describe("waiting for recovery")
//and expect a recovery
if (restarts < threshold) {
def restartTime = 1000
status = waitForWorkerInstanceCount(
client,
regionServerCount,
restartTime)
hbaseStat = waitForHBaseRegionServerCount(
client,
clustername,
regionServerCount,
restartTime)
} else {
//expect the cluster to have failed
try {
def finalCD = client.getClusterDescription(clustername)
dumpClusterDescription("expected the AM to have failed", finalCD)
fail("AM had not failed after $restarts worker kills")
} catch (BadClusterStateException e) {
assert e.toString().contains(ErrorStrings.E_APPLICATION_NOT_RUNNING)
assert e.exitCode == SliderExitCodes.EXIT_BAD_STATE
//success
break;
}
}
}
} catch (BadClusterStateException e) {
assert e.toString().contains(ErrorStrings.E_APPLICATION_NOT_RUNNING)
assert e.exitCode == SliderExitCodes.EXIT_BAD_STATE
}
ApplicationReport report = client.applicationReport
log.info(report.diagnostics)
assert report.finalApplicationStatus == FinalApplicationStatus.FAILED
assert report.diagnostics.contains(ErrorStrings.E_UNSTABLE_CLUSTER)
}
}