| /** |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.hadoop.ha; |
| |
| import java.util.Random; |
| |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.CommonConfigurationKeys; |
| import org.apache.hadoop.util.Time; |
| import org.junit.After; |
| import org.junit.Before; |
| import org.junit.Test; |
| import org.mockito.Mockito; |
| import org.mockito.invocation.InvocationOnMock; |
| import org.mockito.stubbing.Answer; |
| |
| /** |
| * Stress test for ZKFailoverController. |
| * Starts multiple ZKFCs for dummy services, and then performs many automatic |
| * failovers. While doing so, ensures that a fake "shared resource" |
| * (simulating the shared edits dir) is only owned by one service at a time. |
| */ |
| public class TestZKFailoverControllerStress extends ClientBaseWithFixes { |
| |
| private static final int STRESS_RUNTIME_SECS = 30; |
| private static final int EXTRA_TIMEOUT_SECS = 10; |
| |
| private Configuration conf; |
| private MiniZKFCCluster cluster; |
| |
| @Before |
| public void setupConfAndServices() throws Exception { |
| conf = new Configuration(); |
| conf.set(ZKFailoverController.ZK_QUORUM_KEY, hostPort); |
| this.cluster = new MiniZKFCCluster(conf, getServer(serverFactory)); |
| } |
| |
| @After |
| public void stopCluster() throws Exception { |
| if (cluster != null) { |
| cluster.stop(); |
| } |
| } |
| |
| /** |
| * Simply fail back and forth between two services for the |
| * configured amount of time, via expiring their ZK sessions. |
| */ |
| @Test(timeout=(STRESS_RUNTIME_SECS + EXTRA_TIMEOUT_SECS) * 1000) |
| public void testExpireBackAndForth() throws Exception { |
| cluster.start(); |
| long st = Time.now(); |
| long runFor = STRESS_RUNTIME_SECS * 1000; |
| |
| int i = 0; |
| while (Time.now() - st < runFor) { |
| // flip flop the services back and forth |
| int from = i % 2; |
| int to = (i + 1) % 2; |
| |
| // Expire one service, it should fail over to the other |
| LOG.info("Failing over via expiration from " + from + " to " + to); |
| cluster.expireAndVerifyFailover(from, to); |
| |
| i++; |
| } |
| } |
| |
| /** |
| * Randomly expire the ZK sessions of the two ZKFCs. This differs |
| * from the above test in that it is not a controlled failover - |
| * we just do random expirations and expect neither one to ever |
| * generate fatal exceptions. |
| */ |
| @Test(timeout=(STRESS_RUNTIME_SECS + EXTRA_TIMEOUT_SECS) * 1000) |
| public void testRandomExpirations() throws Exception { |
| cluster.start(); |
| long st = Time.now(); |
| long runFor = STRESS_RUNTIME_SECS * 1000; |
| |
| Random r = new Random(); |
| while (Time.now() - st < runFor) { |
| cluster.getTestContext().checkException(); |
| int targetIdx = r.nextInt(2); |
| ActiveStandbyElector target = cluster.getElector(targetIdx); |
| long sessId = target.getZKSessionIdForTests(); |
| if (sessId != -1) { |
| LOG.info(String.format("Expiring session %x for svc %d", |
| sessId, targetIdx)); |
| getServer(serverFactory).closeSession(sessId); |
| } |
| Thread.sleep(r.nextInt(300)); |
| } |
| } |
| |
| /** |
| * Have the services fail their health checks half the time, |
| * causing the master role to bounce back and forth in the |
| * cluster. Meanwhile, causes ZK to disconnect clients every |
| * 50ms, to trigger the retry code and failures to become active. |
| */ |
| @Test(timeout=(STRESS_RUNTIME_SECS + EXTRA_TIMEOUT_SECS) * 1000) |
| public void testRandomHealthAndDisconnects() throws Exception { |
| long runFor = STRESS_RUNTIME_SECS * 1000; |
| Mockito.doAnswer(new RandomlyThrow(0)) |
| .when(cluster.getService(0).proxy).monitorHealth(); |
| Mockito.doAnswer(new RandomlyThrow(1)) |
| .when(cluster.getService(1).proxy).monitorHealth(); |
| conf.setInt(CommonConfigurationKeys.HA_FC_ELECTOR_ZK_OP_RETRIES_KEY, 100); |
| // Don't start until after the above mocking. Otherwise we can get |
| // Mockito errors if the HM calls the proxy in the middle of |
| // setting up the mock. |
| cluster.start(); |
| |
| long st = Time.now(); |
| while (Time.now() - st < runFor) { |
| cluster.getTestContext().checkException(); |
| serverFactory.closeAll(); |
| Thread.sleep(50); |
| } |
| } |
| |
| |
| /** |
| * Randomly throw an exception half the time the method is called |
| */ |
| @SuppressWarnings("rawtypes") |
| private static class RandomlyThrow implements Answer { |
| private Random r = new Random(); |
| private final int svcIdx; |
| public RandomlyThrow(int svcIdx) { |
| this.svcIdx = svcIdx; |
| } |
| @Override |
| public Object answer(InvocationOnMock invocation) throws Throwable { |
| if (r.nextBoolean()) { |
| LOG.info("Throwing an exception for svc " + svcIdx); |
| throw new HealthCheckFailedException("random failure"); |
| } |
| return invocation.callRealMethod(); |
| } |
| } |
| } |