| /** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.hadoop.hbase.regionserver; |
| |
| import static org.junit.Assert.assertEquals; |
| import static org.junit.Assert.assertTrue; |
| |
| import java.io.IOException; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.concurrent.atomic.AtomicBoolean; |
| import java.util.concurrent.atomic.AtomicReference; |
| |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.hbase.CategoryBasedTimeout; |
| import org.apache.hadoop.hbase.CoordinatedStateManager; |
| import org.apache.hadoop.hbase.HBaseConfiguration; |
| import org.apache.hadoop.hbase.HBaseTestingUtility; |
| import org.apache.hadoop.hbase.HRegionInfo; |
| import org.apache.hadoop.hbase.LocalHBaseCluster; |
| import org.apache.hadoop.hbase.MiniHBaseCluster; |
| import org.apache.hadoop.hbase.ServerName; |
| import org.apache.hadoop.hbase.master.HMaster; |
| import org.apache.hadoop.hbase.master.LoadBalancer; |
| import org.apache.hadoop.hbase.master.ServerListener; |
| import org.apache.hadoop.hbase.master.ServerManager; |
| import org.apache.hadoop.hbase.master.balancer.BaseLoadBalancer; |
| import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionServerStartupResponse; |
| import org.apache.hadoop.hbase.testclassification.MediumTests; |
| import org.apache.hadoop.hbase.testclassification.RegionServerTests; |
| import org.apache.hadoop.hbase.util.Bytes; |
| import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread; |
| import org.apache.hadoop.hbase.util.Threads; |
| import org.junit.Rule; |
| import org.junit.Test; |
| import org.junit.experimental.categories.Category; |
| import org.junit.rules.TestName; |
| import org.junit.rules.TestRule; |
| |
| /** |
| * Tests that a regionserver that dies after reporting for duty gets removed |
| * from list of online regions. See HBASE-9593. |
| */ |
| @Category({RegionServerTests.class, MediumTests.class}) |
| public class TestRSKilledWhenInitializing { |
| private static final Log LOG = LogFactory.getLog(TestRSKilledWhenInitializing.class); |
| @Rule public TestName testName = new TestName(); |
| @Rule public final TestRule timeout = CategoryBasedTimeout.builder(). |
| withTimeout(this.getClass()).withLookingForStuckThread(true).build(); |
| |
| // This boolean needs to be globally available. It is used below in our |
| // mocked up regionserver so it knows when to die. |
| private static AtomicBoolean masterActive = new AtomicBoolean(false); |
| // Ditto for this variable. It also is used in the mocked regionserver class. |
| private static final AtomicReference<ServerName> killedRS = new AtomicReference<ServerName>(); |
| |
| private static final int NUM_MASTERS = 1; |
| private static final int NUM_RS = 2; |
| |
| /** |
| * Test verifies whether a region server is removed from online servers list in master if it went |
| * down after registering with master. Test will TIMEOUT if an error!!!! |
| * @throws Exception |
| */ |
| @Test |
| public void testRSTerminationAfterRegisteringToMasterBeforeCreatingEphemeralNode() |
| throws Exception { |
| // Create config to use for this cluster |
| Configuration conf = HBaseConfiguration.create(); |
| conf.setInt(ServerManager.WAIT_ON_REGIONSERVERS_MINTOSTART, 1); |
| // Start the cluster |
| final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf); |
| TEST_UTIL.startMiniDFSCluster(3); |
| TEST_UTIL.startMiniZKCluster(); |
| TEST_UTIL.createRootDir(); |
| final LocalHBaseCluster cluster = |
| new LocalHBaseCluster(conf, NUM_MASTERS, NUM_RS, HMaster.class, |
| RegisterAndDieRegionServer.class); |
| final MasterThread master = startMaster(cluster.getMasters().get(0)); |
| try { |
| // Master is up waiting on RegionServers to check in. Now start RegionServers. |
| for (int i = 0; i < NUM_RS; i++) { |
| cluster.getRegionServers().get(i).start(); |
| } |
| // Expected total regionservers depends on whether Master can host regions or not. |
| int expectedTotalRegionServers = NUM_RS + (LoadBalancer.isTablesOnMaster(conf)? 1: 0); |
| List<ServerName> onlineServersList = null; |
| do { |
| onlineServersList = master.getMaster().getServerManager().getOnlineServersList(); |
| } while (onlineServersList.size() < expectedTotalRegionServers); |
| // Wait until killedRS is set. Means RegionServer is starting to go down. |
| while (killedRS.get() == null) { |
| Threads.sleep(1); |
| } |
| // Wait on the RegionServer to fully die. |
| while (cluster.getLiveRegionServers().size() >= expectedTotalRegionServers) { |
| Threads.sleep(1); |
| } |
| // Make sure Master is fully up before progressing. Could take a while if regions |
| // being reassigned. |
| while (!master.getMaster().isInitialized()) { |
| Threads.sleep(1); |
| } |
| |
| // Now in steady state. How many regions open? Master should have too many regionservers |
| // showing still. The downed RegionServer should still be showing as registered. |
| assertTrue(master.getMaster().getServerManager().isServerOnline(killedRS.get())); |
| // Find non-meta region (namespace?) and assign to the killed server. That'll trigger cleanup. |
| Map<HRegionInfo, ServerName> assignments = null; |
| do { |
| assignments = master.getMaster().getAssignmentManager().getRegionStates().getRegionAssignments(); |
| } while (assignments == null || assignments.size() < 2); |
| HRegionInfo hri = null; |
| for (Map.Entry<HRegionInfo, ServerName> e: assignments.entrySet()) { |
| if (e.getKey().isMetaRegion()) continue; |
| hri = e.getKey(); |
| break; |
| } |
| // Try moving region to the killed server. It will fail. As by-product, we will |
| // remove the RS from Master online list because no corresponding znode. |
| assertEquals(expectedTotalRegionServers, |
| master.getMaster().getServerManager().getOnlineServersList().size()); |
| LOG.info("Move " + hri.getEncodedName() + " to " + killedRS.get()); |
| master.getMaster().move(hri.getEncodedNameAsBytes(), |
| Bytes.toBytes(killedRS.get().toString())); |
| // Wait until the RS no longer shows as registered in Master. |
| while (onlineServersList.size() > (NUM_RS + 1)) { |
| Thread.sleep(100); |
| onlineServersList = master.getMaster().getServerManager().getOnlineServersList(); |
| } |
| } finally { |
| // Shutdown is messy with complaints about fs being closed. Why? TODO. |
| cluster.shutdown(); |
| cluster.join(); |
| TEST_UTIL.shutdownMiniDFSCluster(); |
| TEST_UTIL.shutdownMiniZKCluster(); |
| TEST_UTIL.cleanupTestDir(); |
| } |
| } |
| |
| /** |
| * Start Master. Get as far as the state where Master is waiting on |
| * RegionServers to check in, then return. |
| */ |
| private MasterThread startMaster(MasterThread master) { |
| master.start(); |
| // It takes a while until ServerManager creation to happen inside Master startup. |
| while (master.getMaster().getServerManager() == null) { |
| continue; |
| } |
| // Set a listener for the waiting-on-RegionServers state. We want to wait |
| // until this condition before we leave this method and start regionservers. |
| final AtomicBoolean waiting = new AtomicBoolean(false); |
| if (master.getMaster().getServerManager() == null) throw new NullPointerException("SM"); |
| master.getMaster().getServerManager().registerListener(new ServerListener() { |
| @Override |
| public void waiting() { |
| waiting.set(true); |
| } |
| }); |
| // Wait until the Master gets to place where it is waiting on RegionServers to check in. |
| while (!waiting.get()) { |
| continue; |
| } |
| // Set the global master-is-active; gets picked up by regionservers later. |
| masterActive.set(true); |
| return master; |
| } |
| |
| /** |
| * A RegionServer that reports for duty and then immediately dies if it is the first to receive |
| * the response to a reportForDuty. When it dies, it clears its ephemeral znode which the master |
| * notices and so removes the region from its set of online regionservers. |
| */ |
| static class RegisterAndDieRegionServer extends MiniHBaseCluster.MiniHBaseClusterRegionServer { |
| public RegisterAndDieRegionServer(Configuration conf, CoordinatedStateManager cp) |
| throws IOException, InterruptedException { |
| super(conf, cp); |
| } |
| |
| @Override |
| protected void handleReportForDutyResponse(RegionServerStartupResponse c) |
| throws IOException { |
| if (killedRS.compareAndSet(null, getServerName())) { |
| // Make sure Master is up so it will see the removal of the ephemeral znode for this RS. |
| while (!masterActive.get()) { |
| Threads.sleep(100); |
| } |
| super.kill(); |
| } else { |
| super.handleReportForDutyResponse(c); |
| } |
| } |
| } |
| } |