blob: 251e0228b49a71c4821a6523817373988512dc38 [file] [log] [blame]
/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master;
import static org.junit.Assert.assertEquals;
import java.io.IOException;
import java.util.List;
import java.util.NavigableSet;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MasterTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
import org.junit.Test;
import org.junit.experimental.categories.Category;
/**
* Tests the restarting of everything as done during rolling restarts.
*/
@Category({MasterTests.class, LargeTests.class})
public class TestRollingRestart {
private static final Log LOG = LogFactory.getLog(TestRollingRestart.class);
@Test (timeout=500000)
public void testBasicRollingRestart() throws Exception {
// Start a cluster with 2 masters and 4 regionservers
final int NUM_MASTERS = 2;
final int NUM_RS = 3;
final int NUM_REGIONS_TO_CREATE = 20;
int expectedNumRS = 3;
// Start the cluster
log("Starting cluster");
Configuration conf = HBaseConfiguration.create();
HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
log("Waiting for active/ready master");
cluster.waitForActiveAndReadyMaster();
// Create a table with regions
TableName table = TableName.valueOf("tableRestart");
byte [] family = Bytes.toBytes("family");
log("Creating table with " + NUM_REGIONS_TO_CREATE + " regions");
Table ht = TEST_UTIL.createMultiRegionTable(table, family, NUM_REGIONS_TO_CREATE);
int numRegions = -1;
try (RegionLocator r = TEST_UTIL.getConnection().getRegionLocator(table)) {
numRegions = r.getStartKeys().length;
}
numRegions += 1; // catalogs
log("Waiting for no more RIT\n");
TEST_UTIL.waitUntilNoRegionsInTransition(60000);
log("Disabling table\n");
TEST_UTIL.getHBaseAdmin().disableTable(table);
log("Waiting for no more RIT\n");
TEST_UTIL.waitUntilNoRegionsInTransition(60000);
NavigableSet<String> regions = HBaseTestingUtility.getAllOnlineRegions(cluster);
log("Verifying only catalog and namespace regions are assigned\n");
if (regions.size() != 2) {
for (String oregion : regions) log("Region still online: " + oregion);
}
assertEquals(2, regions.size());
log("Enabling table\n");
TEST_UTIL.getHBaseAdmin().enableTable(table);
log("Waiting for no more RIT\n");
TEST_UTIL.waitUntilNoRegionsInTransition(60000);
log("Verifying there are " + numRegions + " assigned on cluster\n");
regions = HBaseTestingUtility.getAllOnlineRegions(cluster);
assertRegionsAssigned(cluster, regions);
assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
// Add a new regionserver
log("Adding a fourth RS");
RegionServerThread restarted = cluster.startRegionServer();
expectedNumRS++;
restarted.waitForServerOnline();
log("Additional RS is online");
log("Waiting for no more RIT");
TEST_UTIL.waitUntilNoRegionsInTransition(60000);
log("Verifying there are " + numRegions + " assigned on cluster");
assertRegionsAssigned(cluster, regions);
assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
// Master Restarts
List<MasterThread> masterThreads = cluster.getMasterThreads();
MasterThread activeMaster = null;
MasterThread backupMaster = null;
assertEquals(2, masterThreads.size());
if (masterThreads.get(0).getMaster().isActiveMaster()) {
activeMaster = masterThreads.get(0);
backupMaster = masterThreads.get(1);
} else {
activeMaster = masterThreads.get(1);
backupMaster = masterThreads.get(0);
}
// Bring down the backup master
log("Stopping backup master\n\n");
backupMaster.getMaster().stop("Stop of backup during rolling restart");
cluster.hbaseCluster.waitOnMaster(backupMaster);
// Bring down the primary master
log("Stopping primary master\n\n");
activeMaster.getMaster().stop("Stop of active during rolling restart");
cluster.hbaseCluster.waitOnMaster(activeMaster);
// Start primary master
log("Restarting primary master\n\n");
activeMaster = cluster.startMaster();
cluster.waitForActiveAndReadyMaster();
// Start backup master
log("Restarting backup master\n\n");
backupMaster = cluster.startMaster();
assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
// RegionServer Restarts
// Bring them down, one at a time, waiting between each to complete
List<RegionServerThread> regionServers =
cluster.getLiveRegionServerThreads();
int num = 1;
int total = regionServers.size();
for (RegionServerThread rst : regionServers) {
ServerName serverName = rst.getRegionServer().getServerName();
log("Stopping region server " + num + " of " + total + " [ " +
serverName + "]");
rst.getRegionServer().stop("Stopping RS during rolling restart");
cluster.hbaseCluster.waitOnRegionServer(rst);
log("Waiting for RS shutdown to be handled by master");
waitForRSShutdownToStartAndFinish(activeMaster, serverName);
log("RS shutdown done, waiting for no more RIT");
TEST_UTIL.waitUntilNoRegionsInTransition(60000);
log("Verifying there are " + numRegions + " assigned on cluster");
assertRegionsAssigned(cluster, regions);
expectedNumRS--;
assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
log("Restarting region server " + num + " of " + total);
restarted = cluster.startRegionServer();
restarted.waitForServerOnline();
expectedNumRS++;
log("Region server " + num + " is back online");
log("Waiting for no more RIT");
TEST_UTIL.waitUntilNoRegionsInTransition(60000);
log("Verifying there are " + numRegions + " assigned on cluster");
assertRegionsAssigned(cluster, regions);
assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
num++;
}
Thread.sleep(1000);
assertRegionsAssigned(cluster, regions);
// TODO: Bring random 3 of 4 RS down at the same time
ht.close();
// Stop the cluster
TEST_UTIL.shutdownMiniCluster();
}
private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster,
ServerName serverName) throws InterruptedException {
ServerManager sm = activeMaster.getMaster().getServerManager();
// First wait for it to be in dead list
while (!sm.getDeadServers().isDeadServer(serverName)) {
log("Waiting for [" + serverName + "] to be listed as dead in master");
Thread.sleep(1);
}
log("Server [" + serverName + "] marked as dead, waiting for it to " +
"finish dead processing");
while (sm.areDeadServersInProgress()) {
log("Server [" + serverName + "] still being processed, waiting");
Thread.sleep(100);
}
log("Server [" + serverName + "] done with server shutdown processing");
}
private void log(String msg) {
LOG.debug("\n\nTRR: " + msg + "\n");
}
private int getNumberOfOnlineRegions(MiniHBaseCluster cluster) {
int numFound = 0;
for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
numFound += rst.getRegionServer().getNumberOfOnlineRegions();
}
for (MasterThread mt : cluster.getMasterThreads()) {
numFound += mt.getMaster().getNumberOfOnlineRegions();
}
return numFound;
}
private void assertRegionsAssigned(MiniHBaseCluster cluster,
Set<String> expectedRegions) throws IOException {
int numFound = getNumberOfOnlineRegions(cluster);
if (expectedRegions.size() > numFound) {
log("Expected to find " + expectedRegions.size() + " but only found"
+ " " + numFound);
NavigableSet<String> foundRegions =
HBaseTestingUtility.getAllOnlineRegions(cluster);
for (String region : expectedRegions) {
if (!foundRegions.contains(region)) {
log("Missing region: " + region);
}
}
assertEquals(expectedRegions.size(), numFound);
} else if (expectedRegions.size() < numFound) {
int doubled = numFound - expectedRegions.size();
log("Expected to find " + expectedRegions.size() + " but found"
+ " " + numFound + " (" + doubled + " double assignments?)");
NavigableSet<String> doubleRegions = getDoubleAssignedRegions(cluster);
for (String region : doubleRegions) {
log("Region is double assigned: " + region);
}
assertEquals(expectedRegions.size(), numFound);
} else {
log("Success! Found expected number of " + numFound + " regions");
}
}
private NavigableSet<String> getDoubleAssignedRegions(
MiniHBaseCluster cluster) throws IOException {
NavigableSet<String> online = new TreeSet<String>();
NavigableSet<String> doubled = new TreeSet<String>();
for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
for (HRegionInfo region : ProtobufUtil.getOnlineRegions(
rst.getRegionServer().getRSRpcServices())) {
if(!online.add(region.getRegionNameAsString())) {
doubled.add(region.getRegionNameAsString());
}
}
}
return doubled;
}
}