| /** |
| * |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.hadoop.hbase.master; |
| |
| import java.io.IOException; |
| |
| import org.apache.commons.logging.Log; |
| import org.apache.commons.logging.LogFactory; |
| import org.apache.hadoop.hbase.HBaseTestingUtility; |
| import org.apache.hadoop.hbase.HConstants; |
| import org.apache.hadoop.hbase.HRegionInfo; |
| import org.apache.hadoop.hbase.LargeTests; |
| import org.apache.hadoop.hbase.client.HTable; |
| import org.apache.hadoop.hbase.client.Put; |
| import org.apache.hadoop.hbase.client.Result; |
| import org.apache.hadoop.hbase.client.ResultScanner; |
| import org.apache.hadoop.hbase.client.Scan; |
| import org.apache.hadoop.hbase.util.Bytes; |
| import org.junit.AfterClass; |
| import org.junit.Assert; |
| import org.junit.Before; |
| import org.junit.BeforeClass; |
| import org.junit.Ignore; |
| import org.junit.Test; |
| import org.junit.experimental.categories.Category; |
| |
| /** |
| * Test transitions of state across the master. Sets up the cluster once and |
| * then runs a couple of tests. |
| */ |
| @Category(LargeTests.class) |
| public class TestMasterTransitions { |
| // Logger shared by the tests and helpers in this class. |
| private static final Log LOG = LogFactory.getLog(TestMasterTransitions.class); |
| // Testing utility used below to start and shut down the mini cluster and to create the table. |
| private static final HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); |
| // Name of the table created in beforeAllTests() and opened by the helper methods. |
| private static final String TABLENAME = "master_transitions"; |
| // Column families of the test table ("a", "b", "c"); the first entry doubles as the |
| // test family and the test qualifier (see getTestFamily() and getTestQualifier()). |
| private static final byte [][] FAMILIES = new byte [][] {Bytes.toBytes("a"), |
| Bytes.toBytes("b"), Bytes.toBytes("c")}; |
| |
| /** |
|  * Starts the mini cluster (via {@code startMiniCluster(2)}), creates the test |
|  * table with its three families, has the testing utility create multiple |
|  * regions for it, waits until they are assigned and then writes one row at |
|  * each region's start key. |
|  * @throws Exception if the cluster cannot be started or the table cannot be prepared |
|  */ |
| @BeforeClass public static void beforeAllTests() throws Exception { |
|   TEST_UTIL.getConfiguration().setBoolean("dfs.support.append", true); |
|   TEST_UTIL.startMiniCluster(2); |
|   // Create a table of three families. This will assign a region. |
|   TEST_UTIL.createTable(Bytes.toBytes(TABLENAME), FAMILIES); |
|   HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME); |
|   try { |
|     int countOfRegions = TEST_UTIL.createMultiRegions(t, getTestFamily()); |
|     TEST_UTIL.waitUntilAllRegionsAssigned(countOfRegions); |
|     addToEachStartKey(countOfRegions); |
|   } finally { |
|     // Release the table even when region creation, assignment or the |
|     // row-count assertion in addToEachStartKey fails. |
|     t.close(); |
|   } |
| } |
| |
| /** |
|  * Shuts the shared mini cluster down after the tests in this class have run. |
|  * @throws Exception if the shutdown fails |
|  */ |
| @AfterClass |
| public static void afterAllTests() throws Exception { |
|   TEST_UTIL.shutdownMiniCluster(); |
| } |
| |
| /** |
|  * Runs before each test and asks the testing utility to make sure region |
|  * servers are available (2 requested). |
|  * @throws IOException if the testing utility fails while doing so |
|  */ |
| @Before |
| public void setup() throws IOException { |
|   final int wantedRegionServers = 2; |
|   TEST_UTIL.ensureSomeRegionServersAvailable(wantedRegionServers); |
| } |
| |
| /** |
| * Listener for regionserver events testing hbase-2428 (Infinite loop of |
| * region closes if META region is offline). In particular, listen |
| * for the close of the 'metaServer' and when it comes in, requeue it with a |
| * delay as though there were an issue processing the shutdown. As part of |
| * the requeuing, send over a close of a region on 'otherServer' so it comes |
| * into a master that has its meta region marked as offline. |
| */ |
| /* |
| static class HBase2428Listener implements RegionServerOperationListener { |
| // Set of operations we've already delayed so we don't delay them repeatedly. |
| private final Set<RegionServerOperation> postponed = |
| new CopyOnWriteArraySet<RegionServerOperation>(); |
| private boolean done = false;; |
| private boolean metaShutdownReceived = false; |
| private final HServerAddress metaAddress; |
| private final MiniHBaseCluster cluster; |
| private final int otherServerIndex; |
| private final HRegionInfo hri; |
| private int closeCount = 0; |
| static final int SERVER_DURATION = 3 * 1000; |
| static final int CLOSE_DURATION = 1 * 1000; |
| |
| HBase2428Listener(final MiniHBaseCluster c, final HServerAddress metaAddress, |
| final HRegionInfo closingHRI, final int otherServerIndex) { |
| this.cluster = c; |
| this.metaAddress = metaAddress; |
| this.hri = closingHRI; |
| this.otherServerIndex = otherServerIndex; |
| } |
| |
| @Override |
| public boolean process(final RegionServerOperation op) throws IOException { |
| // If this is a regionserver shutdown and it's of the meta server, then we want to |
| // delay the processing of the shutdown and send off a close of a region on |
| // the 'otherServer'. |
| boolean result = true; |
| if (op instanceof ProcessServerShutdown) { |
| ProcessServerShutdown pss = (ProcessServerShutdown)op; |
| if (pss.getDeadServerAddress().equals(this.metaAddress)) { |
| // Don't postpone more than once. |
| if (!this.postponed.contains(pss)) { |
| // Close some region. |
| this.cluster.addMessageToSendRegionServer(this.otherServerIndex, |
| new HMsg(HMsg.Type.MSG_REGION_CLOSE, hri, |
| Bytes.toBytes("Forcing close in test"))); |
| this.postponed.add(pss); |
| // Put off the processing of the regionserver shutdown processing. |
| pss.setDelay(SERVER_DURATION); |
| this.metaShutdownReceived = true; |
| // Return false. This will add this op to the delayed queue. |
| result = false; |
| } |
| } |
| } else { |
| // Have the close run frequently. |
| if (isWantedCloseOperation(op) != null) { |
| op.setDelay(CLOSE_DURATION); |
| // Count how many times it comes through here. |
| this.closeCount++; |
| } |
| } |
| return result; |
| } |
| |
| public void processed(final RegionServerOperation op) { |
| if (isWantedCloseOperation(op) != null) return; |
| this.done = true; |
| } |
| */ |
| /* |
| * @param op |
| * @return Null if not the wanted ProcessRegionClose, else <code>op</code> |
| * cast as a ProcessRegionClose. |
| */ |
| /* |
| private ProcessRegionClose isWantedCloseOperation(final RegionServerOperation op) { |
| // Count every time we get a close operation. |
| if (op instanceof ProcessRegionClose) { |
| ProcessRegionClose c = (ProcessRegionClose)op; |
| if (c.regionInfo.equals(hri)) { |
| return c; |
| } |
| } |
| return null; |
| } |
| |
| boolean isDone() { |
| return this.done; |
| } |
| |
| boolean isMetaShutdownReceived() { |
| return metaShutdownReceived; |
| } |
| |
| int getCloseCount() { |
| return this.closeCount; |
| } |
| |
| @Override |
| public boolean process(HServerInfo serverInfo, HMsg incomingMsg) { |
| return true; |
| } |
| } |
| */ |
| /** |
|  * Scenario from HBASE-2428: the meta region has just been set offline and |
|  * then a region close comes in. |
|  * <p>The test is currently ignored and its body is commented out; the |
|  * disabled body relies on {@code HBase2428Listener}, which is also commented |
|  * out in this file. |
|  * @throws Exception not thrown while the body is commented out |
|  * @see <a href="https://issues.apache.org/jira/browse/HBASE-2428">HBASE-2428</a> |
|  */ |
| @Ignore |
| @Test(timeout = 300000) |
| public void testRegionCloseWhenNoMetaHBase2428() throws Exception { |
|   /* |
|   LOG.info("Running testRegionCloseWhenNoMetaHBase2428"); |
|   MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); |
|   final HMaster master = cluster.getMaster(); |
|   int metaIndex = cluster.getServerWithMeta(); |
|   // Figure the index of the server that is not serving the .META. |
|   int otherServerIndex = -1; |
|   for (int i = 0; i < cluster.getRegionServerThreads().size(); i++) { |
|     if (i == metaIndex) continue; |
|     otherServerIndex = i; |
|     break; |
|   } |
|   final HRegionServer otherServer = cluster.getRegionServer(otherServerIndex); |
|   final HRegionServer metaHRS = cluster.getRegionServer(metaIndex); |
| |
|   // Get a region out on the otherServer. |
|   final HRegionInfo hri = |
|     otherServer.getOnlineRegions().iterator().next().getRegionInfo(); |
| |
|   // Add our RegionServerOperationsListener |
|   HBase2428Listener listener = new HBase2428Listener(cluster, |
|     metaHRS.getHServerInfo().getServerAddress(), hri, otherServerIndex); |
|   master.getRegionServerOperationQueue(). |
|     registerRegionServerOperationListener(listener); |
|   try { |
|     // Now close the server carrying meta. |
|     cluster.abortRegionServer(metaIndex); |
| |
|     // First wait on receipt of meta server shutdown message. |
|     while(!listener.metaShutdownReceived) Threads.sleep(100); |
|     while(!listener.isDone()) Threads.sleep(10); |
|     // We should not have retried the close more times than it took for the |
|     // server shutdown message to exit the delay queue and get processed |
|     // (Multiply by two to add in some slop in case of GC or something). |
|     assertTrue(listener.getCloseCount() > 1); |
|     assertTrue(listener.getCloseCount() < |
|       ((HBase2428Listener.SERVER_DURATION/HBase2428Listener.CLOSE_DURATION) * 2)); |
| |
|     // Assert the closed region came back online |
|     assertRegionIsBackOnline(hri); |
|   } finally { |
|     master.getRegionServerOperationQueue(). |
|       unregisterRegionServerOperationListener(listener); |
|   } |
|   */ |
| } |
| |
| /** |
|  * Adds a new server before the old one on the same host+port is dead. The |
|  * test is made more onerous by having the server under test carry the meta: |
|  * if old and new are confused, meta purportedly never comes back, so the |
|  * test checks that meta gets redeployed. |
|  * <p>The test is currently ignored and its body is commented out. |
|  * @throws IOException not thrown while the body is commented out |
|  */ |
| @Ignore |
| @Test(timeout = 300000) |
| public void testAddingServerBeforeOldIsDead2413() throws IOException { |
|   /* |
|   LOG.info("Running testAddingServerBeforeOldIsDead2413"); |
|   MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); |
|   int count = count(); |
|   int metaIndex = cluster.getServerWithMeta(); |
|   MiniHBaseClusterRegionServer metaHRS = |
|     (MiniHBaseClusterRegionServer)cluster.getRegionServer(metaIndex); |
|   int port = metaHRS.getServerInfo().getServerAddress().getPort(); |
|   Configuration c = TEST_UTIL.getConfiguration(); |
|   String oldPort = c.get(HConstants.REGIONSERVER_PORT, "0"); |
|   try { |
|     LOG.info("KILLED=" + metaHRS); |
|     metaHRS.kill(); |
|     c.set(HConstants.REGIONSERVER_PORT, Integer.toString(port)); |
|     // Try and start new regionserver. It might clash with the old |
|     // regionserver port so keep trying to get past the BindException. |
|     HRegionServer hrs = null; |
|     while (true) { |
|       try { |
|         hrs = cluster.startRegionServer().getRegionServer(); |
|         break; |
|       } catch (IOException e) { |
|         if (e.getCause() != null && e.getCause() instanceof InvocationTargetException) { |
|           InvocationTargetException ee = (InvocationTargetException)e.getCause(); |
|           if (ee.getCause() != null && ee.getCause() instanceof BindException) { |
|             LOG.info("BindException; retrying: " + e.toString()); |
|           } |
|         } |
|       } |
|     } |
|     LOG.info("STARTED=" + hrs); |
|     // Wait until he's been given at least 3 regions before we go on to try |
|     // and count rows in table. |
|     while (hrs.getOnlineRegions().size() < 3) Threads.sleep(100); |
|     LOG.info(hrs.toString() + " has " + hrs.getOnlineRegions().size() + |
|       " regions"); |
|     assertEquals(count, count()); |
|   } finally { |
|     c.set(HConstants.REGIONSERVER_PORT, oldPort); |
|   } |
|   */ |
| } |
| |
| /** |
| * HBase2482 is about outstanding region openings. If any are outstanding |
| * when a regionserver goes down, then they'll never deploy. They'll be |
| * stuck in the regions-in-transition list forever. This listener looks |
| * for a region opening HMsg and if it's from the server passed on construction, |
| * then we abort that server. It also looks out for a close message on the victim |
| * server because that signifies start of the fireworks. |
| */ |
| /* |
| static class HBase2482Listener implements RegionServerOperationListener { |
| private final HRegionServer victim; |
| private boolean abortSent = false; |
| // We closed regions on new server. |
| private volatile boolean closed = false; |
| // Copy of regions on new server |
| private final Collection<HRegion> copyOfOnlineRegions; |
| // This is the region that was in transition on the server we aborted. Test |
| // passes if this region comes back online successfully. |
| private HRegionInfo regionToFind; |
| |
| HBase2482Listener(final HRegionServer victim) { |
| this.victim = victim; |
| // Copy regions currently open on this server so I can notice when |
| // there is a close. |
| this.copyOfOnlineRegions = |
| this.victim.getCopyOfOnlineRegionsSortedBySize().values(); |
| } |
| |
| @Override |
| public boolean process(HServerInfo serverInfo, HMsg incomingMsg) { |
| if (!victim.getServerInfo().equals(serverInfo) || |
| this.abortSent || !this.closed) { |
| return true; |
| } |
| if (!incomingMsg.isType(HMsg.Type.MSG_REPORT_PROCESS_OPEN)) return true; |
| // Save the region that is in transition so can test later it came back. |
| this.regionToFind = incomingMsg.getRegionInfo(); |
| String msg = "ABORTING " + this.victim + " because got a " + |
| HMsg.Type.MSG_REPORT_PROCESS_OPEN + " on this server for " + |
| incomingMsg.getRegionInfo().getRegionNameAsString(); |
| this.victim.abort(msg); |
| this.abortSent = true; |
| return true; |
| } |
| |
| @Override |
| public boolean process(RegionServerOperation op) throws IOException { |
| return true; |
| } |
| |
| @Override |
| public void processed(RegionServerOperation op) { |
| if (this.closed || !(op instanceof ProcessRegionClose)) return; |
| ProcessRegionClose close = (ProcessRegionClose)op; |
| for (HRegion r: this.copyOfOnlineRegions) { |
| if (r.getRegionInfo().equals(close.regionInfo)) { |
| // We've closed one of the regions that was on the victim server. |
| // Now can start testing for when all regions are back online again |
| LOG.info("Found close of " + |
| r.getRegionInfo().getRegionNameAsString() + |
| "; setting close happened flag"); |
| this.closed = true; |
| break; |
| } |
| } |
| } |
| } |
| */ |
| /** |
|  * Scenario from HBASE-2482: a regionserver with an opening region on it dies, |
|  * and the region is then stuck in the master's regions-in-transition and |
|  * never leaves it. |
|  * <p>As written in the disabled body, the test brings up a new regionserver |
|  * and waits for the load balancer to give it some regions. It then closes |
|  * all regions on the new server and, once all the close messages are sent, |
|  * sends the new regionserver the special blocking message so it cannot |
|  * process any more messages. Meanwhile the reopening of the just-closed |
|  * regions is backed up on the new server. As soon as the master gets an |
|  * opening region from the new regionserver, that server is killed. The test |
|  * then waits on all regions to come back online; if the bug is fixed, this |
|  * should happen as soon as processing of the killed server is done. |
|  * <p>The test is currently ignored and its body is commented out; the |
|  * disabled body relies on {@code HBase2482Listener} and on helpers that are |
|  * also commented out in this file. |
|  * @throws Exception not thrown while the body is commented out |
|  * @see <a href="https://issues.apache.org/jira/browse/HBASE-2482">HBASE-2482</a> |
|  */ |
| @Ignore |
| @Test(timeout = 300000) |
| public void testKillRSWithOpeningRegion2482() throws Exception { |
|   /* |
|   LOG.info("Running testKillRSWithOpeningRegion2482"); |
|   MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); |
|   if (cluster.getLiveRegionServerThreads().size() < 2) { |
|     // Need at least two servers. |
|     cluster.startRegionServer(); |
|   } |
|   // Count how many regions are online. They need to be all back online for |
|   // this test to succeed. |
|   int countOfMetaRegions = countOfMetaRegions(); |
|   // Add a listener on the server. |
|   HMaster m = cluster.getMaster(); |
|   // Start new regionserver. |
|   MiniHBaseClusterRegionServer hrs = |
|     (MiniHBaseClusterRegionServer)cluster.startRegionServer().getRegionServer(); |
|   LOG.info("Started new regionserver: " + hrs.toString()); |
|   // Wait until has some regions before proceeding. Balancer will give it some. |
|   int minimumRegions = |
|     countOfMetaRegions/(cluster.getRegionServerThreads().size() * 2); |
|   while (hrs.getOnlineRegions().size() < minimumRegions) Threads.sleep(100); |
|   // Set the listener only after some regions have been opened on new server. |
|   HBase2482Listener listener = new HBase2482Listener(hrs); |
|   m.getRegionServerOperationQueue(). |
|     registerRegionServerOperationListener(listener); |
|   try { |
|     // Go close all non-catalog regions on this new server |
|     closeAllNonCatalogRegions(cluster, hrs); |
|     // After all closes, add blocking message before the region opens start to |
|     // come in. |
|     cluster.addMessageToSendRegionServer(hrs, |
|       new HMsg(HMsg.Type.TESTING_BLOCK_REGIONSERVER)); |
|     // Wait till one of the above close messages has an effect before we start |
|     // wait on all regions back online. |
|     while (!listener.closed) Threads.sleep(100); |
|     LOG.info("Past close"); |
|     // Make sure the abort server message was sent. |
|     while(!listener.abortSent) Threads.sleep(100); |
|     LOG.info("Past abort send; waiting on all regions to redeploy"); |
|     // Now wait for regions to come back online. |
|     assertRegionIsBackOnline(listener.regionToFind); |
|   } finally { |
|     m.getRegionServerOperationQueue(). |
|       unregisterRegionServerOperationListener(listener); |
|   } |
|   */ |
| } |
| |
| /* |
| * @return Count of all non-catalog regions on the designated server |
| */ |
| /* |
| private int closeAllNonCatalogRegions(final MiniHBaseCluster cluster, |
| final MiniHBaseCluster.MiniHBaseClusterRegionServer hrs) |
| throws IOException { |
| int countOfRegions = 0; |
| for (HRegion r: hrs.getOnlineRegions()) { |
| if (r.getRegionInfo().isMetaRegion()) continue; |
| cluster.addMessageToSendRegionServer(hrs, |
| new HMsg(HMsg.Type.MSG_REGION_CLOSE, r.getRegionInfo())); |
| LOG.info("Sent close of " + r.getRegionInfo().getRegionNameAsString() + |
| " on " + hrs.toString()); |
| countOfRegions++; |
| } |
| return countOfRegions; |
| } |
| |
| private void assertRegionIsBackOnline(final HRegionInfo hri) |
| throws IOException { |
| // Region should have an entry in its startkey because of addToEachStartKey. |
| byte [] row = getStartKey(hri); |
| HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME); |
| Get g = new Get(row); |
| assertTrue((t.get(g)).size() > 0); |
| } |
| |
| /* |
| * @return Count of regions in meta table. |
| * @throws IOException |
| */ |
| /* |
| private static int countOfMetaRegions() |
| throws IOException { |
| HTable meta = new HTable(TEST_UTIL.getConfiguration(), |
| HConstants.META_TABLE_NAME); |
| int rows = 0; |
| Scan scan = new Scan(); |
| scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER); |
| ResultScanner s = meta.getScanner(scan); |
| for (Result r = null; (r = s.next()) != null;) { |
| byte [] b = |
| r.getValue(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER); |
| if (b == null || b.length <= 0) break; |
| rows++; |
| } |
| s.close(); |
| return rows; |
| } |
| */ |
| /* |
|  * Writes one row into TABLENAME for every region found while scanning the |
|  * meta table. The row key is the region's start key ('aaa' stands in for the |
|  * first region, whose start key is empty) and the stored value is the row |
|  * key itself. Asserts that the number of rows written equals 'expected'. |
|  * @param expected Number of rows this method is expected to write. |
|  * @return Number of rows written. |
|  * @throws IOException If scanning meta or writing to the table fails. |
|  */ |
| private static int addToEachStartKey(final int expected) throws IOException { |
|   HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME); |
|   try { |
|     HTable meta = new HTable(TEST_UTIL.getConfiguration(), |
|       HConstants.META_TABLE_NAME); |
|     try { |
|       int rows = 0; |
|       Scan scan = new Scan(); |
|       scan.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER); |
|       ResultScanner s = meta.getScanner(scan); |
|       try { |
|         for (Result r = null; (r = s.next()) != null;) { |
|           HRegionInfo hri = HRegionInfo.getHRegionInfo(r); |
|           // Stop at the first meta row that carries no region info. |
|           if (hri == null) break; |
| |
|           // An empty start key is replaced with 'aaa' (see getStartKey). |
|           byte [] row = getStartKey(hri); |
|           Put p = new Put(row); |
|           p.setWriteToWAL(false); |
|           p.add(getTestFamily(), getTestQualifier(), row); |
|           t.put(p); |
|           rows++; |
|         } |
|       } finally { |
|         // Always release the scanner, even if a put fails mid-scan. |
|         s.close(); |
|       } |
|       Assert.assertEquals(expected, rows); |
|       return rows; |
|     } finally { |
|       // Runs on the failed-assertion path too, so the tables are not leaked. |
|       meta.close(); |
|     } |
|   } finally { |
|     t.close(); |
|   } |
| } |
| |
| /* |
|  * Counts the rows in TABLENAME with an unrestricted scan and logs the total. |
|  * @return Count of rows in TABLENAME |
|  * @throws IOException If opening or scanning the table fails. |
|  */ |
| private static int count() throws IOException { |
|   HTable t = new HTable(TEST_UTIL.getConfiguration(), TABLENAME); |
|   try { |
|     int rows = 0; |
|     ResultScanner s = t.getScanner(new Scan()); |
|     try { |
|       // Only the number of results matters, so each result is discarded. |
|       while (s.next() != null) { |
|         rows++; |
|       } |
|     } finally { |
|       // Release the scanner even if the scan fails part-way through. |
|       s.close(); |
|     } |
|     LOG.info("Counted=" + rows); |
|     return rows; |
|   } finally { |
|     t.close(); |
|   } |
| } |
| |
| /* |
| * @param hri |
| * @return Start key for hri (If start key is '', then return 'aaa'. |
| */ |
| private static byte [] getStartKey(final HRegionInfo hri) { |
| return Bytes.equals(HConstants.EMPTY_START_ROW, hri.getStartKey())? |
| Bytes.toBytes("aaa"): hri.getStartKey(); |
| } |
| |
| private static byte [] getTestFamily() { |
| return FAMILIES[0]; |
| } |
| |
| private static byte [] getTestQualifier() { |
| return getTestFamily(); |
| } |
| |
| // JUnit rule registered for the tests in this class. |
| // NOTE(review): presumably ResourceCheckerJUnitRule checks for resources leaked |
| // by each test -- confirm against that class. |
| @org.junit.Rule |
| public org.apache.hadoop.hbase.ResourceCheckerJUnitRule cu = |
| new org.apache.hadoop.hbase.ResourceCheckerJUnitRule(); |
| } |
| |