blob: e03a0d5c4964578be4362edf3561ef7d759c7dbc [file] [log] [blame]
/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.util;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.replication.ReplicationAdmin;
import org.apache.hadoop.hbase.coprocessor.CoprocessorHost;
import org.apache.hadoop.hbase.io.hfile.TestHFile;
import org.apache.hadoop.hbase.master.AssignmentManager;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.RegionStates;
import org.apache.hadoop.hbase.master.TableLockManager;
import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.HRegionServer;
import org.apache.hadoop.hbase.regionserver.SplitTransactionImpl;
import org.apache.hadoop.hbase.regionserver.SplitTransactionFactory;
import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
import org.apache.hadoop.hbase.replication.ReplicationFactory;
import org.apache.hadoop.hbase.replication.ReplicationPeerConfig;
import org.apache.hadoop.hbase.replication.ReplicationQueues;
import org.apache.hadoop.hbase.replication.ReplicationQueuesArguments;
import org.apache.hadoop.hbase.testclassification.LargeTests;
import org.apache.hadoop.hbase.testclassification.MiscTests;
import org.apache.hadoop.hbase.util.hbck.HFileCorruptionChecker;
import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import static org.apache.hadoop.hbase.util.hbck.HbckTestingUtil.*;
import static org.junit.Assert.*;
@Category({MiscTests.class, LargeTests.class})
public class TestHBaseFsckOneRS extends BaseTestHBaseFsck {
@BeforeClass
/**
 * Boots a single-regionserver mini cluster shared by all tests in this class and
 * wires up the shared fixtures declared on {@link BaseTestHBaseFsck}
 * (tableExecutorService, hbfsckExecutorService, regionStates, connection, admin).
 * Statement order matters: configuration must be set before startMiniCluster(),
 * and the balancer is disabled so hbck sees stable region assignments.
 */
public static void setUpBeforeClass() throws Exception {
  // Install the observer that lets the base class wait on table create/delete.
  TEST_UTIL.getConfiguration().set(CoprocessorHost.MASTER_COPROCESSOR_CONF_KEY,
      MasterSyncObserver.class.getName());
  conf.setInt("hbase.regionserver.handler.count", 2);
  conf.setInt("hbase.regionserver.metahandler.count", 30);
  conf.setInt("hbase.htable.threads.max", POOL_SIZE);
  conf.setInt("hbase.hconnection.threads.max", 2 * POOL_SIZE);
  // Generous close/RPC timeouts so slow CI machines don't produce flaky failures.
  conf.setInt("hbase.hbck.close.timeout", 2 * REGION_ONLINE_TIMEOUT);
  conf.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 8 * REGION_ONLINE_TIMEOUT);
  TEST_UTIL.startMiniCluster(1);
  // Daemon threads so a leaked pool cannot keep the JVM alive after the run.
  tableExecutorService = new ThreadPoolExecutor(1, POOL_SIZE, 60, TimeUnit.SECONDS,
      new SynchronousQueue<Runnable>(), Threads.newDaemonThreadFactory("testhbck"));
  hbfsckExecutorService = new ScheduledThreadPoolExecutor(POOL_SIZE);
  AssignmentManager assignmentManager =
      TEST_UTIL.getHBaseCluster().getMaster().getAssignmentManager();
  regionStates = assignmentManager.getRegionStates();
  connection = (ClusterConnection) TEST_UTIL.getConnection();
  admin = connection.getAdmin();
  // Balancer off: tests deliberately create broken layouts that must stay put.
  admin.setBalancerRunning(false, true);
  TEST_UTIL.waitUntilAllRegionsAssigned(TableName.META_TABLE_NAME);
  TEST_UTIL.waitUntilAllRegionsAssigned(TableName.NAMESPACE_TABLE_NAME);
}
@AfterClass
/**
 * Releases the class-level fixtures in reverse order of acquisition:
 * executor pools first, then the shared Admin, then the mini cluster itself.
 */
public static void tearDownAfterClass() throws Exception {
  tableExecutorService.shutdown();
  hbfsckExecutorService.shutdown();
  admin.close();
  TEST_UTIL.shutdownMiniCluster();
}
@Before
/**
 * Resets any injected EnvironmentEdge so each test observes real wall-clock time,
 * regardless of what a previous test may have installed.
 */
public void setUp() {
  EnvironmentEdgeManager.reset();
}
/**
 * Sanity check: a freshly created, healthy table must produce no hbck errors
 * and no overlap groups.
 */
@Test(timeout=180000)
public void testHBaseFsckClean() throws Exception {
  assertNoErrors(doFsck(conf, false));
  TableName tableName = TableName.valueOf("tableClean");
  try {
    // Cluster is clean before we touch anything.
    assertNoErrors(doFsck(conf, false));
    setupTable(tableName);
    assertEquals(ROWKEYS.length, countRows());
    // Creating one well-formed table must keep hbck happy.
    HBaseFsck fsck = doFsck(conf, false);
    assertNoErrors(fsck);
    assertEquals(0, fsck.getOverlapGroups(tableName).size());
    assertEquals(ROWKEYS.length, countRows());
  } finally {
    cleanupTable(tableName);
  }
}
/**
 * Runs hbck with a thread pool smaller than the region count and verifies it
 * completes without a RejectedExecutionException.
 */
@Test (timeout=180000)
public void testHbckThreadpooling() throws Exception {
  TableName tableName = TableName.valueOf("tableDupeStartKey");
  try {
    // The helper creates a table spanning 4 regions.
    setupTable(tableName);
    // Force hbck down to a single worker thread.
    Configuration singleThreadConf = new Configuration(conf);
    singleThreadConf.setInt("hbasefsck.numthreads", 1);
    // Must pass without triggering a RejectedExecutionException.
    assertNoErrors(doFsck(singleThreadConf, false));
  } finally {
    cleanupTable(tableName);
  }
}
@Test (timeout=180000)
/**
 * A table whose only region has lost its meta/hdfs presence must be reported
 * as NOT_IN_HDFS and then be repairable. See also testNoHdfsTable().
 */
public void testTableWithNoRegions() throws Exception {
  // We might end up with empty regions in a table
  // see also testNoHdfsTable()
  TableName table =
      TableName.valueOf(name.getMethodName());
  try {
    // create table with one region
    HTableDescriptor desc = new HTableDescriptor(table);
    HColumnDescriptor hcd = new HColumnDescriptor(Bytes.toString(FAM));
    desc.addFamily(hcd); // If a table has no CF's it doesn't get checked
    createTable(TEST_UTIL, desc, null);
    tbl = connection.getTable(table, tableExecutorService);
    // Mess it up by leaving a hole in the assignment, meta, and hdfs data
    deleteRegion(conf, tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW,
        HConstants.EMPTY_END_ROW, false, false, true);
    HBaseFsck hbck = doFsck(conf, false);
    assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] { HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_HDFS });
    // NOTE(review): repair is run twice; presumably the first pass leaves work
    // for a second pass — confirm whether one invocation suffices.
    doFsck(conf, true);
    // fix hole
    doFsck(conf, true);
    // check that hole fixed
    assertNoErrors(doFsck(conf, false));
  } finally {
    cleanupTable(table);
  }
}
@Test (timeout=180000)
/**
 * Verifies hbck can regenerate a missing .tableinfo file, both from the default
 * descriptor and from the descriptor cached on the master.
 *
 * Fix: the finally block previously called
 * fs.rename(new Path("/.tableinfo"), tableinfo) whenever fs was non-null, even
 * if the test failed before tableinfo was assigned — the resulting NPE in
 * finally would mask the original test failure. Now guarded on tableinfo != null.
 */
public void testHbckFixOrphanTable() throws Exception {
  TableName table = TableName.valueOf("tableInfo");
  FileSystem fs = null;
  Path tableinfo = null;
  try {
    setupTable(table);
    Path hbaseTableDir = FSUtils.getTableDir(
        FSUtils.getRootDir(conf), table);
    fs = hbaseTableDir.getFileSystem(conf);
    FileStatus status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
    tableinfo = status.getPath();
    // Hide .tableinfo so hbck reports it missing.
    fs.rename(tableinfo, new Path("/.tableinfo"));
    // to report error if .tableinfo is missing.
    HBaseFsck hbck = doFsck(conf, false);
    assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] { HBaseFsck.ErrorReporter.ERROR_CODE.NO_TABLEINFO_FILE });
    // fix OrphanTable with default .tableinfo (htd not yet cached on master)
    hbck = doFsck(conf, true);
    assertNoErrors(hbck);
    status = null;
    status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
    assertNotNull(status);
    // Mark the descriptor as non-default so we can tell which copy hbck restores.
    HTableDescriptor htd = admin.getTableDescriptor(table);
    htd.setValue("NOT_DEFAULT", "true");
    admin.disableTable(table);
    admin.modifyTable(table, htd);
    admin.enableTable(table);
    fs.delete(status.getPath(), true);
    // fix OrphanTable with cache
    htd = admin.getTableDescriptor(table); // warms up cached htd on master
    hbck = doFsck(conf, true);
    assertNoErrors(hbck);
    status = FSTableDescriptors.getTableInfoPath(fs, hbaseTableDir);
    assertNotNull(status);
    // The cached (non-default) descriptor must have been restored.
    htd = admin.getTableDescriptor(table);
    assertEquals(htd.getValue("NOT_DEFAULT"), "true");
  } finally {
    // Only attempt the restore if the file was actually moved; an unguarded
    // rename here would NPE and mask any earlier failure.
    if (fs != null && tableinfo != null) {
      fs.rename(new Path("/.tableinfo"), tableinfo);
    }
    cleanupTable(table);
  }
}
@Test (timeout=180000)
/**
 * shouldIgnorePreCheckPermission must default to true for read-only runs,
 * default to false for fix runs, and be overridable via the setter.
 */
public void testReadOnlyProperty() throws Exception {
  // Read-only run: permission pre-check is skipped by default.
  HBaseFsck fsck = doFsck(conf, false);
  assertTrue("shouldIgnorePreCheckPermission", fsck.shouldIgnorePreCheckPermission());
  // Fix run: the pre-check applies by default.
  fsck = doFsck(conf, true);
  assertFalse("shouldIgnorePreCheckPermission", fsck.shouldIgnorePreCheckPermission());
  // Fix run with explicit override: the pre-check is skipped again.
  fsck = doFsck(conf, true);
  fsck.setIgnorePreCheckPermission(true);
  assertTrue("shouldIgnorePreCheckPermission", fsck.shouldIgnorePreCheckPermission());
}
/**
 * This creates and fixes a bad table where a region is completely contained
 * by another region, and there is a hole (sort of like a bad split).
 * Expected diagnosis: ORPHAN_HDFS_REGION + NOT_IN_META_OR_DEPLOYED +
 * HOLE_IN_REGION_CHAIN, all repairable by a fix run.
 */
@Test (timeout=180000)
public void testOverlapAndOrphan() throws Exception {
  TableName table =
      TableName.valueOf("tableOverlapAndOrphan");
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // Mess it up by creating an overlap in the metadata
    admin.disableTable(table);
    // Remove the [A,B) region from meta (leaving its hdfs data) ...
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("B"), true,
        true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
    admin.enableTable(table);
    // ... then deploy an overlapping region [A2,B) in its place.
    HRegionInfo hriOverlap =
        createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
    TEST_UTIL.assignRegion(hriOverlap);
    ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
    TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
    HBaseFsck hbck = doFsck(conf, false);
    assertErrors(hbck,
        new HBaseFsck.ErrorReporter.ERROR_CODE[] { HBaseFsck.ErrorReporter.ERROR_CODE.ORPHAN_HDFS_REGION, HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
            HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    // fix the problem.
    doFsck(conf, true);
    // verify that overlaps are fixed
    HBaseFsck hbck2 = doFsck(conf,false);
    assertNoErrors(hbck2);
    assertEquals(0, hbck2.getOverlapGroups(table).size());
    assertEquals(ROWKEYS.length, countRows());
  } finally {
    cleanupTable(table);
  }
}
/**
 * This creates and fixes a bad table where a region overlaps two regions --
 * a start key contained in another region and its end key is contained in
 * yet another region. The injected region [A2,B2) straddles the existing
 * [A,B) and [B,C) boundaries.
 */
@Test (timeout=180000)
public void testCoveredStartKey() throws Exception {
  TableName table =
      TableName.valueOf("tableCoveredStartKey");
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // Mess it up by creating an overlap in the metadata
    HRegionInfo hriOverlap =
        createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B2"));
    TEST_UTIL.assignRegion(hriOverlap);
    ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
    TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
    HBaseFsck hbck = doFsck(conf, false);
    // One overlap error per boundary the injected region crosses.
    assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] { HBaseFsck.ErrorReporter.ERROR_CODE.OVERLAP_IN_REGION_CHAIN,
        HBaseFsck.ErrorReporter.ERROR_CODE.OVERLAP_IN_REGION_CHAIN });
    assertEquals(3, hbck.getOverlapGroups(table).size());
    assertEquals(ROWKEYS.length, countRows());
    // fix the problem.
    doFsck(conf, true);
    // verify that overlaps are fixed
    HBaseFsck hbck2 = doFsck(conf, false);
    assertErrors(hbck2, new HBaseFsck.ErrorReporter.ERROR_CODE[0]);
    assertEquals(0, hbck2.getOverlapGroups(table).size());
    assertEquals(ROWKEYS.length, countRows());
  } finally {
    cleanupTable(table);
  }
}
/**
 * This creates and fixes a bad table with a missing region -- hole in meta
 * and data missing in the fs. Deleting [B,C) everywhere leaves a pure hole,
 * which hbck plugs with a fresh empty region (hence two fewer rows after fix).
 */
@Test (timeout=180000)
public void testRegionHole() throws Exception {
  TableName table =
      TableName.valueOf("tableRegionHole");
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // Mess it up by leaving a hole in the assignment, meta, and hdfs data
    admin.disableTable(table);
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("C"), true,
        true, true);
    admin.enableTable(table);
    HBaseFsck hbck = doFsck(conf, false);
    assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] { HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    // holes are separate from overlap groups
    assertEquals(0, hbck.getOverlapGroups(table).size());
    // fix hole
    doFsck(conf, true);
    // check that hole fixed
    assertNoErrors(doFsck(conf,false));
    assertEquals(ROWKEYS.length - 2, countRows()); // lost a region so lost a row
  } finally {
    cleanupTable(table);
  }
}
/**
 * The region is not deployed when the table is disabled. This test force-opens
 * a region of a disabled table directly on a regionserver (bypassing the
 * master) and expects hbck to flag SHOULD_NOT_BE_DEPLOYED and fix it.
 */
@Test (timeout=180000)
public void testRegionShouldNotBeDeployed() throws Exception {
  TableName table =
      TableName.valueOf("tableRegionShouldNotBeDeployed");
  try {
    LOG.info("Starting testRegionShouldNotBeDeployed.");
    MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
    assertTrue(cluster.waitForActiveAndReadyMaster());
    byte[][] SPLIT_KEYS = new byte[][] { new byte[0], Bytes.toBytes("aaa"),
        Bytes.toBytes("bbb"), Bytes.toBytes("ccc"), Bytes.toBytes("ddd") };
    HTableDescriptor htdDisabled = new HTableDescriptor(table);
    htdDisabled.addFamily(new HColumnDescriptor(FAM));
    // Write the .tableinfo
    FSTableDescriptors fstd = new FSTableDescriptors(conf);
    fstd.createTableDescriptor(htdDisabled);
    List<HRegionInfo> disabledRegions =
        TEST_UTIL.createMultiRegionsInMeta(conf, htdDisabled, SPLIT_KEYS);
    // Let's just assign everything to first RS
    HRegionServer hrs = cluster.getRegionServer(0);
    // Create region files.
    admin.disableTable(table);
    admin.enableTable(table);
    // Disable the table and close its regions
    admin.disableTable(table);
    HRegionInfo region = disabledRegions.remove(0);
    byte[] regionName = region.getRegionName();
    // The region should not be assigned currently
    assertTrue(cluster.getServerWith(regionName) == -1);
    // Directly open a region on a region server.
    // If going through AM/ZK, the region won't be open.
    // Even it is opened, AM will close it which causes
    // flakiness of this test.
    HRegion r = HRegion.openHRegion(
        region, htdDisabled, hrs.getWAL(region), conf);
    hrs.addToOnlineRegions(r);
    HBaseFsck hbck = doFsck(conf, false);
    assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] { HBaseFsck.ErrorReporter.ERROR_CODE.SHOULD_NOT_BE_DEPLOYED });
    // fix this fault
    doFsck(conf, true);
    // check result
    assertNoErrors(doFsck(conf, false));
  } finally {
    admin.enableTable(table);
    cleanupTable(table);
  }
}
/**
 * This test makes sure that parallel instances of Hbck are disabled: two
 * concurrent fix runs race for the lock file and exactly one must win.
 *
 * Fixes: (1) the bare {@code assert(...)} statements were no-ops unless the
 * JVM ran with -ea, letting the test pass vacuously — replaced with JUnit
 * assertTrue; (2) {@code e.getMessage()} can be null, which made the catch
 * block itself throw NPE — now null-safe.
 *
 * @throws Exception
 */
@Test(timeout=180000)
public void testParallelHbck() throws Exception {
  final ExecutorService service;
  final Future<HBaseFsck> hbck1,hbck2;
  class RunHbck implements Callable<HBaseFsck> {
    boolean fail = true;
    @Override
    public HBaseFsck call(){
      Configuration c = new Configuration(conf);
      c.setInt("hbase.hbck.lockfile.attempts", 1);
      // HBASE-13574 found that in HADOOP-2.6 and later, the create file would internally retry.
      // To avoid flakiness of the test, set low max wait time.
      c.setInt("hbase.hbck.lockfile.maxwaittime", 3);
      try{
        return doFsck(c, true); // Exclusive hbck only when fixing
      } catch(Exception e){
        // The loser is expected to fail with a "Duplicate hbck" message.
        if (e.getMessage() != null && e.getMessage().contains("Duplicate hbck")) {
          fail = false;
        }
      }
      // If we reach here, then an exception was caught
      if (fail) fail();
      return null;
    }
  }
  service = Executors.newFixedThreadPool(2);
  hbck1 = service.submit(new RunHbck());
  hbck2 = service.submit(new RunHbck());
  service.shutdown();
  //wait for 15 seconds, for both hbck calls finish
  service.awaitTermination(15, TimeUnit.SECONDS);
  HBaseFsck h1 = hbck1.get();
  HBaseFsck h2 = hbck2.get();
  // Make sure only one of the calls was successful
  assertTrue("Exactly one hbck should acquire the lock", h1 == null || h2 == null);
  if (h1 != null) {
    assertTrue(h1.getRetCode() >= 0);
  }
  if (h2 != null) {
    assertTrue(h2.getRetCode() >= 0);
  }
}
/**
 * This test makes sure that with enough retries both parallel instances
 * of hbck will be completed successfully (read-only runs do not hold the
 * exclusive lock forever, so the second instance eventually gets through).
 *
 * Fix: the bare {@code assert(...)} statements only execute with -ea and
 * could pass vacuously — replaced with JUnit assertTrue.
 *
 * @throws Exception
 */
@Test (timeout=180000)
public void testParallelWithRetriesHbck() throws Exception {
  final ExecutorService service;
  final Future<HBaseFsck> hbck1,hbck2;
  // With the ExponentialBackoffPolicyWithLimit (starting with 200 milliseconds sleep time, and
  // max sleep time of 5 seconds), we can retry around 15 times within 80 seconds before bail out.
  //
  // Note: the reason to use 80 seconds is that in HADOOP-2.6 and later, the create file would
  // retry up to HdfsConstants.LEASE_SOFTLIMIT_PERIOD (60 seconds). See HBASE-13574 for more
  // details.
  final int timeoutInSeconds = 80;
  final int sleepIntervalInMilliseconds = 200;
  final int maxSleepTimeInMilliseconds = 6000;
  final int maxRetryAttempts = 15;
  class RunHbck implements Callable<HBaseFsck>{
    @Override
    public HBaseFsck call() throws Exception {
      // Increase retry attempts to make sure the non-active hbck doesn't get starved
      Configuration c = new Configuration(conf);
      c.setInt("hbase.hbck.lockfile.maxwaittime", timeoutInSeconds);
      c.setInt("hbase.hbck.lockfile.attempt.sleep.interval", sleepIntervalInMilliseconds);
      c.setInt("hbase.hbck.lockfile.attempt.maxsleeptime", maxSleepTimeInMilliseconds);
      c.setInt("hbase.hbck.lockfile.attempts", maxRetryAttempts);
      return doFsck(c, false);
    }
  }
  service = Executors.newFixedThreadPool(2);
  hbck1 = service.submit(new RunHbck());
  hbck2 = service.submit(new RunHbck());
  service.shutdown();
  //wait for some time, for both hbck calls finish
  service.awaitTermination(timeoutInSeconds * 2, TimeUnit.SECONDS);
  HBaseFsck h1 = hbck1.get();
  HBaseFsck h2 = hbck2.get();
  // Both should be successful
  assertNotNull(h1);
  assertNotNull(h2);
  assertTrue(h1.getRetCode() >= 0);
  assertTrue(h2.getRetCode() >= 0);
}
@Test (timeout = 180000)
/**
 * checkRegionBoundaries() must not trip over an invalid DFS filename when
 * scanning table directories on a clean cluster.
 */
public void testRegionBoundariesCheck() throws Exception {
  HBaseFsck fsck = doFsck(conf, false);
  // Cluster must be clean before the boundary scan.
  assertNoErrors(fsck);
  try {
    // A live connection is required for META access.
    fsck.connect();
    fsck.checkRegionBoundaries();
  } catch (IllegalArgumentException e) {
    boolean badDfsName = e.getMessage().endsWith("not a valid DFS filename.");
    if (badDfsName) {
      fail("Table directory path is not valid." + e.getMessage());
    }
  } finally {
    fsck.close();
  }
}
/**
 * test region boundaries and make sure store file had been created.
 * Same as testRegionBoundariesCheck but with a real table flushed to disk,
 * so the boundary scan actually has store files to examine.
 * @throws Exception
 */
@Test(timeout = 180000)
public void testRegionBoundariesCheckWithFlushTable() throws Exception {
  HBaseFsck hbck = doFsck(conf, false);
  assertNoErrors(hbck); // no errors
  TableName table = TableName.valueOf("testRegionBoundariesCheckWithFlushTable");
  try {
    setupTable(table);
    // Flush so store files exist on disk for the boundary check to read.
    admin.flush(table);
    hbck.connect(); // need connection to have access to META
    hbck.checkRegionBoundaries();
    assertNoErrors(hbck); // no errors
  } catch (IllegalArgumentException e) {
    if (e.getMessage().endsWith("not a valid DFS filename.")) {
      fail("Table directory path is not valid." + e.getMessage());
    }
  } finally {
    hbck.close();
  }
}
@Test (timeout=180000)
/**
 * After an admin-initiated region merge (with CatalogJanitor disabled so the
 * merged parents linger), hbck must report no errors and no rows may be lost.
 *
 * Fix: removed the dead local {@code Table meta} — it was declared, never
 * assigned, and only ever passed (always null) to IOUtils.closeQuietly.
 */
public void testHbckAfterRegionMerge() throws Exception {
  TableName table = TableName.valueOf("testMergeRegionFilesInHdfs");
  try {
    // disable CatalogJanitor so merged parent regions are not cleaned up mid-test
    TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(false);
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    try(RegionLocator rl = connection.getRegionLocator(tbl.getName())) {
      // make sure data in regions, if in wal only there is no data loss
      admin.flush(table);
      HRegionInfo region1 = rl.getRegionLocation(Bytes.toBytes("A")).getRegionInfo();
      HRegionInfo region2 = rl.getRegionLocation(Bytes.toBytes("B")).getRegionInfo();
      int regionCountBeforeMerge = rl.getAllRegionLocations().size();
      assertNotEquals(region1, region2);
      // do a region merge
      admin.mergeRegionsAsync(
          region1.getEncodedNameAsBytes(), region2.getEncodedNameAsBytes(), false);
      // wait (up to 30s) until the region count drops, i.e. the merge completed
      long timeout = System.currentTimeMillis() + 30 * 1000;
      while (true) {
        if (rl.getAllRegionLocations().size() < regionCountBeforeMerge) {
          break;
        } else if (System.currentTimeMillis() > timeout) {
          fail("Time out waiting on region " + region1.getEncodedName() + " and " + region2
              .getEncodedName() + " be merged");
        }
        Thread.sleep(10);
      }
      // no rows lost by the merge
      assertEquals(ROWKEYS.length, countRows());
      HBaseFsck hbck = doFsck(conf, false);
      assertNoErrors(hbck); // no errors
    }
  } finally {
    TEST_UTIL.getHBaseCluster().getMaster().setCatalogJanitorEnabled(true);
    cleanupTable(table);
  }
}
/**
 * This creates entries in hbase:meta with no hdfs data. This should cleanly
 * remove the table. No try/finally cleanup here: the repair itself is
 * expected to delete the table, as asserted at the end.
 */
@Test (timeout=180000)
public void testNoHdfsTable() throws Exception {
  TableName table = TableName.valueOf("NoHdfsTable");
  setupTable(table);
  assertEquals(ROWKEYS.length, countRows());
  // make sure data in regions, if in wal only there is no data loss
  admin.flush(table);
  // Mess it up by deleting hdfs dirs
  deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""),
      Bytes.toBytes("A"), false, false, true); // don't rm meta
  deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
      Bytes.toBytes("B"), false, false, true); // don't rm meta
  deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
      Bytes.toBytes("C"), false, false, true); // don't rm meta
  deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"),
      Bytes.toBytes(""), false, false, true); // don't rm meta
  // also remove the table directory in hdfs
  deleteTableDir(table);
  // Expect one NOT_IN_HDFS per region plus an orphaned table-state entry.
  HBaseFsck hbck = doFsck(conf, false);
  assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_HDFS,
      HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_HDFS, HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_HDFS,
      HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_HDFS, HBaseFsck.ErrorReporter.ERROR_CODE.ORPHAN_TABLE_STATE, });
  // holes are separate from overlap groups
  assertEquals(0, hbck.getOverlapGroups(table).size());
  // fix hole
  doFsck(conf, true); // detect dangling regions and remove those
  // check that hole fixed
  assertNoErrors(doFsck(conf,false));
  assertFalse("Table " + table + " should have been deleted", admin.tableExists(table));
}
/**
 * Deleting the hbase.version file must surface as NO_VERSION_FILE and be
 * repairable by a fix run.
 */
@Test (timeout=180000)
public void testNoVersionFile() throws Exception {
  // Remove hbase.version from the root directory.
  Path hbaseRoot = FSUtils.getRootDir(conf);
  FileSystem rootFs = hbaseRoot.getFileSystem(conf);
  rootFs.delete(new Path(hbaseRoot, HConstants.VERSION_FILE_NAME), true);
  // Read-only run detects the missing file.
  HBaseFsck fsck = doFsck(conf, false);
  assertErrors(fsck, new HBaseFsck.ErrorReporter.ERROR_CODE[] { HBaseFsck.ErrorReporter.ERROR_CODE.NO_VERSION_FILE });
  // Fix run recreates hbase.version ...
  doFsck(conf, true);
  // ... after which the cluster is clean again.
  assertNoErrors(doFsck(conf, false));
}
@Test (timeout=180000)
/**
 * Removing a table's state entry from hbase:meta must surface as
 * NO_TABLE_STATE; the fix run restores the table as enabled.
 */
public void testNoTableState() throws Exception {
  // delete the table-state entry from meta (not the hbase.version file)
  TableName table =
      TableName.valueOf("testNoTableState");
  try {
    setupTable(table);
    // make sure data in regions, if in wal only there is no data loss
    admin.flush(table);
    MetaTableAccessor.deleteTableState(TEST_UTIL.getConnection(), table);
    // test
    HBaseFsck hbck = doFsck(conf, false);
    assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] { HBaseFsck.ErrorReporter.ERROR_CODE.NO_TABLE_STATE });
    // fix table state missing
    doFsck(conf, true);
    assertNoErrors(doFsck(conf, false));
    // The repaired table must come back in the enabled state.
    assertTrue(TEST_UTIL.getHBaseAdmin().isTableEnabled(table));
  } finally {
    cleanupTable(table);
  }
}
/**
 * This creates two tables and mess both of them and fix them one by one.
 * Verifies that passing a table name to doFsck scopes the repair: fixing
 * table1 must leave table2's error in place until table2 is fixed too.
 */
@Test (timeout=180000)
public void testFixByTable() throws Exception {
  TableName table1 =
      TableName.valueOf("testFixByTable1");
  TableName table2 =
      TableName.valueOf("testFixByTable2");
  try {
    setupTable(table1);
    // make sure data in regions, if in wal only there is no data loss
    admin.flush(table1);
    // Mess them up by leaving a hole in the hdfs data
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
        Bytes.toBytes("C"), false, false, true); // don't rm meta
    setupTable(table2);
    // make sure data in regions, if in wal only there is no data loss
    admin.flush(table2);
    // Mess them up by leaving a hole in the hdfs data
    // (note: tbl now refers to table2, set by the second setupTable call)
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("C"), false,
        false, true); // don't rm meta
    HBaseFsck hbck = doFsck(conf, false);
    assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] { HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_HDFS, HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_HDFS });
    // fix hole in table 1
    doFsck(conf, true, table1);
    // check that hole in table 1 fixed
    assertNoErrors(doFsck(conf, false, table1));
    // check that hole in table 2 still there
    assertErrors(doFsck(conf, false, table2), new HBaseFsck.ErrorReporter.ERROR_CODE[] { HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_HDFS });
    // fix hole in table 2
    doFsck(conf, true, table2);
    // check that hole in both tables fixed
    assertNoErrors(doFsck(conf, false));
    assertEquals(ROWKEYS.length - 2, countRows());
  } finally {
    cleanupTable(table1);
    cleanupTable(table2);
  }
}
/**
 * A split parent in meta, in hdfs, and not deployed.
 * Fakes a lingering split parent by deleting a region from meta and re-adding
 * it as an offline/split parent with two fabricated daughters. Regular repair
 * must NOT touch it; only an explicit -fixSplitParents run may reset it.
 */
@Test (timeout=180000)
public void testLingeringSplitParent() throws Exception {
  TableName table =
      TableName.valueOf("testLingeringSplitParent");
  Table meta = null;
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // make sure data in regions, if in wal only there is no data loss
    admin.flush(table);
    HRegionLocation location;
    try(RegionLocator rl = connection.getRegionLocator(tbl.getName())) {
      location = rl.getRegionLocation(Bytes.toBytes("B"));
    }
    // Delete one region from meta, but not hdfs, unassign it.
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"),
        Bytes.toBytes("C"), true, true, false);
    // Create a new meta entry to fake it as a split parent.
    meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
    HRegionInfo hri = location.getRegionInfo();
    // Fabricated daughters [B,BM) and [BM,C) that do not exist on disk.
    HRegionInfo a = new HRegionInfo(tbl.getName(),
        Bytes.toBytes("B"), Bytes.toBytes("BM"));
    HRegionInfo b = new HRegionInfo(tbl.getName(),
        Bytes.toBytes("BM"), Bytes.toBytes("C"));
    hri.setOffline(true);
    hri.setSplit(true);
    MetaTableAccessor.addRegionToMeta(meta, hri, a, b);
    // NOTE(review): meta is closed here but read again below via meta.get();
    // presumably the Table impl tolerates this — confirm.
    meta.close();
    admin.flush(TableName.META_TABLE_NAME);
    HBaseFsck hbck = doFsck(conf, false);
    assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.LINGERING_SPLIT_PARENT, HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN});
    // regular repair cannot fix lingering split parent
    hbck = doFsck(conf, true);
    assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.LINGERING_SPLIT_PARENT, HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    assertFalse(hbck.shouldRerun());
    hbck = doFsck(conf, false);
    assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.LINGERING_SPLIT_PARENT, HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN});
    // fix lingering split parent
    hbck = new HBaseFsck(conf, hbfsckExecutorService);
    hbck.connect();
    HBaseFsck.setDisplayFullReport(); // i.e. -details
    hbck.setTimeLag(0);
    hbck.setFixSplitParents(true);
    hbck.onlineHbck();
    assertTrue(hbck.shouldRerun());
    hbck.close();
    // The SPLITA/SPLITB qualifiers must have been cleared from the parent row.
    Get get = new Get(hri.getRegionName());
    Result result = meta.get(get);
    assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
        HConstants.SPLITA_QUALIFIER).isEmpty());
    assertTrue(result.getColumnCells(HConstants.CATALOG_FAMILY,
        HConstants.SPLITB_QUALIFIER).isEmpty());
    admin.flush(TableName.META_TABLE_NAME);
    // fix other issues
    doFsck(conf, true);
    // check that all are fixed
    assertNoErrors(doFsck(conf, false));
    assertEquals(ROWKEYS.length, countRows());
  } finally {
    cleanupTable(table);
    IOUtils.closeQuietly(meta);
  }
}
/**
 * Tests that LINGERING_SPLIT_PARENT is not erroneously reported for
 * valid cases where the daughters are there.
 */
@Test (timeout=180000)
public void testValidLingeringSplitParent() throws Exception {
  // NOTE(review): table name duplicates testLingeringSplitParent's — presumably
  // harmless since each test cleans up, but confirm it's not a copy-paste slip.
  TableName table =
      TableName.valueOf("testLingeringSplitParent");
  Table meta = null;
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // make sure data in regions, if in wal only there is no data loss
    admin.flush(table);
    try(RegionLocator rl = connection.getRegionLocator(tbl.getName())) {
      HRegionLocation location = rl.getRegionLocation(Bytes.toBytes("B"));
      meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
      HRegionInfo hri = location.getRegionInfo();
      // do a regular split
      byte[] regionName = location.getRegionInfo().getRegionName();
      admin.splitRegion(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
      TestEndToEndSplitTransaction.blockUntilRegionSplit(conf, 60000, regionName, true);
      // TODO: fixHdfsHoles does not work against splits, since the parent dir lingers on
      // for some time until children references are deleted. HBCK erroneously sees this as
      // overlapping regions
      HBaseFsck hbck = doFsck(conf, true, true, false, false, false, true, true, true, false,
          false, false, null);
      assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {}); //no LINGERING_SPLIT_PARENT reported
      // assert that the split hbase:meta entry is still there.
      Get get = new Get(hri.getRegionName());
      Result result = meta.get(get);
      assertNotNull(result);
      assertNotNull(MetaTableAccessor.getHRegionInfo(result));
      assertEquals(ROWKEYS.length, countRows());
      // assert that we still have the split regions
      assertEquals(rl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions
      // pre-split.
      assertNoErrors(doFsck(conf, false));
    }
  } finally {
    cleanupTable(table);
    IOUtils.closeQuietly(meta);
  }
}
/**
 * Split crashed after write to hbase:meta finished for the parent region, but
 * failed to write daughters (pre HBASE-7721 codebase). Simulated by doing a
 * real split, then deleting the daughter rows from meta and undeploying them.
 * The fix must re-add the daughters to META rather than revert the split.
 */
@Test(timeout=75000)
public void testSplitDaughtersNotInMeta() throws Exception {
  TableName table = TableName.valueOf("testSplitdaughtersNotInMeta");
  Table meta = connection.getTable(TableName.META_TABLE_NAME, tableExecutorService);
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // make sure data in regions, if in wal only there is no data loss
    admin.flush(table);
    try(RegionLocator rl = connection.getRegionLocator(tbl.getName())) {
      HRegionLocation location = rl.getRegionLocation(Bytes.toBytes("B"));
      HRegionInfo hri = location.getRegionInfo();
      // Disable CatalogJanitor to prevent it from cleaning up the parent region
      // after split.
      admin.enableCatalogJanitor(false);
      // do a regular split
      byte[] regionName = location.getRegionInfo().getRegionName();
      admin.splitRegion(location.getRegionInfo().getRegionName(), Bytes.toBytes("BM"));
      TestEndToEndSplitTransaction.blockUntilRegionSplit(conf, 60000, regionName, true);
      PairOfSameType<HRegionInfo> daughters = MetaTableAccessor.getDaughterRegions(
          meta.get(new Get(regionName)));
      // Delete daughter regions from meta, but not hdfs, unassign it.
      ServerName firstSN =
          rl.getRegionLocation(daughters.getFirst().getStartKey()).getServerName();
      ServerName secondSN =
          rl.getRegionLocation(daughters.getSecond().getStartKey()).getServerName();
      undeployRegion(connection, firstSN, daughters.getFirst());
      undeployRegion(connection, secondSN, daughters.getSecond());
      List<Delete> deletes = new ArrayList<>();
      deletes.add(new Delete(daughters.getFirst().getRegionName()));
      deletes.add(new Delete(daughters.getSecond().getRegionName()));
      meta.delete(deletes);
      // Remove daughters from regionStates
      RegionStates regionStates = TEST_UTIL.getMiniHBaseCluster().getMaster().
          getAssignmentManager().getRegionStates();
      regionStates.deleteRegion(daughters.getFirst());
      regionStates.deleteRegion(daughters.getSecond());
      HBaseFsck hbck = doFsck(conf, false);
      assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
          HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
          HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
          HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN }); //no LINGERING_SPLIT_PARENT
      // now fix it. The fix should not revert the region split, but add daughters to META
      hbck = doFsck(conf, true, true, false, false, false, false, false, false, false,
          false, false, null);
      // Errors are still reported on this pass; the actual repair lands afterwards.
      assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
          HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
          HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
          HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
      // assert that the split hbase:meta entry is still there.
      Get get = new Get(hri.getRegionName());
      Result result = meta.get(get);
      assertNotNull(result);
      assertNotNull(MetaTableAccessor.getHRegionInfo(result));
      assertEquals(ROWKEYS.length, countRows());
      // assert that we still have the split regions
      assertEquals(rl.getStartKeys().length, SPLITS.length + 1 + 1); //SPLITS + 1 is # regions
      // pre-split.
      assertNoErrors(doFsck(conf, false)); //should be fixed by now
    }
  } finally {
    admin.enableCatalogJanitor(true);
    meta.close();
    cleanupTable(table);
  }
}
/**
 * Removes the table's first region (empty start key .. "A") from the
 * assignment, hbase:meta and HDFS, then verifies hbck reports
 * FIRST_REGION_STARTKEY_NOT_EMPTY and that a repair run fixes the hole.
 */
@Test(timeout=120000)
public void testMissingFirstRegion() throws Exception {
  final TableName table = TableName.valueOf("testMissingFirstRegion");
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // Knock out the first region everywhere: unassign it and remove it from
    // both hbase:meta and the filesystem.
    admin.disableTable(table);
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes(""), Bytes.toBytes("A"), true,
        true, true);
    admin.enableTable(table);
    // A read-only run must flag that the region chain no longer starts at "".
    HBaseFsck report = doFsck(conf, false);
    assertErrors(report, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.FIRST_REGION_STARTKEY_NOT_EMPTY });
    // Repair, then confirm a clean report.
    doFsck(conf, true);
    assertNoErrors(doFsck(conf, false));
  } finally {
    cleanupTable(table);
  }
}
/**
 * Deletes only the HDFS directory of a deployed region (meta and deployment
 * untouched), then verifies hbck reports NOT_IN_HDFS and can repair it.
 */
@Test(timeout=120000)
public void testRegionDeployedNotInHdfs() throws Exception {
  final TableName table = TableName.valueOf("testSingleRegionDeployedNotInHdfs");
  try {
    setupTable(table);
    admin.flush(table);
    // Remove the region's directory from HDFS only.
    deleteRegion(conf, tbl.getTableDescriptor(), HConstants.EMPTY_START_ROW,
        Bytes.toBytes("A"), false, false, true);
    HBaseFsck report = doFsck(conf, false);
    assertErrors(report, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_HDFS });
    // Repair, then verify the report comes back clean.
    doFsck(conf, true);
    assertNoErrors(doFsck(conf, false));
  } finally {
    cleanupTable(table);
  }
}
/**
 * Removes the table's last region ("C" .. empty end key) from the assignment,
 * hbase:meta and HDFS, then verifies hbck reports
 * LAST_REGION_ENDKEY_NOT_EMPTY and that a repair run fixes the hole.
 */
@Test(timeout=120000)
public void testMissingLastRegion() throws Exception {
  final TableName table = TableName.valueOf("testMissingLastRegion");
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // Remove the last region from every layer: assignment, meta, and hdfs.
    admin.disableTable(table);
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("C"), Bytes.toBytes(""), true,
        true, true);
    admin.enableTable(table);
    HBaseFsck report = doFsck(conf, false);
    assertErrors(report, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.LAST_REGION_ENDKEY_NOT_EMPTY });
    // Repair and re-check.
    doFsck(conf, true);
    assertNoErrors(doFsck(conf, false));
  } finally {
    cleanupTable(table);
  }
}
/**
 * Test -noHdfsChecking option can detect and fix assignments issue.
 *
 * A region is un-deployed (left in meta and on hdfs); hbck with
 * setCheckHdfs(false) must report the same NOT_DEPLOYED / HOLE_IN_REGION_CHAIN
 * errors as a full run, and fixAssignments must still repair them.
 */
@Test (timeout=180000)
public void testFixAssignmentsAndNoHdfsChecking() throws Exception {
  TableName table =
      TableName.valueOf("testFixAssignmentsAndNoHdfsChecking");
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // Mess it up by closing a region
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("B"), true,
        false, false, false, HRegionInfo.DEFAULT_REPLICA_ID);
    // verify there is no other errors
    HBaseFsck hbck = doFsck(conf, false);
    assertErrors(hbck,
        new HBaseFsck.ErrorReporter.ERROR_CODE[] {
            HBaseFsck.ErrorReporter.ERROR_CODE.NOT_DEPLOYED,
            HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    // verify that noHdfsChecking report the same errors
    HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
    fsck.connect();
    HBaseFsck.setDisplayFullReport(); // i.e. -details
    fsck.setTimeLag(0);
    fsck.setCheckHdfs(false);
    fsck.onlineHbck();
    assertErrors(fsck,
        new HBaseFsck.ErrorReporter.ERROR_CODE[] {
            HBaseFsck.ErrorReporter.ERROR_CODE.NOT_DEPLOYED,
            HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    fsck.close();
    // verify that fixAssignments works fine with noHdfsChecking
    fsck = new HBaseFsck(conf, hbfsckExecutorService);
    fsck.connect();
    HBaseFsck.setDisplayFullReport(); // i.e. -details
    fsck.setTimeLag(0);
    fsck.setCheckHdfs(false);
    fsck.setFixAssignments(true);
    fsck.onlineHbck();
    // A repair was made, so hbck requests a re-run; the second pass is clean.
    assertTrue(fsck.shouldRerun());
    fsck.onlineHbck();
    assertNoErrors(fsck);
    // No data loss: all rows are still readable after the repair.
    assertEquals(ROWKEYS.length, countRows());
    fsck.close();
  } finally {
    cleanupTable(table);
  }
}
/**
 * Test -noHdfsChecking option can detect region is not in meta but deployed.
 * However, it can not fix it without checking Hdfs because we need to get
 * the region info from Hdfs in this case, then to patch the meta.
 */
@Test (timeout=180000)
public void testFixMetaNotWorkingWithNoHdfsChecking() throws Exception {
  TableName table =
      TableName.valueOf("testFixMetaNotWorkingWithNoHdfsChecking");
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // Mess it up by deleting a region from the metadata
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"),
        Bytes.toBytes("B"), false, true, false, false, HRegionInfo.DEFAULT_REPLICA_ID);
    // verify there is no other errors
    HBaseFsck hbck = doFsck(conf, false);
    assertErrors(hbck,
        new HBaseFsck.ErrorReporter.ERROR_CODE[] {
            HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META,
            HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    // verify that noHdfsChecking report the same errors
    HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
    fsck.connect();
    HBaseFsck.setDisplayFullReport(); // i.e. -details
    fsck.setTimeLag(0);
    fsck.setCheckHdfs(false);
    fsck.onlineHbck();
    assertErrors(fsck,
        new HBaseFsck.ErrorReporter.ERROR_CODE[] {
            HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META,
            HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    fsck.close();
    // verify that fixMeta doesn't work with noHdfsChecking
    fsck = new HBaseFsck(conf, hbfsckExecutorService);
    fsck.connect();
    HBaseFsck.setDisplayFullReport(); // i.e. -details
    fsck.setTimeLag(0);
    fsck.setCheckHdfs(false);
    fsck.setFixAssignments(true);
    fsck.setFixMeta(true);
    fsck.onlineHbck();
    // No repair could be made without hdfs info, so hbck must not request a
    // re-run and the original errors must still be reported.
    assertFalse(fsck.shouldRerun());
    assertErrors(fsck,
        new HBaseFsck.ErrorReporter.ERROR_CODE[] {
            HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META,
            HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    fsck.close();
    // fix the cluster so other tests won't be impacted
    fsck = doFsck(conf, true);
    assertTrue(fsck.shouldRerun());
    fsck = doFsck(conf, true);
    assertNoErrors(fsck);
  } finally {
    cleanupTable(table);
  }
}
/**
 * Test -fixHdfsHoles doesn't work with -noHdfsChecking option,
 * and -noHdfsChecking can't detect orphan Hdfs region.
 */
@Test (timeout=180000)
public void testFixHdfsHolesNotWorkingWithNoHdfsChecking() throws Exception {
  TableName table =
      TableName.valueOf("testFixHdfsHolesNotWorkingWithNoHdfsChecking");
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // Mess it up by creating an overlap in the metadata
    admin.disableTable(table);
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("A"), Bytes.toBytes("B"), true,
        true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
    admin.enableTable(table);
    // Create and deploy a region ["A2","B") overlapping the deleted range.
    HRegionInfo hriOverlap =
        createRegion(tbl.getTableDescriptor(), Bytes.toBytes("A2"), Bytes.toBytes("B"));
    TEST_UTIL.assignRegion(hriOverlap);
    ServerName server = regionStates.getRegionServerOfRegion(hriOverlap);
    TEST_UTIL.assertRegionOnServer(hriOverlap, server, REGION_ONLINE_TIMEOUT);
    // A full (hdfs-checking) run sees all three problems.
    HBaseFsck hbck = doFsck(conf, false);
    assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.ORPHAN_HDFS_REGION,
        HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
        HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN});
    // verify that noHdfsChecking can't detect ORPHAN_HDFS_REGION
    HBaseFsck fsck = new HBaseFsck(conf, hbfsckExecutorService);
    fsck.connect();
    HBaseFsck.setDisplayFullReport(); // i.e. -details
    fsck.setTimeLag(0);
    fsck.setCheckHdfs(false);
    fsck.onlineHbck();
    assertErrors(fsck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    fsck.close();
    // verify that fixHdfsHoles doesn't work with noHdfsChecking
    fsck = new HBaseFsck(conf, hbfsckExecutorService);
    fsck.connect();
    HBaseFsck.setDisplayFullReport(); // i.e. -details
    fsck.setTimeLag(0);
    fsck.setCheckHdfs(false);
    fsck.setFixHdfsHoles(true);
    fsck.setFixHdfsOverlaps(true);
    fsck.setFixHdfsOrphans(true);
    fsck.onlineHbck();
    // The hdfs-side fixers are inert without hdfs checking: no re-run is
    // requested and the hole is still reported.
    assertFalse(fsck.shouldRerun());
    assertErrors(fsck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    fsck.close();
  } finally {
    // The table may still be disabled if an assertion fired mid-test.
    if (admin.isTableDisabled(table)) {
      admin.enableTable(table);
    }
    cleanupTable(table);
  }
}
/**
 * This creates a table and then corrupts an hfile. Hbck should quarantine the file.
 *
 * A truncated copy of a flushed hfile ("deadbeef") is dropped into the region
 * directory; the quarantine run must find exactly one corrupt file, sideline
 * it, and leave the table enable-able again.
 */
@Test(timeout=180000)
public void testQuarantineCorruptHFile() throws Exception {
  TableName table = TableName.valueOf(name.getMethodName());
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    admin.flush(table); // flush is async.
    FileSystem fs = FileSystem.get(conf);
    Path hfile = getFlushedHFile(fs, table);
    // Mess it up by leaving a hole in the assignment, meta, and hdfs data
    admin.disableTable(table);
    // create new corrupt file called deadbeef (valid hfile name)
    Path corrupt = new Path(hfile.getParent(), "deadbeef");
    TestHFile.truncateFile(fs, hfile, corrupt);
    LOG.info("Created corrupted file " + corrupt);
    HBaseFsck.debugLsr(conf, FSUtils.getRootDir(conf));
    // we cannot enable here because enable never finished due to the corrupt region.
    HBaseFsck res = HbckTestingUtil.doHFileQuarantine(conf, table);
    // Expected value first per JUnit convention so failure messages read correctly.
    assertEquals(0, res.getRetCode());
    HFileCorruptionChecker hfcc = res.getHFilecorruptionChecker();
    assertEquals(5, hfcc.getHFilesChecked());
    assertEquals(1, hfcc.getCorrupted().size());
    assertEquals(0, hfcc.getFailures().size());
    assertEquals(1, hfcc.getQuarantined().size());
    assertEquals(0, hfcc.getMissing().size());
    // Its been fixed, verify that we can enable.
    admin.enableTable(table);
  } finally {
    cleanupTable(table);
  }
}
/**
 * This creates a table and simulates the race situation where a concurrent compaction or split
 * has removed an hfile after the corruption checker learned about it.
 */
@Test(timeout=180000)
public void testQuarantineMissingHFile() throws Exception {
  TableName table = TableName.valueOf(name.getMethodName());
  // inject a fault in the hfcc created.
  final FileSystem fs = FileSystem.get(conf);
  HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
    @Override
    public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles)
        throws IOException {
      return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
        // Flips to true after the first hfile visit so only one file is deleted.
        AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
        @Override
        protected void checkHFile(Path p) throws IOException {
          // Delete the first hfile the checker visits, simulating a concurrent
          // compaction/split removing it out from under the checker.
          if (attemptedFirstHFile.compareAndSet(false, true)) {
            assertTrue(fs.delete(p, true)); // make sure delete happened.
          }
          super.checkHFile(p);
        }
      };
    }
  };
  doQuarantineTest(table, hbck, 4, 0, 0, 0, 1); // 4 attempted, but 1 missing.
  hbck.close();
}
/**
 * Injects a degenerate region (startkey == endkey == "B") that overlaps an
 * existing one, then verifies hbck reports DEGENERATE_REGION plus the two
 * DUPE_STARTKEYS and repairs the table without losing any rows.
 */
@Test (timeout=180000)
public void testDegenerateRegions() throws Exception {
  final TableName table = TableName.valueOf("tableDegenerateRegions");
  try {
    setupTable(table);
    assertNoErrors(doFsck(conf, false));
    assertEquals(ROWKEYS.length, countRows());
    // Create and deploy a region whose start and end key are both "B".
    HRegionInfo degenerate =
        createRegion(tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("B"));
    TEST_UTIL.assignRegion(degenerate);
    ServerName host = regionStates.getRegionServerOfRegion(degenerate);
    TEST_UTIL.assertRegionOnServer(degenerate, host, REGION_ONLINE_TIMEOUT);
    HBaseFsck report = doFsck(conf, false);
    assertErrors(report, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.DEGENERATE_REGION,
        HBaseFsck.ErrorReporter.ERROR_CODE.DUPE_STARTKEYS,
        HBaseFsck.ErrorReporter.ERROR_CODE.DUPE_STARTKEYS });
    assertEquals(2, report.getOverlapGroups(table).size());
    assertEquals(ROWKEYS.length, countRows());
    // Repair, then confirm the degenerate region is gone and no data was lost.
    doFsck(conf, true);
    HBaseFsck clean = doFsck(conf, false);
    assertNoErrors(clean);
    assertEquals(0, clean.getOverlapGroups(table).size());
    assertEquals(ROWKEYS.length, countRows());
  } finally {
    cleanupTable(table);
  }
}
/**
 * Test missing REGIONINFO_QUALIFIER in hbase:meta: deletes the info cell of
 * every user-table region and inserts a fake meta row carrying only server
 * and startcode columns. hbck must report EMPTY_META_CELL and the repair run
 * must clear it.
 */
@Test (timeout=180000)
public void testMissingRegionInfoQualifier() throws Exception {
  Connection connection = ConnectionFactory.createConnection(conf);
  TableName table = TableName.valueOf("testMissingRegionInfoQualifier");
  try {
    setupTable(table);
    // Mess it up by removing the RegionInfo for one region.
    final List<Delete> deletes = new LinkedList<Delete>();
    Table meta = connection.getTable(TableName.META_TABLE_NAME, hbfsckExecutorService);
    try {
      MetaTableAccessor.fullScanRegions(connection, new MetaTableAccessor.Visitor() {
        @Override
        public boolean visit(Result rowResult) throws IOException {
          HRegionInfo hri = MetaTableAccessor.getHRegionInfo(rowResult);
          if (hri != null && !hri.getTable().isSystemTable()) {
            Delete delete = new Delete(rowResult.getRow());
            delete.addColumn(HConstants.CATALOG_FAMILY, HConstants.REGIONINFO_QUALIFIER);
            deletes.add(delete);
          }
          return true;
        }
      });
      meta.delete(deletes);
      // Mess it up by creating a fake hbase:meta entry with no associated RegionInfo
      meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66"))
          .addColumn(HConstants.CATALOG_FAMILY, HConstants.SERVER_QUALIFIER,
              Bytes.toBytes("node1:60020")));
      meta.put(new Put(Bytes.toBytes(table + ",,1361911384013.810e28f59a57da91c66"))
          .addColumn(HConstants.CATALOG_FAMILY, HConstants.STARTCODE_QUALIFIER,
              Bytes.toBytes(1362150791183L)));
    } finally {
      // Previously leaked when the scan or mutations threw; always close.
      meta.close();
    }
    HBaseFsck hbck = doFsck(conf, false);
    assertTrue(hbck.getErrors().getErrorList().contains(
        HBaseFsck.ErrorReporter.ERROR_CODE.EMPTY_META_CELL));
    // fix the empty meta cells
    hbck = doFsck(conf, true);
    // check that the empty meta cells are fixed
    assertFalse(hbck.getErrors().getErrorList().contains(
        HBaseFsck.ErrorReporter.ERROR_CODE.EMPTY_META_CELL));
  } finally {
    try {
      cleanupTable(table);
    } finally {
      // Previously closed outside the finally block, leaking the connection
      // whenever an assertion or cleanup failed.
      IOUtils.closeQuietly(connection);
    }
  }
}
/**
 * Test pluggable error reporter. It can be plugged in
 * from system property or configuration.
 */
@Test (timeout=180000)
public void testErrorReporter() throws Exception {
  try {
    MockErrorReporter.calledCount = 0;
    doFsck(conf, false);
    // Default reporter in use, so the mock must never have been invoked.
    // (Expected value first per JUnit convention; was reversed before.)
    assertEquals(0, MockErrorReporter.calledCount);
    // Plug in the mock reporter through configuration and run again.
    conf.set("hbasefsck.errorreporter", MockErrorReporter.class.getName());
    doFsck(conf, false);
    assertTrue(MockErrorReporter.calledCount > 20);
  } finally {
    // Restore the printing reporter and reset the counter so later tests
    // in this class are not affected.
    conf.set("hbasefsck.errorreporter",
        HBaseFsck.PrintingErrorReporter.class.getName());
    MockErrorReporter.calledCount = 0;
  }
}
/**
 * Checks hbck's detection and cleanup of expired table locks. Time is driven
 * by an injected IncrementingEnvironmentEdge so locks can be expired without
 * real waiting.
 */
@Test(timeout=180000)
public void testCheckTableLocks() throws Exception {
  IncrementingEnvironmentEdge edge = new IncrementingEnvironmentEdge(0);
  EnvironmentEdgeManager.injectEdge(edge);
  // check no errors
  HBaseFsck hbck = doFsck(conf, false);
  assertNoErrors(hbck);
  ServerName mockName = ServerName.valueOf("localhost", 60000, 1);
  final TableName tableName = TableName.valueOf("foo");
  // obtain one lock
  final TableLockManager tableLockManager =
      TableLockManager.createTableLockManager(conf, TEST_UTIL.getZooKeeperWatcher(), mockName);
  TableLockManager.TableLock
      writeLock = tableLockManager.writeLock(tableName, "testCheckTableLocks");
  writeLock.acquire();
  hbck = doFsck(conf, false);
  assertNoErrors(hbck); // should not have expired, no problems
  // Advance the injected clock past the lock-expiration timeout.
  edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
      TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
  hbck = doFsck(conf, false);
  assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
      HBaseFsck.ErrorReporter.ERROR_CODE.EXPIRED_TABLE_LOCK});
  final CountDownLatch latch = new CountDownLatch(1);
  new Thread() {
    @Override
    public void run() {
      // NOTE(review): despite the variable name, this is a second WRITE lock
      // request; it blocks behind the expired lock until hbck reaps it.
      TableLockManager.TableLock
          readLock = tableLockManager.writeLock(tableName, "testCheckTableLocks");
      try {
        latch.countDown();
        readLock.acquire();
      } catch (IOException ex) {
        fail();
      } catch (IllegalStateException ex) {
        return; // expected, since this will be reaped under us.
      }
      fail("should not have come here");
    };
  }.start();
  latch.await(); // wait until thread starts
  Threads.sleep(300); // wait some more to ensure writeLock.acquire() is called
  hbck = doFsck(conf, false);
  // still one expired, one not-expired
  assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
      HBaseFsck.ErrorReporter.ERROR_CODE.EXPIRED_TABLE_LOCK});
  edge.incrementTime(conf.getLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT,
      TableLockManager.DEFAULT_TABLE_LOCK_EXPIRE_TIMEOUT_MS)); // let table lock expire
  hbck = doFsck(conf, false);
  assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
      HBaseFsck.ErrorReporter.ERROR_CODE.EXPIRED_TABLE_LOCK,
      HBaseFsck.ErrorReporter.ERROR_CODE.EXPIRED_TABLE_LOCK}); // both are expired
  Configuration localConf = new Configuration(conf);
  // reaping from ZKInterProcessWriteLock uses znode cTime,
  // which is not injectable through EnvironmentEdge
  localConf.setLong(TableLockManager.TABLE_LOCK_EXPIRE_TIMEOUT, 1);
  Threads.sleep(10);
  hbck = doFsck(localConf, true); // now fix both cases
  hbck = doFsck(localConf, false);
  assertNoErrors(hbck);
  // ensure that locks are deleted
  writeLock = tableLockManager.writeLock(tableName, "should acquire without blocking");
  writeLock.acquire(); // this should not block.
  writeLock.release(); // release for clean state
  tableLockManager.tableDeleted(tableName);
}
/**
 * Checks hbck's detection and cleanup of replication queues that belong to a
 * removed peer. Queues "1"/"1-server2" belong to the live peer "1" and must
 * survive; queues "2"/"2-server2" have no peer and must be reported as
 * UNDELETED_REPLICATION_QUEUE and removed by the repair run.
 */
@Test(timeout=180000)
public void testCheckReplication() throws Exception {
  // check no errors
  HBaseFsck hbck = doFsck(conf, false);
  assertNoErrors(hbck);
  // create peer
  ReplicationAdmin replicationAdmin = new ReplicationAdmin(conf);
  Assert.assertEquals(0, replicationAdmin.getPeersCount());
  int zkPort = conf.getInt(HConstants.ZOOKEEPER_CLIENT_PORT,
      HConstants.DEFAULT_ZOOKEPER_CLIENT_PORT);
  ReplicationPeerConfig rpc = new ReplicationPeerConfig();
  rpc.setClusterKey("127.0.0.1:" + zkPort + ":/hbase");
  replicationAdmin.addPeer("1", rpc, null);
  replicationAdmin.getPeersCount();
  Assert.assertEquals(1, replicationAdmin.getPeersCount());
  // create replicator
  ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "Test Hbase Fsck", connection);
  ReplicationQueues repQueues =
      ReplicationFactory.getReplicationQueues(new ReplicationQueuesArguments(conf, connection,
          zkw));
  repQueues.init("server1");
  // queues for current peer, no errors
  repQueues.addLog("1", "file1");
  repQueues.addLog("1-server2", "file1");
  Assert.assertEquals(2, repQueues.getAllQueues().size());
  hbck = doFsck(conf, false);
  assertNoErrors(hbck);
  // queues for removed peer
  repQueues.addLog("2", "file1");
  repQueues.addLog("2-server2", "file1");
  Assert.assertEquals(4, repQueues.getAllQueues().size());
  hbck = doFsck(conf, false);
  assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
      HBaseFsck.ErrorReporter.ERROR_CODE.UNDELETED_REPLICATION_QUEUE,
      HBaseFsck.ErrorReporter.ERROR_CODE.UNDELETED_REPLICATION_QUEUE });
  // fix the case
  hbck = doFsck(conf, true);
  hbck = doFsck(conf, false);
  assertNoErrors(hbck);
  // ensure only the queues of the removed peer "2" are deleted
  Assert.assertEquals(2, repQueues.getAllQueues().size());
  Assert.assertNull(repQueues.getLogsInQueue("2"));
  // BUGFIX: was "2-sever2" (typo) which trivially passed because that queue
  // never existed; check the queue actually created above.
  Assert.assertNull(repQueues.getLogsInQueue("2-server2"));
  replicationAdmin.removePeer("1");
  repQueues.removeAllQueues();
  zkw.close();
  replicationAdmin.close();
}
/**
 * Unassigns region ["B","C"), drops it from hbase:meta and deletes its
 * .regioninfo file while keeping its data directory (an orphan hdfs region),
 * then verifies hbck reports ORPHAN_HDFS_REGION, NOT_IN_META_OR_DEPLOYED and
 * HOLE_IN_REGION_CHAIN and repairs everything without data loss.
 */
@Test(timeout=180000)
public void testHDFSRegioninfoMissing() throws Exception {
  final TableName table = TableName.valueOf("tableHDFSRegioninfoMissing");
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // Unassign, remove from meta, and strip the .regioninfo file — but leave
    // the region data in place.
    admin.disableTable(table);
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("C"), true,
        true, false, true, HRegionInfo.DEFAULT_REPLICA_ID);
    admin.enableTable(table);
    HBaseFsck report = doFsck(conf, false);
    assertErrors(report, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.ORPHAN_HDFS_REGION,
        HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
        HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    // holes are separate from overlap groups
    assertEquals(0, report.getOverlapGroups(table).size());
    // Repair, then verify the hole is gone and every row is still readable.
    doFsck(conf, true);
    assertNoErrors(doFsck(conf, false));
    assertEquals(ROWKEYS.length, countRows());
  } finally {
    cleanupTable(table);
  }
}
/**
 * Unassigns region ["B","C") and drops it from hbase:meta while keeping its
 * HDFS data, then verifies hbck reports NOT_IN_META_OR_DEPLOYED plus
 * HOLE_IN_REGION_CHAIN and repairs the hole without data loss.
 */
@Test (timeout=180000)
public void testNotInMetaOrDeployedHole() throws Exception {
  final TableName table = TableName.valueOf("tableNotInMetaOrDeployedHole");
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // Unassign the region and remove it from meta; keep the hdfs data.
    admin.disableTable(table);
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("C"), true,
        true, false); // don't rm from fs
    admin.enableTable(table);
    HBaseFsck report = doFsck(conf, false);
    assertErrors(report, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
        HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    // holes are separate from overlap groups
    assertEquals(0, report.getOverlapGroups(table).size());
    // The repair run still reports the problems it is in the middle of fixing.
    assertErrors(doFsck(conf, true), new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
        HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    // A follow-up read-only run must be clean and no rows may be lost.
    assertNoErrors(doFsck(conf, false));
    assertEquals(ROWKEYS.length, countRows());
  } finally {
    cleanupTable(table);
  }
}
/**
 * Simulates a split that failed before the point of no return (daughters
 * exist on disk but were never written to hbase:meta) and verifies hbck
 * reports the two stray daughters and can clean them up without data loss.
 */
@Test (timeout=180000)
public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
  TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
  MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
  try {
    // Create a one-family table and load five rows (r0..r4).
    HTableDescriptor desc = new HTableDescriptor(table);
    desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
    createTable(TEST_UTIL, desc, null);
    tbl = connection.getTable(desc.getTableName());
    for (int i = 0; i < 5; i++) {
      Put p1 = new Put(("r" + i).getBytes());
      p1.addColumn(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
      tbl.put(p1);
    }
    admin.flush(desc.getTableName());
    List<HRegion> regions = cluster.getRegions(desc.getTableName());
    int serverWith = cluster.getServerWith(regions.get(0).getRegionInfo().getRegionName());
    HRegionServer regionServer = cluster.getRegionServer(serverWith);
    // NOTE(review): return value unused — looks like a leftover duplicate call.
    cluster.getServerWith(regions.get(0).getRegionInfo().getRegionName());
    // Start a split at "r3" but execute only the steps before the point of
    // no return, so the daughters never reach hbase:meta.
    SplitTransactionImpl st = (SplitTransactionImpl)
        new SplitTransactionFactory(TEST_UTIL.getConfiguration())
            .create(regions.get(0), Bytes.toBytes("r3"));
    st.prepare();
    st.stepsBeforePONR(regionServer, regionServer, false);
    // Offline everything in transition, then put the parent back online.
    AssignmentManager am = cluster.getMaster().getAssignmentManager();
    for (RegionState state : am.getRegionStates().getRegionsInTransition()) {
      am.regionOffline(state.getRegion());
    }
    Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
    regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
    am.assign(regionsMap);
    am.waitForAssignment(regions.get(0).getRegionInfo());
    // The half-created daughters exist on disk but not in meta.
    HBaseFsck hbck = doFsck(conf, false);
    assertErrors(hbck, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
        HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
    // holes are separate from overlap groups
    assertEquals(0, hbck.getOverlapGroups(table).size());
    // fix hole
    // NOTE(review): the boolean list selects individual fix options in the
    // doFsck helper overload — confirm the ordering against HbckTestingUtil.
    assertErrors(
        doFsck(conf, false, true, false, false, false, false, false, false, false, false, false,
            null),
        new HBaseFsck.ErrorReporter.ERROR_CODE[] {
            HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
            HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
    // check that hole fixed
    assertNoErrors(doFsck(conf, false));
    // All five rows loaded above must still be readable.
    assertEquals(5, countRows());
  } finally {
    if (tbl != null) {
      tbl.close();
      tbl = null;
    }
    cleanupTable(table);
  }
}
/**
 * Drops region ["B","C") from hbase:meta only (deployment and HDFS data are
 * left alone), then verifies hbck reports NOT_IN_META_OR_DEPLOYED plus
 * HOLE_IN_REGION_CHAIN and repairs the hole without data loss.
 */
@Test (timeout=180000)
public void testNotInMetaHole() throws Exception {
  final TableName table = TableName.valueOf("tableNotInMetaHole");
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // Remove the region from meta only; don't unassign, don't touch hdfs.
    admin.disableTable(table);
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("C"), false,
        true, false); // don't rm from fs
    admin.enableTable(table);
    HBaseFsck report = doFsck(conf, false);
    assertErrors(report, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
        HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    // holes are separate from overlap groups
    assertEquals(0, report.getOverlapGroups(table).size());
    // The repair run still reports the problems it is in the middle of fixing.
    assertErrors(doFsck(conf, true), new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
        HBaseFsck.ErrorReporter.ERROR_CODE.HOLE_IN_REGION_CHAIN });
    // A follow-up read-only run must be clean and no rows may be lost.
    assertNoErrors(doFsck(conf, false));
    assertEquals(ROWKEYS.length, countRows());
  } finally {
    cleanupTable(table);
  }
}
/**
 * Deletes the HDFS data of region ["B","C") while leaving it in hbase:meta
 * and deployed, then verifies hbck reports NOT_IN_HDFS and repairs the table;
 * the rows that lived in the deleted region are lost.
 */
@Test (timeout=180000)
public void testNotInHdfs() throws Exception {
  final TableName table = TableName.valueOf("tableNotInHdfs");
  try {
    setupTable(table);
    assertEquals(ROWKEYS.length, countRows());
    // make sure data in regions, if in wal only there is no data loss
    admin.flush(table);
    // Delete only the hdfs side of region ["B","C").
    deleteRegion(conf, tbl.getTableDescriptor(), Bytes.toBytes("B"), Bytes.toBytes("C"), false,
        false, true); // don't rm meta
    HBaseFsck report = doFsck(conf, false);
    assertErrors(report, new HBaseFsck.ErrorReporter.ERROR_CODE[] {
        HBaseFsck.ErrorReporter.ERROR_CODE.NOT_IN_HDFS });
    // holes are separate from overlap groups
    assertEquals(0, report.getOverlapGroups(table).size());
    // Repair and re-check; the two rows stored in the region are gone.
    doFsck(conf, true);
    assertNoErrors(doFsck(conf, false));
    assertEquals(ROWKEYS.length - 2, countRows());
  } finally {
    cleanupTable(table);
  }
}
/**
 * This creates a table and simulates the race situation where a concurrent compaction or split
 * has removed an colfam dir before the corruption checker got to it.
 */
// Disabled because fails sporadically. Is this test right? Timing-wise, there could be no
// files in a column family on initial creation -- as suggested by Matteo.
@Ignore
@Test(timeout=180000)
public void testQuarantineMissingFamdir() throws Exception {
  TableName table = TableName.valueOf(name.getMethodName());
  // inject a fault in the hfcc created.
  final FileSystem fs = FileSystem.get(conf);
  HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
    @Override
    public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles)
        throws IOException {
      return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
        // Flips to true after the first visit so only one directory is deleted.
        AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
        @Override
        protected void checkColFamDir(Path p) throws IOException {
          // Delete the first column-family dir the checker visits, simulating
          // a concurrent compaction/split removing it before the check.
          if (attemptedFirstHFile.compareAndSet(false, true)) {
            assertTrue(fs.delete(p, true)); // make sure delete happened.
          }
          super.checkColFamDir(p);
        }
      };
    }
  };
  doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
  hbck.close();
}
/**
 * This creates a table and simulates the race situation where a concurrent compaction or split
 * has removed a region dir before the corruption checker got to it.
 */
@Test(timeout=180000)
public void testQuarantineMissingRegionDir() throws Exception {
  TableName table = TableName.valueOf(name.getMethodName());
  // inject a fault in the hfcc created.
  final FileSystem fs = FileSystem.get(conf);
  HBaseFsck hbck = new HBaseFsck(conf, hbfsckExecutorService) {
    @Override
    public HFileCorruptionChecker createHFileCorruptionChecker(boolean sidelineCorruptHFiles)
        throws IOException {
      return new HFileCorruptionChecker(conf, executor, sidelineCorruptHFiles) {
        // Flips to true after the first visit so only one directory is deleted.
        AtomicBoolean attemptedFirstHFile = new AtomicBoolean(false);
        @Override
        protected void checkRegionDir(Path p) throws IOException {
          // Delete the first region dir the checker visits, simulating a
          // concurrent compaction/split removing it before the check.
          if (attemptedFirstHFile.compareAndSet(false, true)) {
            assertTrue(fs.delete(p, true)); // make sure delete happened.
          }
          super.checkRegionDir(p);
        }
      };
    }
  };
  doQuarantineTest(table, hbck, 3, 0, 0, 0, 1);
  hbck.close();
}
}