blob: 687902d79d68e141ef67eddd8f5ae693fc564a79 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.NavigableMap;
import java.util.TreeMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.CellScanner;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Durability;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.hbase.wal.WAL;
import org.apache.hadoop.hbase.wal.WALEdit;
import org.apache.hadoop.hbase.wal.WALKeyImpl;
import org.apache.hadoop.hbase.wal.WALProvider.Writer;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.ClassRule;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.mockito.Mockito;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hbase.thirdparty.com.google.common.io.Closeables;
/**
* Testing for lock up of FSHLog.
*/
@Category({ RegionServerTests.class, MediumTests.class })
public class TestWALLockup {
@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestWALLockup.class);
private static final Logger LOG = LoggerFactory.getLogger(TestWALLockup.class);
@Rule
public TestName name = new TestName();
private static final String COLUMN_FAMILY = "MyCF";
private static final byte [] COLUMN_FAMILY_BYTES = Bytes.toBytes(COLUMN_FAMILY);
private static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
private static Configuration CONF ;
private String dir;
// Test names
protected TableName tableName;
@Before
public void setup() throws IOException {
CONF = TEST_UTIL.getConfiguration();
// Disable block cache.
CONF.setFloat(HConstants.HFILE_BLOCK_CACHE_SIZE_KEY, 0f);
dir = TEST_UTIL.getDataTestDir("TestHRegion").toString();
tableName = TableName.valueOf(name.getMethodName());
}
@After
public void tearDown() throws Exception {
EnvironmentEdgeManagerTestHelper.reset();
LOG.info("Cleaning test directory: " + TEST_UTIL.getDataTestDir());
TEST_UTIL.cleanupTestDir();
}
private String getName() {
return name.getMethodName();
}
// A WAL that we can have throw exceptions when a flag is set.
private static final class DodgyFSLog extends FSHLog {
// Set this when want the WAL to start throwing exceptions.
volatile boolean throwException = false;
// Latch to hold up processing until after another operation has had time to run.
CountDownLatch latch = new CountDownLatch(1);
public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
throws IOException {
super(fs, root, logDir, conf);
}
@Override
protected void afterCreatingZigZagLatch() {
// If throwException set, then append will throw an exception causing the WAL to be
// rolled. We'll come in here. Hold up processing until a sync can get in before
// the zigzag has time to complete its setup and get its own sync in. This is what causes
// the lock up we've seen in production.
if (throwException) {
try {
LOG.info("LATCHED");
// So, timing can have it that the test can run and the bad flush below happens
// before we get here. In this case, we'll be stuck waiting on this latch but there
// is nothing in the WAL pipeline to get us to the below beforeWaitOnSafePoint...
// because all WALs have rolled. In this case, just give up on test.
if (!this.latch.await(5, TimeUnit.SECONDS)) {
LOG.warn("GIVE UP! Failed waiting on latch...Test is ABORTED!");
}
} catch (InterruptedException e) {
}
}
}
@Override
protected void beforeWaitOnSafePoint() {
if (throwException) {
LOG.info("COUNTDOWN");
// Don't countdown latch until someone waiting on it otherwise, the above
// afterCreatingZigZagLatch will get to the latch and no one will ever free it and we'll
// be stuck; test won't go down
while (this.latch.getCount() <= 0)
Threads.sleep(1);
this.latch.countDown();
}
}
@Override
protected Writer createWriterInstance(Path path) throws IOException {
final Writer w = super.createWriterInstance(path);
return new Writer() {
@Override
public void close() throws IOException {
w.close();
}
@Override
public void sync(boolean forceSync) throws IOException {
if (throwException) {
throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
}
w.sync(forceSync);
}
@Override
public void append(Entry entry) throws IOException {
if (throwException) {
throw new IOException("FAKE! Failed to replace a bad datanode...APPEND");
}
w.append(entry);
}
@Override
public long getLength() {
return w.getLength();
}
@Override
public long getSyncedLength() {
return w.getSyncedLength();
}
};
}
}
/**
* Reproduce locking up that happens when we get an inopportune sync during setup for
* zigzaglatch wait. See HBASE-14317. If below is broken, we will see this test timeout because
* it is locked up.
* <p>First I need to set up some mocks for Server and RegionServerServices. I also need to
* set up a dodgy WAL that will throw an exception when we go to append to it.
*/
@Test
public void testLockupWhenSyncInMiddleOfZigZagSetup() throws IOException {
// Mocked up server and regionserver services. Needed below.
RegionServerServices services = Mockito.mock(RegionServerServices.class);
Mockito.when(services.getConfiguration()).thenReturn(CONF);
Mockito.when(services.isStopped()).thenReturn(false);
Mockito.when(services.isAborted()).thenReturn(false);
// OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
FileSystem fs = FileSystem.get(CONF);
Path rootDir = new Path(dir + getName());
DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
dodgyWAL.init();
Path originalWAL = dodgyWAL.getCurrentFileName();
// I need a log roller running.
LogRoller logRoller = new LogRoller(services);
logRoller.addWAL(dodgyWAL);
// There is no 'stop' once a logRoller is running.. it just dies.
logRoller.start();
// Now get a region and start adding in edits.
final HRegion region = initHRegion(tableName, null, null, CONF, dodgyWAL);
byte [] bytes = Bytes.toBytes(getName());
NavigableMap<byte[], Integer> scopes = new TreeMap<>(
Bytes.BYTES_COMPARATOR);
scopes.put(COLUMN_FAMILY_BYTES, 0);
MultiVersionConcurrencyControl mvcc = new MultiVersionConcurrencyControl();
try {
// First get something into memstore. Make a Put and then pull the Cell out of it. Will
// manage append and sync carefully in below to manufacture hang. We keep adding same
// edit. WAL subsystem doesn't care.
Put put = new Put(bytes);
put.addColumn(COLUMN_FAMILY_BYTES, Bytes.toBytes("1"), bytes);
WALKeyImpl key = new WALKeyImpl(region.getRegionInfo().getEncodedNameAsBytes(),
TableName.META_TABLE_NAME, System.currentTimeMillis(), mvcc, scopes);
WALEdit edit = new WALEdit();
CellScanner CellScanner = put.cellScanner();
assertTrue(CellScanner.advance());
edit.add(CellScanner.current());
// Put something in memstore and out in the WAL. Do a big number of appends so we push
// out other side of the ringbuffer. If small numbers, stuff doesn't make it to WAL
for (int i = 0; i < 1000; i++) {
region.put(put);
}
// Set it so we start throwing exceptions.
LOG.info("SET throwing of exception on append");
dodgyWAL.throwException = true;
// This append provokes a WAL roll request
dodgyWAL.appendData(region.getRegionInfo(), key, edit);
boolean exception = false;
try {
dodgyWAL.sync(false);
} catch (Exception e) {
exception = true;
}
assertTrue("Did not get sync exception", exception);
// Get a memstore flush going too so we have same hung profile as up in the issue over
// in HBASE-14317. Flush hangs trying to get sequenceid because the ringbuffer is held up
// by the zigzaglatch waiting on syncs to come home.
Thread t = new Thread ("Flusher") {
@Override
public void run() {
try {
if (region.getMemStoreDataSize() <= 0) {
throw new IOException("memstore size=" + region.getMemStoreDataSize());
}
region.flush(false);
} catch (IOException e) {
// Can fail trying to flush in middle of a roll. Not a failure. Will succeed later
// when roll completes.
LOG.info("In flush", e);
}
LOG.info("Exiting");
}
};
t.setDaemon(true);
t.start();
// Wait until
while (dodgyWAL.latch.getCount() > 0) {
Threads.sleep(1);
}
// Now assert I got a new WAL file put in place even though loads of errors above.
assertTrue(originalWAL != dodgyWAL.getCurrentFileName());
// Can I append to it?
dodgyWAL.throwException = false;
try {
region.put(put);
} catch (Exception e) {
LOG.info("In the put", e);
}
} finally {
// To stop logRoller, its server has to say it is stopped.
Mockito.when(services.isStopped()).thenReturn(true);
Closeables.close(logRoller, true);
try {
if (region != null) {
region.close();
}
if (dodgyWAL != null) {
dodgyWAL.close();
}
} catch (Exception e) {
LOG.info("On way out", e);
}
}
}
/**
*
* If below is broken, we will see this test timeout because RingBufferEventHandler was stuck in
* attainSafePoint. Everyone will wait for sync to finish forever. See HBASE-14317.
*/
@Test
public void testRingBufferEventHandlerStuckWhenSyncFailed()
throws IOException, InterruptedException {
// A WAL that we can have throw exceptions and slow FSHLog.replaceWriter down
class DodgyFSLog extends FSHLog {
private volatile boolean zigZagCreated = false;
public DodgyFSLog(FileSystem fs, Path root, String logDir, Configuration conf)
throws IOException {
super(fs, root, logDir, conf);
}
@Override
protected void afterCreatingZigZagLatch() {
zigZagCreated = true;
// Sleep a while to wait for RingBufferEventHandler to get stuck first.
try {
Thread.sleep(3000);
} catch (InterruptedException ignore) {
}
}
@Override
protected long getSequenceOnRingBuffer() {
return super.getSequenceOnRingBuffer();
}
protected void publishSyncOnRingBufferAndBlock(long sequence) {
try {
super.blockOnSync(super.publishSyncOnRingBuffer(sequence, false));
Assert.fail("Expect an IOException here.");
} catch (IOException ignore) {
// Here, we will get an IOException.
}
}
@Override
protected Writer createWriterInstance(Path path) throws IOException {
final Writer w = super.createWriterInstance(path);
return new Writer() {
@Override
public void close() throws IOException {
w.close();
}
@Override
public void sync(boolean forceSync) throws IOException {
throw new IOException("FAKE! Failed to replace a bad datanode...SYNC");
}
@Override
public void append(Entry entry) throws IOException {
w.append(entry);
}
@Override
public long getLength() {
return w.getLength();
}
@Override
public long getSyncedLength() {
return w.getSyncedLength();
}
};
}
}
// Mocked up server and regionserver services. Needed below.
RegionServerServices services = Mockito.mock(RegionServerServices.class);
Mockito.when(services.getConfiguration()).thenReturn(CONF);
Mockito.when(services.isStopped()).thenReturn(false);
Mockito.when(services.isAborted()).thenReturn(false);
// OK. Now I have my mocked up Server & RegionServerServices and dodgy WAL, go ahead with test.
FileSystem fs = FileSystem.get(CONF);
Path rootDir = new Path(dir + getName());
final DodgyFSLog dodgyWAL = new DodgyFSLog(fs, rootDir, getName(), CONF);
// I need a log roller running.
LogRoller logRoller = new LogRoller(services);
logRoller.addWAL(dodgyWAL);
// There is no 'stop' once a logRoller is running.. it just dies.
logRoller.start();
try {
final long seqForSync = dodgyWAL.getSequenceOnRingBuffer();
// This call provokes a WAL roll, and we will get a new RingBufferEventHandler.ZigZagLatch
// in LogRoller.
// After creating ZigZagLatch, RingBufferEventHandler would get stuck due to sync event,
// as long as HBASE-14317 hasn't be fixed.
LOG.info("Trigger log roll for creating a ZigZagLatch.");
logRoller.requestRollAll();
while (!dodgyWAL.zigZagCreated) {
Thread.sleep(10);
}
// Send a sync event for RingBufferEventHandler,
// and it gets blocked in RingBufferEventHandler.attainSafePoint
LOG.info("Send sync for RingBufferEventHandler");
Thread syncThread = new Thread() {
@Override
public void run() {
dodgyWAL.publishSyncOnRingBufferAndBlock(seqForSync);
}
};
// Sync in another thread to avoid reset SyncFuture again.
syncThread.start();
syncThread.join();
try {
LOG.info("Call sync for testing whether RingBufferEventHandler is hanging.");
dodgyWAL.sync(false); // Should not get a hang here, otherwise we will see timeout in this test.
Assert.fail("Expect an IOException here.");
} catch (IOException ignore) {
}
} finally {
// To stop logRoller, its server has to say it is stopped.
Mockito.when(services.isStopped()).thenReturn(true);
if (logRoller != null) {
logRoller.close();
}
if (dodgyWAL != null) {
dodgyWAL.close();
}
}
}
/**
* @return A region on which you must call {@link HBaseTestingUtility#closeRegionAndWAL(HRegion)}
* when done.
*/
private static HRegion initHRegion(TableName tableName, byte[] startKey, byte[] stopKey,
Configuration conf, WAL wal) throws IOException {
ChunkCreator.initialize(MemStoreLAB.CHUNK_SIZE_DEFAULT, false, 0, 0,
0, null, MemStoreLAB.INDEX_CHUNK_SIZE_PERCENTAGE_DEFAULT);
return TEST_UTIL.createLocalHRegion(tableName, startKey, stopKey, conf, false,
Durability.SYNC_WAL, wal, COLUMN_FAMILY_BYTES);
}
}