// blob: 5db17c34cbc9aa7cf4a9c73a20160c84d20c7fd4 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tez.test;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.tez.client.TezClientUtils;
import org.apache.tez.client.TezClient;
import org.apache.tez.common.TezCommonUtils;
import org.apache.tez.dag.api.DAG;
import org.apache.tez.dag.api.DataSourceDescriptor;
import org.apache.tez.dag.api.InputDescriptor;
import org.apache.tez.dag.api.InputInitializerDescriptor;
import org.apache.tez.dag.api.TezConfiguration;
import org.apache.tez.dag.api.TezConstants;
import org.apache.tez.dag.api.TezException;
import org.apache.tez.dag.api.client.DAGClient;
import org.apache.tez.dag.api.client.DAGStatus;
import org.apache.tez.dag.api.client.DAGStatus.State;
import org.apache.tez.dag.app.RecoveryParser;
import org.apache.tez.dag.history.HistoryEvent;
import org.apache.tez.dag.history.HistoryEventType;
import org.apache.tez.dag.history.events.VertexInitializedEvent;
import org.apache.tez.dag.history.events.VertexRecoverableEventsGeneratedEvent;
import org.apache.tez.test.dag.MultiAttemptDAG;
import org.apache.tez.test.dag.MultiAttemptDAG.FailingInputInitializer;
import org.apache.tez.test.dag.MultiAttemptDAG.NoOpInput;
import org.apache.tez.test.dag.MultiAttemptDAG.TestRootInputInitializer;
import org.apache.tez.test.dag.SimpleVTestDAG;
import org.junit.After;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import java.io.IOException;
import java.util.List;
import java.util.Random;
/**
 * Runs DAGs in a Tez session on top of a mini DFS cluster and a
 * {@link MiniTezCluster}, then inspects the DAG recovery files written for
 * the individual application attempts.
 *
 * <p>The clusters are started once per class; a fresh Tez session (with its
 * own staging directory) is started before each test and stopped after it.
 */
public class TestDAGRecovery {

  private static final Log LOG = LogFactory.getLog(TestDAGRecovery.class);

  /** Delay between two DAG status polls; also reported in the wait log line. */
  private static final long DAG_STATUS_POLL_INTERVAL_MS = 100;

  private static final Configuration conf = new Configuration();
  private static MiniTezCluster miniTezCluster = null;
  private static final String TEST_ROOT_DIR = "target" + Path.SEPARATOR
      + TestDAGRecovery.class.getName() + "-tmpDir";
  private static MiniDFSCluster dfsCluster = null;
  private static TezClient tezSession = null;
  private static FileSystem remoteFs = null;
  private static TezConfiguration tezConf = null;

  /**
   * Starts a 3-datanode mini DFS cluster and, if not already running, a mini
   * Tez cluster whose default file system is that DFS instance.
   *
   * @throws Exception if the Tez cluster cannot be initialized or started;
   *         a DFS start-up {@link IOException} is rethrown as a
   *         {@link RuntimeException}
   */
  @BeforeClass
  public static void beforeClass() throws Exception {
    LOG.info("Starting mini clusters");
    try {
      conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, TEST_ROOT_DIR);
      dfsCluster = new MiniDFSCluster.Builder(conf).numDataNodes(3)
          .format(true).racks(null).build();
      remoteFs = dfsCluster.getFileSystem();
    } catch (IOException io) {
      throw new RuntimeException("problem starting mini dfs cluster", io);
    }

    if (miniTezCluster == null) {
      miniTezCluster = new MiniTezCluster(TestDAGRecovery.class.getName(),
          1, 1, 1);
      Configuration miniTezconf = new Configuration(conf);
      // Allow up to 4 AM attempts so the application can be re-attempted.
      miniTezconf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 4);
      miniTezconf.set("fs.defaultFS", remoteFs.getUri().toString()); // use HDFS
      miniTezCluster.init(miniTezconf);
      miniTezCluster.start();
    }
  }

  /**
   * Best-effort shutdown of the session and both mini clusters. A failure in
   * one step is logged and does not prevent the remaining steps from running.
   */
  @AfterClass
  public static void afterClass() throws InterruptedException {
    stopSession();
    if (miniTezCluster != null) {
      try {
        LOG.info("Stopping MiniTezCluster");
        miniTezCluster.stop();
      } catch (Exception e) {
        LOG.warn("Failed to stop MiniTezCluster", e);
      }
    }
    if (dfsCluster != null) {
      try {
        LOG.info("Stopping DFSCluster");
        dfsCluster.shutdown();
      } catch (Exception e) {
        LOG.warn("Failed to stop DFSCluster", e);
      }
    }
  }

  /**
   * Creates a randomly named staging directory on the remote file system and
   * starts a new Tez session configured to use it.
   *
   * @throws Exception if the staging directory or the session cannot be set up
   */
  @Before
  public void setup() throws Exception {
    LOG.info("Starting session");
    Path remoteStagingDir = remoteFs.makeQualified(new Path(TEST_ROOT_DIR, String
        .valueOf(new Random().nextInt(100000))));
    TezClientUtils.ensureStagingDirExists(conf, remoteStagingDir);

    tezConf = new TezConfiguration(miniTezCluster.getConfig());
    tezConf.setInt(TezConfiguration.DAG_RECOVERY_MAX_UNFLUSHED_EVENTS, 0);
    tezConf.set(TezConfiguration.TEZ_AM_LOG_LEVEL, "DEBUG");
    tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR,
        remoteStagingDir.toString());
    tezConf.setBoolean(TezConfiguration.TEZ_AM_NODE_BLACKLISTING_ENABLED, false);
    tezConf.setInt(TezConfiguration.TEZ_AM_MAX_APP_ATTEMPTS, 4);
    tezConf.setInt(TezConfiguration.TEZ_AM_RESOURCE_MEMORY_MB, 500);
    tezConf.set(TezConfiguration.TEZ_AM_LAUNCH_CMD_OPTS, " -Xmx256m");
    tezConf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, true);
    // Keep the staging data so verifyRecoveryLog() can read the recovery
    // files after the DAG has finished.
    tezConf.set(TezConfiguration.TEZ_AM_STAGING_SCRATCH_DATA_AUTO_DELETE, "false");

    tezSession = TezClient.create("TestDAGRecovery", tezConf);
    tezSession.start();
  }

  /** Stops the per-test session, if any, and clears the reference. */
  @After
  public void teardown() throws InterruptedException {
    stopSession();
  }

  /**
   * Stops the current Tez session if one exists. Failures are logged rather
   * than propagated, and the session reference is always cleared afterwards.
   */
  private static void stopSession() {
    if (tezSession == null) {
      return;
    }
    try {
      LOG.info("Stopping Tez Session");
      tezSession.stop();
    } catch (Exception e) {
      LOG.warn("Failed to stop Tez Session", e);
    } finally {
      tezSession = null;
    }
  }

  /**
   * Submits the DAG to the current session, polls until it completes and
   * asserts that it ended in the expected state.
   *
   * @param dag the DAG to submit
   * @param finalState the state the DAG is expected to finish in
   * @throws Exception if submission or status retrieval fails, or the
   *         polling thread is interrupted
   */
  void runDAGAndVerify(DAG dag, DAGStatus.State finalState) throws Exception {
    tezSession.waitTillReady();
    DAGClient dagClient = tezSession.submitDAG(dag);
    DAGStatus dagStatus = dagClient.getDAGStatus(null);
    while (!dagStatus.isCompleted()) {
      LOG.info("Waiting for dag to complete. Sleeping for "
          + DAG_STATUS_POLL_INTERVAL_MS + "ms."
          + " DAG name: " + dag.getName()
          + " DAG appContext: " + dagClient.getExecutionContext()
          + " Current state: " + dagStatus.getState());
      Thread.sleep(DAG_STATUS_POLL_INTERVAL_MS);
      dagStatus = dagClient.getDAGStatus(null);
    }

    Assert.assertEquals(finalState, dagStatus.getState());
  }

  /**
   * Parses the recovery file of the first DAG for attempts 1 through 3 and
   * asserts, for each attempt, that the vertex with id 0 (called v1 in the
   * assertion messages) has a VERTEX_DATA_MOVEMENT_EVENTS_GENERATED event
   * logged before its VERTEX_INITIALIZED event.
   *
   * @throws IOException if a recovery file cannot be opened or parsed
   */
  private void verifyRecoveryLog() throws IOException {
    ApplicationId appId = tezSession.getAppMasterApplicationId();
    Path tezSystemStagingDir =
        TezCommonUtils.getTezSystemStagingPath(tezConf, appId.toString());
    Path recoveryDataDir = TezCommonUtils.getRecoveryPath(tezSystemStagingDir, tezConf);
    FileSystem fs = tezSystemStagingDir.getFileSystem(tezConf);

    for (int attempt = 1; attempt <= 3; ++attempt) {
      Path currentAttemptRecoveryDataDir =
          TezCommonUtils.getAttemptRecoveryPath(recoveryDataDir, attempt);
      // The recovery file is named after the DAG id, which is derived here
      // from the application id with "_1" appended for the first DAG.
      Path recoveryFilePath = new Path(currentAttemptRecoveryDataDir,
          appId.toString().replace("application", "dag") + "_1"
              + TezConstants.DAG_RECOVERY_RECOVER_FILE_SUFFIX);
      // NOTE(review): the stream returned by fs.open() is never closed in this
      // method; confirm whether parseDAGRecoveryFile closes it, otherwise it
      // should be closed here.
      List<HistoryEvent> historyEvents = RecoveryParser.parseDAGRecoveryFile(
          fs.open(recoveryFilePath));

      int inputInfoEventIndex = -1;
      int vertexInitedEventIndex = -1;
      for (int j = 0; j < historyEvents.size(); ++j) {
        HistoryEvent historyEvent = historyEvents.get(j);
        LOG.info("Parsed event from recovery stream"
            + ", eventType=" + historyEvent.getEventType()
            + ", event=" + historyEvent);
        if (historyEvent.getEventType() == HistoryEventType.VERTEX_DATA_MOVEMENT_EVENTS_GENERATED) {
          VertexRecoverableEventsGeneratedEvent dmEvent =
              (VertexRecoverableEventsGeneratedEvent) historyEvent;
          // Only the first matching event is recorded.
          // TODO do not need to check whether it is -1 after TEZ-1521 is resolved
          if (dmEvent.getVertexID().getId() == 0 && inputInfoEventIndex == -1) {
            inputInfoEventIndex = j;
          }
        }
        if (historyEvent.getEventType() == HistoryEventType.VERTEX_INITIALIZED) {
          VertexInitializedEvent vInitedEvent = (VertexInitializedEvent) historyEvent;
          if (vInitedEvent.getVertexID().getId() == 0) {
            vertexInitedEventIndex = j;
          }
        }
      }

      // v1's init events must be logged before its VertexInitializedEvent (TEZ-1345)
      Assert.assertTrue("can not find VERTEX_DATA_MOVEMENT_EVENTS_GENERATED for v1",
          inputInfoEventIndex != -1);
      Assert.assertTrue("can not find VERTEX_INITIALIZED for v1",
          vertexInitedEventIndex != -1);
      Assert.assertTrue(
          "VERTEX_DATA_MOVEMENT_EVENTS_GENERATED should be logged before VERTEX_INITIALIZED for v1",
          inputInfoEventIndex < vertexInitedEventIndex);
    }
  }

  /**
   * Runs the multi-attempt DAG to success, verifies the recovery logs and
   * checks that re-submitting the same DAG is rejected as a duplicate.
   */
  @Test(timeout=120000)
  public void testBasicRecovery() throws Exception {
    DAG dag = MultiAttemptDAG.createDAG("TestBasicRecovery", null);
    // add input to v1 to make sure that there will be init events for v1 (TEZ-1345)
    DataSourceDescriptor dataSource =
        DataSourceDescriptor.create(InputDescriptor.create(NoOpInput.class.getName()),
            InputInitializerDescriptor.create(TestRootInputInitializer.class.getName()), null);
    dag.getVertex("v1").addDataSource("Input", dataSource);

    runDAGAndVerify(dag, DAGStatus.State.SUCCEEDED);

    verifyRecoveryLog();

    // it should fail if submitting same dags in recovery mode (TEZ-1064)
    try {
      tezSession.submitDAG(dag);
      Assert.fail("Expected DAG submit to fail on duplicate dag name");
    } catch (TezException e) {
      Assert.assertTrue(e.getMessage().contains("Duplicate dag name"));
    }
  }

  /**
   * Runs a simple DAG whose v1 vertex has a data source backed by
   * {@link FailingInputInitializer} and expects the DAG to succeed.
   */
  @Test(timeout=120000)
  public void testDelayedInit() throws Exception {
    DAG dag = SimpleVTestDAG.createDAG("DelayedInitDAG", null);
    dag.getVertex("v1").addDataSource(
        "i1",
        DataSourceDescriptor.create(
            InputDescriptor.create(NoOpInput.class.getName()),
            InputInitializerDescriptor.create(FailingInputInitializer.class
                .getName()), null));
    runDAGAndVerify(dag, State.SUCCEEDED);
  }
}