// blob: efa1fdb7cf705931d87d3b52c8b493ff39f4d52a [file] [log] [blame]
package org.apache.helix.integration.task;
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import java.util.HashMap;
import java.util.Map;
import com.google.common.collect.Sets;
import org.apache.helix.ConfigAccessor;
import org.apache.helix.TestHelper;
import org.apache.helix.integration.manager.ClusterControllerManager;
import org.apache.helix.integration.manager.MockParticipantManager;
import org.apache.helix.mock.statemodel.MockTaskStateModelFactory;
import org.apache.helix.model.ClusterConfig;
import org.apache.helix.model.MasterSlaveSMD;
import org.apache.helix.participant.StateMachineEngine;
import org.apache.helix.task.JobConfig;
import org.apache.helix.task.JobContext;
import org.apache.helix.task.Task;
import org.apache.helix.task.TaskCallbackContext;
import org.apache.helix.task.TaskFactory;
import org.apache.helix.task.TaskPartitionState;
import org.apache.helix.task.TaskState;
import org.apache.helix.task.TaskSynchronizedTestBase;
import org.apache.helix.task.TaskUtil;
import org.apache.helix.task.Workflow;
import org.apache.helix.task.WorkflowConfig;
import org.apache.helix.tools.ClusterVerifiers.BestPossibleExternalViewVerifier;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
/**
 * Integration test for job timeout when the job's tasks are assigned but never get to run.
 *
 * <p>A blocking workflow first occupies the single participant; a second workflow with two
 * dependent jobs, each carrying a timeout, is then submitted. The test expects both jobs to
 * reach {@link TaskState#TIMED_OUT}, the workflow to reach {@link TaskState#FAILED}, and every
 * task that was assigned to a participant to end in {@link TaskPartitionState#TASK_ABORTED}.
 */
public class TestJobTimeoutTaskNotStarted extends TaskSynchronizedTestBase {

  /**
   * Sets up a one-node, one-database cluster with 50 partitions, starts the participant and
   * the controller, enables state transition cancellation, and waits for the cluster to
   * converge.
   *
   * @throws Exception if any part of the cluster setup fails
   */
  @BeforeClass
  public void beforeClass() throws Exception {
    _numDbs = 1;
    _numNodes = 1;
    _numPartitions = 50;
    _numReplicas = 1;
    _participants = new MockParticipantManager[_numNodes];

    _gSetupTool.addCluster(CLUSTER_NAME, true);
    setupParticipants();
    setupDBs();
    startParticipantsWithStuckTaskStateModelFactory();
    createManagers();

    _controller = new ClusterControllerManager(ZK_ADDR, CLUSTER_NAME, CONTROLLER_PREFIX);
    _controller.syncStart();

    // Enable state transition cancellation and allow one concurrent task per partition on the
    // single instance.
    ConfigAccessor configAccessor = new ConfigAccessor(_gZkClient);
    ClusterConfig clusterConfig = configAccessor.getClusterConfig(CLUSTER_NAME);
    clusterConfig.stateTransitionCancelEnabled(true);
    clusterConfig.setMaxConcurrentTaskPerInstance(_numPartitions);
    configAccessor.setClusterConfig(CLUSTER_NAME, clusterConfig);

    _clusterVerifier =
        new BestPossibleExternalViewVerifier.Builder(CLUSTER_NAME).setZkClient(_gZkClient)
            .setWaitTillVerify(TestHelper.DEFAULT_REBALANCE_PROCESSING_WAIT_TIME)
            .build();
    Assert.assertTrue(_clusterVerifier.verifyByPolling(10000, 100));
  }

  /**
   * Starts every participant with a {@link MockTaskStateModelFactory} registered for the
   * "Task" state model, backed by a factory that creates {@link MockTask} instances.
   *
   * <p>NOTE(review): the method name implies the mock factory leaves task state transitions
   * stuck; that behavior lives in {@link MockTaskStateModelFactory} and is not visible here.
   */
  protected void startParticipantsWithStuckTaskStateModelFactory() {
    Map<String, TaskFactory> taskFactoryReg = new HashMap<String, TaskFactory>();
    taskFactoryReg.put(MockTask.TASK_COMMAND, new TaskFactory() {
      @Override
      public Task createNewTask(TaskCallbackContext context) {
        return new MockTask(context);
      }
    });

    // start dummy participants
    for (int i = 0; i < _numNodes; i++) {
      String instanceName = PARTICIPANT_PREFIX + "_" + (_startPort + i);
      _participants[i] = new MockParticipantManager(ZK_ADDR, CLUSTER_NAME, instanceName);

      // Register a Task state model factory.
      StateMachineEngine stateMachine = _participants[i].getStateMachineEngine();
      stateMachine.registerStateModelFactory("Task",
          new MockTaskStateModelFactory(_participants[i], taskFactoryReg));
      _participants[i].syncStart();
    }
  }

  /**
   * Verifies that jobs whose tasks cannot start still time out, and that their assigned tasks
   * are marked {@link TaskPartitionState#TASK_ABORTED}.
   *
   * @throws InterruptedException if interrupted while polling for job or workflow state
   */
  @Test
  public void testTaskNotStarted() throws InterruptedException {
    final String BLOCK_WORKFLOW_NAME = "blockWorkflow";
    final String TIMEOUT_WORKFLOW_NAME = "timeoutWorkflow";
    final String DB_NAME = WorkflowGenerator.DEFAULT_TGT_DB;
    final String TIMEOUT_JOB_1 = "timeoutJob1";
    final String TIMEOUT_JOB_2 = "timeoutJob2";

    // 50 blocking tasks
    JobConfig.Builder blockJobBuilder =
        new JobConfig.Builder().setWorkflow(BLOCK_WORKFLOW_NAME).setTargetResource(DB_NAME)
            .setTargetPartitionStates(Sets.newHashSet(MasterSlaveSMD.States.MASTER.name()))
            .setCommand(MockTask.TASK_COMMAND).setNumConcurrentTasksPerInstance(_numPartitions);

    Workflow.Builder blockWorkflowBuilder =
        new Workflow.Builder(BLOCK_WORKFLOW_NAME).addJob("blockJob", blockJobBuilder);
    _driver.start(blockWorkflowBuilder.build());

    // Only 40 of the 50 submitted tasks are expected to block; 40 is presumably the size of
    // the participant's task thread pool -- TODO confirm against the task state model factory.
    int numOfParticipantThreads = 40;
    Assert.assertTrue(TaskTestUtil.pollForAllTasksBlock(_manager.getHelixDataAccessor(),
        _participants[0].getInstanceName(), numOfParticipantThreads, 10000));

    // Now, the HelixTask threadpool is full and blocked by blockJob.
    // New tasks assigned to the instance won't be started at all.
    // 2 timeout jobs: the first one times out but does not prevent the second one from being
    // scheduled; the second one times out as well.
    JobConfig.Builder timeoutJobBuilder =
        new JobConfig.Builder().setWorkflow(TIMEOUT_WORKFLOW_NAME).setTargetResource(DB_NAME)
            .setTargetPartitionStates(Sets.newHashSet(MasterSlaveSMD.States.MASTER.name()))
            .setCommand(MockTask.TASK_COMMAND).setNumConcurrentTasksPerInstance(_numPartitions)
            // Wait a bit so that tasks are already assigned to the job (and will be cancelled)
            .setTimeout(3000);

    // With a failure threshold of 1 the workflow ignores the first job's timeout and still
    // schedules the second job. The second job times out too, so the workflow is expected to
    // end up FAILED (asserted below).
    WorkflowConfig.Builder timeoutWorkflowConfigBuilder =
        new WorkflowConfig.Builder(TIMEOUT_WORKFLOW_NAME).setFailureThreshold(1);

    Workflow.Builder timeoutWorkflowBuilder = new Workflow.Builder(TIMEOUT_WORKFLOW_NAME)
        .setWorkflowConfig(timeoutWorkflowConfigBuilder.build())
        .addJob(TIMEOUT_JOB_1, timeoutJobBuilder); // job 1 timeout, but won't block job 2

    // The builder is reused for job 2, so this flag must be set only after job 1 was added.
    timeoutJobBuilder.setIgnoreDependentJobFailure(true); // ignore first job's timeout
    timeoutWorkflowBuilder.addJob(TIMEOUT_JOB_2, timeoutJobBuilder) // job 2 also timeout
        .addParentChildDependency(TIMEOUT_JOB_1, TIMEOUT_JOB_2);

    _driver.start(timeoutWorkflowBuilder.build());

    _driver.pollForJobState(TIMEOUT_WORKFLOW_NAME,
        TaskUtil.getNamespacedJobName(TIMEOUT_WORKFLOW_NAME, TIMEOUT_JOB_1), TaskState.TIMED_OUT);
    _driver.pollForJobState(TIMEOUT_WORKFLOW_NAME,
        TaskUtil.getNamespacedJobName(TIMEOUT_WORKFLOW_NAME, TIMEOUT_JOB_2), TaskState.TIMED_OUT);
    _driver.pollForWorkflowState(TIMEOUT_WORKFLOW_NAME, TaskState.FAILED);

    assertAssignedTasksAborted(TIMEOUT_WORKFLOW_NAME, TIMEOUT_JOB_1);
    assertAssignedTasksAborted(TIMEOUT_WORKFLOW_NAME, TIMEOUT_JOB_2);
  }

  /**
   * Asserts that every partition of the given job that was assigned to a participant ended in
   * {@link TaskPartitionState#TASK_ABORTED}. Partitions without an assigned participant are
   * skipped.
   *
   * @param workflowName name of the workflow that owns the job
   * @param jobName un-namespaced job name within the workflow
   */
  private void assertAssignedTasksAborted(String workflowName, String jobName) {
    String namespacedJobName = TaskUtil.getNamespacedJobName(workflowName, jobName);
    JobContext jobContext = _driver.getJobContext(namespacedJobName);
    // Fail with a readable message instead of a NullPointerException in the loop below.
    Assert.assertNotNull(jobContext, "Job context not found for " + namespacedJobName);

    for (int pId : jobContext.getPartitionSet()) {
      if (jobContext.getAssignedParticipant(pId) != null) {
        // All tasks stuck at INIT->RUNNING, and state transition cancelled and marked
        // TASK_ABORTED
        Assert.assertEquals(jobContext.getPartitionState(pId), TaskPartitionState.TASK_ABORTED);
      }
    }
  }
}