blob: e5e05b8d2aad84fd7012d79a627c68ce6d999d59 [file] [log] [blame]
package org.apache.helix.integration.task;
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import org.apache.helix.ConfigAccessor;
import org.apache.helix.HelixException;
import org.apache.helix.TestHelper;
import org.apache.helix.controller.rebalancer.strategy.CrushEdRebalanceStrategy;
import org.apache.helix.integration.manager.ClusterControllerManager;
import org.apache.helix.integration.manager.MockParticipantManager;
import org.apache.helix.mock.statemodel.MockTaskStateModelFactory;
import org.apache.helix.model.ClusterConfig;
import org.apache.helix.model.IdealState;
import org.apache.helix.model.MasterSlaveSMD;
import org.apache.helix.participant.StateMachineEngine;
import org.apache.helix.task.JobConfig;
import org.apache.helix.task.JobContext;
import org.apache.helix.task.Task;
import org.apache.helix.task.TaskCallbackContext;
import org.apache.helix.task.TaskFactory;
import org.apache.helix.task.TaskPartitionState;
import org.apache.helix.task.TaskResult;
import org.apache.helix.task.TaskState;
import org.apache.helix.task.TaskStateModelFactory;
import org.apache.helix.task.TaskSynchronizedTestBase;
import org.apache.helix.task.TaskUtil;
import org.apache.helix.task.Workflow;
import org.apache.helix.tools.ClusterVerifiers.BestPossibleExternalViewVerifier;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
/**
 * Integration test for a job that fails while some of its tasks never get started.
 *
 * <p>Two participants are started. The first one ("blocked") registers a
 * {@link MockTaskStateModelFactory} for the "Task" state model; as described by the comments in
 * {@link #testTaskNotStarted()}, it is expected to get stuck on the INIT->RUNNING transition. The
 * second one ("normal") registers the regular {@link TaskStateModelFactory}.
 *
 * <p>The test first saturates the blocked participant with a job targeting a resource whose
 * partitions all live on that participant, then submits a second job configured to fail and
 * checks the per-partition task states recorded in the job context.
 */
public class TestJobFailureTaskNotStarted extends TaskSynchronizedTestBase {
  private static final String DB_NAME = WorkflowGenerator.DEFAULT_TGT_DB;
  private static final String UNBALANCED_DB_NAME = "UnbalancedDB";

  /** Participant that registers the mock (stuck) task state model factory. */
  private MockParticipantManager _blockedParticipant;
  /** Participant that registers the regular task state model factory. */
  private MockParticipantManager _normalParticipant;

  /**
   * Creates the cluster, its participants and resources, starts the controller, enables state
   * transition cancellation in the cluster config and builds the cluster verifier.
   *
   * @throws Exception if any part of the cluster setup fails
   */
  @BeforeClass
  public void beforeClass() throws Exception {
    _numDbs = 1;
    _numNodes = 2;
    _numPartitions = 2;
    _numReplicas = 1;
    // Allocate only after _numNodes has been set, so that the array length matches the number of
    // participants this test actually starts rather than the value _numNodes held beforehand.
    _participants = new MockParticipantManager[_numNodes];

    _gSetupTool.addCluster(CLUSTER_NAME, true);
    setupParticipants();
    setupDBs();
    startParticipantsWithStuckTaskStateModelFactory();
    createManagers();

    _controller = new ClusterControllerManager(ZK_ADDR, CLUSTER_NAME, CONTROLLER_PREFIX);
    _controller.syncStart();

    // Enable cancellation
    ConfigAccessor configAccessor = new ConfigAccessor(_gZkClient);
    ClusterConfig clusterConfig = configAccessor.getClusterConfig(CLUSTER_NAME);
    clusterConfig.stateTransitionCancelEnabled(true);
    configAccessor.setClusterConfig(CLUSTER_NAME, clusterConfig);

    _clusterVerifier =
        new BestPossibleExternalViewVerifier.Builder(CLUSTER_NAME)
            .setZkClient(_gZkClient)
            .setWaitTillVerify(TestHelper.DEFAULT_REBALANCE_PROCESSING_WAIT_TIME)
            .build();
  }

  /**
   * Starts the two participants of this test.
   *
   * <p>Both share the same task factory registry, which maps {@link MockTask#TASK_COMMAND} to a
   * factory producing {@link MockTask} instances. The first instance in the cluster gets a
   * {@link MockTaskStateModelFactory} and becomes the blocked participant; the second gets a
   * {@link TaskStateModelFactory} and becomes the normal participant.
   */
  protected void startParticipantsWithStuckTaskStateModelFactory() {
    Map<String, TaskFactory> taskFactoryReg = new HashMap<String, TaskFactory>();
    taskFactoryReg.put(MockTask.TASK_COMMAND, new TaskFactory() {
      @Override
      public Task createNewTask(TaskCallbackContext context) {
        return new MockTask(context);
      }
    });

    List<String> instances =
        _gSetupTool.getClusterManagementTool().getInstancesInCluster(CLUSTER_NAME);

    // Blocked participant: uses the mock task state model factory.
    _participants[0] = new MockParticipantManager(ZK_ADDR, CLUSTER_NAME, instances.get(0));
    StateMachineEngine stateMachine = _participants[0].getStateMachineEngine();
    stateMachine.registerStateModelFactory("Task",
        new MockTaskStateModelFactory(_participants[0], taskFactoryReg));
    _participants[0].syncStart();
    _blockedParticipant = _participants[0];

    // Normal participant: uses the regular task state model factory.
    _participants[1] = new MockParticipantManager(ZK_ADDR, CLUSTER_NAME, instances.get(1));
    stateMachine = _participants[1].getStateMachineEngine();
    stateMachine.registerStateModelFactory("Task",
        new TaskStateModelFactory(_participants[1], taskFactoryReg));
    _participants[1].syncStart();
    _normalParticipant = _participants[1];
  }

  /**
   * Verifies the task states of a failed job when one participant cannot start new tasks.
   *
   * <p>Expectation per assigned partition of the failing job: {@code TASK_ABORTED} on the blocked
   * participant and {@code TASK_ERROR} on the normal participant. Partitions without an assigned
   * participant are skipped.
   *
   * @throws InterruptedException if the thread is interrupted while waiting on the cluster
   */
  @Test
  public void testTaskNotStarted() throws InterruptedException {
    setupUnbalancedDB();

    final String BLOCK_WORKFLOW_NAME = "blockWorkflow";
    final String FAIL_WORKFLOW_NAME = "failWorkflow";
    final String FAIL_JOB_NAME = "failJob";

    ConfigAccessor configAccessor = new ConfigAccessor(_gZkClient);
    final int numTask =
        configAccessor.getClusterConfig(CLUSTER_NAME).getMaxConcurrentTaskPerInstance();

    // Tasks targeting the unbalanced DB. The instance hosting it is set up to get stuck on
    // INIT->RUNNING, so these tasks take up all task threads on that instance.
    JobConfig.Builder blockJobBuilder = new JobConfig.Builder().setWorkflow(BLOCK_WORKFLOW_NAME)
        .setTargetResource(UNBALANCED_DB_NAME)
        .setTargetPartitionStates(Sets.newHashSet(MasterSlaveSMD.States.MASTER.name()))
        .setCommand(MockTask.TASK_COMMAND).setNumConcurrentTasksPerInstance(numTask);

    Workflow.Builder blockWorkflowBuilder =
        new Workflow.Builder(BLOCK_WORKFLOW_NAME).addJob("blockJob", blockJobBuilder);
    _driver.start(blockWorkflowBuilder.build());

    Assert.assertTrue(TaskTestUtil.pollForAllTasksBlock(_manager.getHelixDataAccessor(),
        _blockedParticipant.getInstanceName(), numTask, 10000));

    // Now all HelixTask threads are stuck at INIT->RUNNING for the task state transition (the
    // user task can't be submitted). New tasks assigned to the instance won't start the
    // INIT->RUNNING transition at all.

    // A to-be-failed job with 2 tasks: 1 stuck and 1 failing, which makes the job fail.
    JobConfig.Builder failJobBuilder =
        new JobConfig.Builder().setWorkflow(FAIL_WORKFLOW_NAME).setTargetResource(DB_NAME)
            .setTargetPartitionStates(Sets.newHashSet(MasterSlaveSMD.States.MASTER.name()))
            .setCommand(MockTask.TASK_COMMAND).setJobCommandConfigMap(
                ImmutableMap.of(MockTask.TASK_RESULT_STATUS, TaskResult.Status.FAILED.name()));

    Workflow.Builder failWorkflowBuilder =
        new Workflow.Builder(FAIL_WORKFLOW_NAME).addJob(FAIL_JOB_NAME, failJobBuilder);
    _driver.start(failWorkflowBuilder.build());

    _driver.pollForJobState(FAIL_WORKFLOW_NAME,
        TaskUtil.getNamespacedJobName(FAIL_WORKFLOW_NAME, FAIL_JOB_NAME), TaskState.FAILED);
    _driver.pollForWorkflowState(FAIL_WORKFLOW_NAME, TaskState.FAILED);

    JobContext jobContext =
        _driver.getJobContext(TaskUtil.getNamespacedJobName(FAIL_WORKFLOW_NAME, FAIL_JOB_NAME));
    for (int pId : jobContext.getPartitionSet()) {
      String assignedParticipant = jobContext.getAssignedParticipant(pId);
      if (assignedParticipant == null) {
        continue; // May not have been assigned at all due to quota limitations
      }
      // Reuse the participant already looked up above instead of querying the context again.
      if (assignedParticipant.equals(_blockedParticipant.getInstanceName())) {
        Assert.assertEquals(jobContext.getPartitionState(pId), TaskPartitionState.TASK_ABORTED);
      } else if (assignedParticipant.equals(_normalParticipant.getInstanceName())) {
        Assert.assertEquals(jobContext.getPartitionState(pId), TaskPartitionState.TASK_ERROR);
      } else {
        throw new HelixException("There should be only 2 instances, 1 blocked, 1 normal.");
      }
    }
  }

  /**
   * Creates the {@code UnbalancedDB} resource with 50 partitions and pins every partition to the
   * blocked participant, then waits for the cluster verifier to pass.
   *
   * @throws InterruptedException if the thread is interrupted while waiting on the cluster
   */
  private void setupUnbalancedDB() throws InterruptedException {
    // Start with Full-Auto mode to create the partitions, Semi-Auto won't create partitions.
    _gSetupTool.addResourceToCluster(CLUSTER_NAME, UNBALANCED_DB_NAME, 50,
        MASTER_SLAVE_STATE_MODEL, IdealState.RebalanceMode.FULL_AUTO.name(),
        CrushEdRebalanceStrategy.class.getName());
    _gSetupTool.rebalanceStorageCluster(CLUSTER_NAME, UNBALANCED_DB_NAME, 1);

    // Set preference list to put all partitions to one instance.
    IdealState idealState = _gSetupTool.getClusterManagementTool()
        .getResourceIdealState(CLUSTER_NAME, UNBALANCED_DB_NAME);
    Set<String> partitions = idealState.getPartitionSet();
    for (String partition : partitions) {
      idealState.setPreferenceList(partition,
          Lists.newArrayList(_blockedParticipant.getInstanceName()));
    }
    idealState.setRebalanceMode(IdealState.RebalanceMode.SEMI_AUTO);
    _gSetupTool.getClusterManagementTool().setResourceIdealState(CLUSTER_NAME, UNBALANCED_DB_NAME,
        idealState);

    Assert.assertTrue(_clusterVerifier.verifyByPolling(10000, 100));
  }
}