package org.apache.helix.task;
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.helix.HelixDefinedState;
import org.apache.helix.HelixManager;
import org.apache.helix.common.caches.TaskDataCache;
import org.apache.helix.controller.dataproviders.BaseControllerDataProvider;
import org.apache.helix.controller.dataproviders.WorkflowControllerDataProvider;
import org.apache.helix.controller.pipeline.AbstractBaseStage;
import org.apache.helix.controller.rebalancer.util.RebalanceScheduler;
import org.apache.helix.controller.stages.BestPossibleStateOutput;
import org.apache.helix.controller.stages.CurrentStateOutput;
import org.apache.helix.model.InstanceConfig;
import org.apache.helix.model.Message;
import org.apache.helix.model.Partition;
import org.apache.helix.model.ResourceAssignment;
import org.apache.helix.monitoring.mbeans.ClusterStatusMonitor;
import org.apache.helix.monitoring.mbeans.JobMonitor;
import org.apache.helix.task.assigner.AssignableInstance;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public abstract class AbstractTaskDispatcher {
private static final Logger LOG = LoggerFactory.getLogger(AbstractTaskDispatcher.class);
private static final String TASK_LATENCY_TAG = "Latency";
// For connection management
protected HelixManager _manager;
protected static RebalanceScheduler _rebalanceScheduler = new RebalanceScheduler();
protected ClusterStatusMonitor _clusterStatusMonitor;
public void init(HelixManager manager) {
_manager = manager;
}
// Job Update related methods
public void updatePreviousAssignedTasksStatus(
Map<String, SortedSet<Integer>> currentInstanceToTaskAssignments,
Set<String> excludedInstances, String jobResource, CurrentStateOutput currStateOutput,
JobContext jobCtx, JobConfig jobCfg, TaskState jobState,
Map<String, Set<Integer>> assignedPartitions, Set<Integer> partitionsToDropFromIs,
Map<Integer, PartitionAssignment> paMap, TargetState jobTgtState,
Set<Integer> skippedPartitions, WorkflowControllerDataProvider cache,
Map<String, Set<Integer>> tasksToDrop) {
// If a job is in one of the following states and its tasks are in RUNNING states, the tasks
// will be aborted.
Set<TaskState> jobStatesForAbortingTasks =
new HashSet<>(Arrays.asList(TaskState.TIMING_OUT, TaskState.TIMED_OUT, TaskState.FAILING,
TaskState.FAILED, TaskState.ABORTED));
// Get AssignableInstanceMap for releasing resources for tasks in terminal states
AssignableInstanceManager assignableInstanceManager = cache.getAssignableInstanceManager();
// Iterate through all instances
for (String instance : currentInstanceToTaskAssignments.keySet()) {
assignedPartitions.put(instance, new HashSet<>());
// Set all dropping transitions first. These are tasks coming from Participant disconnects
// and have the requestedState of DROPPED.
// These need to be prioritized over any other state transitions because of the race condition
// with the same pId (task) running on other instances. This is because in paMap, we can only
// define one transition per pId
if (tasksToDrop.containsKey(instance)) {
for (int pIdToDrop : tasksToDrop.get(instance)) {
paMap.put(pIdToDrop,
new PartitionAssignment(instance, TaskPartitionState.DROPPED.name()));
assignedPartitions.get(instance).add(pIdToDrop);
}
}
if (excludedInstances.contains(instance)) {
continue;
}
// For instances that are not excluded, get the set of task partitions currently assigned
Set<Integer> pSet = currentInstanceToTaskAssignments.get(instance);
// We need to remove all task pId's to be dropped because we already made an assignment in
// paMap above for them to be dropped. The following does this.
if (tasksToDrop.containsKey(instance)) {
pSet.removeAll(tasksToDrop.get(instance));
}
// Used to keep track of partitions that are in either INIT or DROPPED states
Set<Integer> donePartitions = new TreeSet<>();
for (int pId : pSet) {
final String pName = pName(jobResource, pId);
TaskPartitionState currState = updateJobContextAndGetTaskCurrentState(currStateOutput,
jobResource, pId, pName, instance, jobCtx, jobTgtState);
if (!instance.equals(jobCtx.getAssignedParticipant(pId))) {
LOG.warn(
"Instance {} does not match the assigned participant for pId {} in the job context. Skipping task scheduling.",
instance, pId);
continue;
}
// Check for pending state transitions on this (partition, instance). If there is a pending
// state transition, we prioritize this pending state transition and set the assignment from
// this pending state transition, essentially "waiting" until this pending message clears
Message pendingMessage =
currStateOutput.getPendingMessage(jobResource, new Partition(pName), instance);
if (pendingMessage != null && !pendingMessage.getToState().equals(currState.name())) {
// If there is a pending message whose destination state is different from the current
// state, just make the same assignment as the pending message. This is essentially
// "waiting" until this state transition is complete
processTaskWithPendingMessage(pId, pName, instance, pendingMessage, jobState, currState,
paMap, assignedPartitions);
continue;
}
// Get AssignableInstance for this instance and TaskConfig for releasing resources
String quotaType = jobCfg.getJobType();
String taskId;
if (TaskUtil.isGenericTaskJob(jobCfg)) {
taskId = jobCtx.getTaskIdForPartition(pId);
} else {
taskId = pName;
}
TaskConfig taskConfig = jobCfg.getTaskConfig(taskId);
// Process any requested state transitions. If there is a requested state transition, just
// "wait" until this state transition is complete
String requestedStateStr =
currStateOutput.getRequestedState(jobResource, new Partition(pName), instance);
if (requestedStateStr != null && !requestedStateStr.isEmpty()) {
TaskPartitionState requestedState = TaskPartitionState.valueOf(requestedStateStr);
if (requestedState.equals(currState)) {
LOG.warn(String.format(
"Requested state %s is the same as the current state for instance %s.",
requestedState, instance));
}
// For STOPPED tasks, if the targetState is STOP, we should not honor the requestedState
// transition and should treat it as a NOP
if (currState == TaskPartitionState.STOPPED && jobTgtState == TargetState.STOP) {
// This task is STOPPED and not going to be re-run, so release this task
assignableInstanceManager.release(instance, taskConfig, quotaType);
continue;
}
// This contains check is necessary because we have already traversed tasksToDrop at the
// beginning of this method. If we already have a dropping transition, we do not want to
// overwrite it. Any other requestedState transitions (for example, INIT to RUNNING or
// RUNNING to COMPLETED) can wait without affecting correctness - they will be picked up
// in ensuing runs of the Task pipeline
if (!paMap.containsKey(pId)) {
paMap.put(pId, new PartitionAssignment(instance, requestedState.name()));
}
assignedPartitions.get(instance).add(pId);
if (LOG.isDebugEnabled()) {
LOG.debug(
String.format("Instance %s requested a state transition to %s for partition %s.",
instance, requestedState, pName));
}
continue;
}
switch (currState) {
case RUNNING: {
TaskPartitionState nextState = TaskPartitionState.RUNNING;
if (jobStatesForAbortingTasks.contains(jobState)) {
nextState = TaskPartitionState.TASK_ABORTED;
} else if (jobTgtState == TargetState.STOP) {
nextState = TaskPartitionState.STOPPED;
}
paMap.put(pId, new PartitionAssignment(instance, nextState.name()));
assignedPartitions.get(instance).add(pId);
if (LOG.isDebugEnabled()) {
LOG.debug(String.format("Setting task partition %s state to %s on instance %s.", pName,
nextState, instance));
}
}
break;
case STOPPED: {
// TODO: This case statement might be unreachable code - Hunter
// This code may need to be removed because once a task is STOPPED and its workflow's
// targetState is STOP, we do not assign that stopped task. Not assigning means it will
// not be included in the previousAssignment map in the next rebalance. If it is not in
// currentInstanceToTaskAssignments, it will never hit this part of the code
// When the parent workflow is to be resumed (target state is START), then it will just be
// assigned as if it were being assigned for the first time
TaskPartitionState nextState;
if (jobTgtState.equals(TargetState.START)) {
nextState = TaskPartitionState.RUNNING;
} else {
nextState = TaskPartitionState.STOPPED;
// This task is STOPPED and not going to be re-run, so release this task
assignableInstanceManager.release(instance, taskConfig, quotaType);
}
paMap.put(pId, new PartitionAssignment(instance, nextState.name()));
assignedPartitions.get(instance).add(pId);
if (LOG.isDebugEnabled()) {
LOG.debug(String.format("Setting task partition %s state to %s on instance %s.", pName,
nextState, instance));
}
}
break;
case COMPLETED: {
// The task has completed on this partition. Drop it from the instance and add it to assignedPartitions in
// order to avoid scheduling it again in this pipeline.
assignedPartitions.get(instance).add(pId);
paMap.put(pId, new PartitionAssignment(instance, TaskPartitionState.DROPPED.name()));
if (LOG.isDebugEnabled()) {
LOG.debug(String.format(
"Task partition %s has completed with state %s. Marking as such in rebalancer context.",
pName, currState));
}
partitionsToDropFromIs.add(pId);
// This task is COMPLETED, so release this task
assignableInstanceManager.release(instance, taskConfig, quotaType);
}
break;
case TIMED_OUT:
case TASK_ERROR:
case TASK_ABORTED:
case ERROR: {
// First, mark this task, which is in a terminal state, to be dropped.
// Later on, in the next pipeline run, the task will be retried in handleAdditionalTaskAssignment
// if possible (meaning it is not ABORTED and the max number of attempts has not been reached yet)
assignedPartitions.get(instance).add(pId);
paMap.put(pId, new PartitionAssignment(instance, TaskPartitionState.DROPPED.name()));
if (LOG.isDebugEnabled()) {
LOG.debug(String.format(
"Task partition %s has error state %s with msg %s. Marking as such in rebalancer context.",
pName, currState, jobCtx.getPartitionInfo(pId)));
}
// The error policy is to fail the task as soon as a single partition fails for a specified
// maximum number of attempts or the task is in ABORTED state.
// Note that if the job is TIMED_OUT, an aborted task won't be treated as a failure and won't
// cause the job to fail.
// After all tasks are aborted, they will be dropped because of the job timeout.
if (jobState != TaskState.TIMED_OUT && jobState != TaskState.TIMING_OUT) {
if (jobCtx.getPartitionNumAttempts(pId) >= jobCfg.getMaxAttemptsPerTask()
|| currState.equals(TaskPartitionState.TASK_ABORTED)
|| currState.equals(TaskPartitionState.ERROR)) {
skippedPartitions.add(pId);
partitionsToDropFromIs.add(pId);
if (LOG.isDebugEnabled()) {
LOG.debug("skippedPartitions:" + skippedPartitions);
}
} else {
// Mark the task to be started at some later time (if enabled)
markPartitionDelayed(jobCfg, jobCtx, pId);
}
}
// Release this task
assignableInstanceManager.release(instance, taskConfig, quotaType);
}
break;
case INIT: {
// INIT is a temporary state for tasks
// Two possible scenarios for INIT:
// 1. Task is getting scheduled for the first time. In this case, Task's state will go
// from null->INIT->RUNNING, and this INIT state will be transient and very short-lived
// 2. Task is getting scheduled for the first time, but in this case, job is timed out or
// timing out. In this case, it will be sent back to INIT state to be removed. Here we
// ensure that this task then goes from INIT to DROPPED so that it will be released from
// AssignableInstance to prevent resource leak
if (jobState == TaskState.TIMED_OUT || jobState == TaskState.TIMING_OUT
|| jobTgtState == TargetState.DELETE) {
// Job is timed out or timing out or targetState is to be deleted, so its tasks will be
// sent back to INIT
// In this case, tasks' IdealState will be removed, and they will be sent to DROPPED
partitionsToDropFromIs.add(pId);
assignedPartitions.get(instance).add(pId);
paMap.put(pId, new PartitionAssignment(instance, TaskPartitionState.DROPPED.name()));
// Also release resources for these tasks
assignableInstanceManager.release(instance, taskConfig, quotaType);
break;
} else if (jobState == TaskState.IN_PROGRESS
&& (jobTgtState != TargetState.STOP && jobTgtState != TargetState.DELETE)) {
// Job is in progress, implying that tasks are being re-tried, so set it to RUNNING
paMap.put(pId,
new PartitionAssignment(instance, TaskPartitionState.RUNNING.name()));
assignedPartitions.get(instance).add(pId);
break;
}
}
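// NOTE: intentional fall-through into the DROPPED case below. An INIT task whose job is neither
// terminal nor resuming is treated like a DROPPED one: it is marked done for this round and
// becomes eligible for reassignment.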
case DROPPED: {
// currState in [INIT, DROPPED]. Do nothing, the partition is eligible to be reassigned.
donePartitions.add(pId);
if (LOG.isDebugEnabled()) {
LOG.debug(String.format(
"Task partition %s has state %s. It will be dropped from the current ideal state.",
pName, currState));
}
// If it's DROPPED, release this task. If INIT, do not release
if (currState == TaskPartitionState.DROPPED) {
assignableInstanceManager.release(instance, taskConfig, quotaType);
}
}
break;
default:
throw new AssertionError("Unknown enum symbol: " + currState);
}
}
// Remove the set of task partitions that are completed or in one of the error states.
pSet.removeAll(donePartitions);
}
}
/**
* Computes the partition name given the resource name and partition id.
*/
protected String pName(String resource, int pId) {
return String.format("%s_%s", resource, pId);
}
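// For example, pName("myWorkflow_myJob", 3) returns "myWorkflow_myJob_3" (hypothetical names);
// this partition-name format is assumed throughout the task pipeline when mapping between
// partition ids and CurrentState partitions.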
/**
* An (instance, state) pair.
*/
protected static class PartitionAssignment {
public final String _instance;
public final String _state;
PartitionAssignment(String instance, String state) {
_instance = instance;
_state = state;
}
}
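// A minimal usage sketch (hypothetical instance name): to record that task partition 3 should
// move to RUNNING on instance "localhost_12918", callers write
// paMap.put(3, new PartitionAssignment("localhost_12918", TaskPartitionState.RUNNING.name()));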
private TaskPartitionState updateJobContextAndGetTaskCurrentState(
CurrentStateOutput currentStateOutput, String jobResource, Integer pId, String pName,
String instance, JobContext jobCtx, TargetState jobTgtState) {
String currentStateString =
currentStateOutput.getCurrentState(jobResource, new Partition(pName), instance);
if (currentStateString == null) {
// Task state is either DROPPED or INIT
TaskPartitionState stateFromContext = jobCtx.getPartitionState(pId);
// If jobTgtState is START: since the CurrentState is null, this function will return INIT to
// start the task, or it will return the stateFromContext (the state recorded in the context),
// and there is no need to update the context.
// If jobTgtState is DELETE: JobDispatcher handles this case and this part of the code will
// not be triggered.
// If jobTgtState is STOP:
// If context is equal to INIT or RUNNING: Here context is set to be STOPPED.
// Other states don't need special handling and context can remain unchanged.
if (jobTgtState == TargetState.STOP && (stateFromContext == TaskPartitionState.RUNNING
|| stateFromContext == TaskPartitionState.INIT)) {
jobCtx.setPartitionState(pId, TaskPartitionState.STOPPED);
return TaskPartitionState.STOPPED;
}
return stateFromContext == null ? TaskPartitionState.INIT : stateFromContext;
}
TaskPartitionState currentState = TaskPartitionState.valueOf(currentStateString);
// Update job context based on current state
updatePartitionInformationInJobContext(currentStateOutput, jobResource, currentState, jobCtx,
pId, pName, instance);
return currentState;
}
/**
* Updates the task information in the job context based on this task's CurrentState and the
* existing context information.
* @param currentStateOutput
* @param jobResource
* @param currentState
* @param jobCtx
* @param pId
* @param pName
* @param instance
*/
private void updatePartitionInformationInJobContext(CurrentStateOutput currentStateOutput,
String jobResource, TaskPartitionState currentState, JobContext jobCtx, Integer pId,
String pName, String instance) {
// The assignedParticipant field needs to be updated regardless of the current state and context
// information because it prevents the controller from assigning the task to the wrong
// participant for targeted tasks when two CurrentStates exist for one task.
// In updatePreviousAssignedTasksStatus, we check
// instance.equals(jobCtx.getAssignedParticipant(pId)) and bypass the assignment if instance is
// not equal to the job context's AssignedParticipant for this pId.
jobCtx.setAssignedParticipant(pId, instance);
// If the job context needs to be updated with a new state, update it accordingly
// This check is necessary because we are relying on the current state, and we do not want to
// keep updating the context for as long as the current state exists. We just want to update
// the context information (especially the finish time) once.
// This condition checks whether the jobContext's state is out of date or not.
if (!currentState.equals(jobCtx.getPartitionState(pId))) {
jobCtx.setPartitionState(pId, currentState);
String taskMsg = currentStateOutput.getInfo(jobResource, new Partition(pName), instance);
if (taskMsg != null) {
jobCtx.setPartitionInfo(pId, taskMsg);
}
if (currentState == TaskPartitionState.COMPLETED) {
markPartitionCompleted(jobCtx, pId);
}
// This avoids a race condition in the case that, although the currentState is in one of the
// following error states, the pending message (INIT->RUNNING) might still be present.
// This is undesirable because it prevents the JobContext from getting the proper update of
// fields including the task state and the task's NUM_ATTEMPTS
if (currentState == TaskPartitionState.ERROR || currentState == TaskPartitionState.TASK_ERROR
|| currentState == TaskPartitionState.TIMED_OUT
|| currentState == TaskPartitionState.TASK_ABORTED) {
// Do not increment the task attempt count here - it will be incremented at scheduling
// time
markPartitionError(jobCtx, pId, currentState);
}
}
}
/**
* Create an assignment based on an already-existing pending message. This effectively lets the
* Controller "wait" until the pending state transition has been processed.
* @param pId
* @param pName
* @param instance
* @param pendingMessage
* @param jobState
* @param currState
* @param paMap
* @param assignedPartitions
*/
private void processTaskWithPendingMessage(Integer pId, String pName, String instance,
Message pendingMessage, TaskState jobState, TaskPartitionState currState,
Map<Integer, PartitionAssignment> paMap, Map<String, Set<Integer>> assignedPartitions) {
if (jobState == TaskState.TIMING_OUT && currState == TaskPartitionState.INIT
&& pendingMessage.getToState().equals(TaskPartitionState.RUNNING.name())) {
// While job is timing out, if the task is pending on INIT->RUNNING, set it back to INIT,
// so that Helix will cancel the transition.
paMap.put(pId, new PartitionAssignment(instance, TaskPartitionState.INIT.name()));
assignedPartitions.get(instance).add(pId);
if (LOG.isDebugEnabled()) {
LOG.debug(String.format(
"Task partition %s has a pending state transition on instance %s INIT->RUNNING. CurrentState is %s. "
+ "Setting it back to INIT so that Helix can cancel the transition (if enabled).",
pName, instance, currState.name()));
}
} else {
// Otherwise, just copy forward the state assignment from the pending message
paMap.put(pId, new PartitionAssignment(instance, pendingMessage.getToState()));
assignedPartitions.get(instance).add(pId);
if (LOG.isDebugEnabled()) {
LOG.debug(String.format(
"Task partition %s has a pending state transition on instance %s. Using the pending message ToState which was %s.",
pName, instance, pendingMessage.getToState()));
}
}
}
protected static void markPartitionCompleted(JobContext ctx, int pId) {
ctx.setPartitionState(pId, TaskPartitionState.COMPLETED);
ctx.setPartitionFinishTime(pId, System.currentTimeMillis());
}
protected static void markPartitionError(JobContext ctx, int pId, TaskPartitionState state) {
ctx.setPartitionState(pId, state);
ctx.setPartitionFinishTime(pId, System.currentTimeMillis());
}
protected static void markAllPartitionsError(JobContext ctx) {
for (int pId : ctx.getPartitionSet()) {
markPartitionError(ctx, pId, TaskPartitionState.ERROR);
}
}
protected static void markPartitionDelayed(JobConfig cfg, JobContext ctx, int p) {
long delayInterval = cfg.getTaskRetryDelay();
if (delayInterval <= 0) {
return;
}
long nextStartTime = ctx.getPartitionFinishTime(p) + delayInterval;
ctx.setNextRetryTime(p, nextStartTime);
}
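// For example (hypothetical values): a task that finished in an error state at t = 100000 ms
// with a TaskRetryDelay of 60000 ms gets nextRetryTime = 160000 ms; partitions whose
// nextRetryTime has not yet arrived are excluded from assignment (see
// TaskUtil.getNonReadyPartitions in handleAdditionalTaskAssignment).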
protected void handleJobTimeout(JobContext jobCtx, WorkflowContext workflowCtx,
String jobResource, JobConfig jobCfg) {
jobCtx.setFinishTime(System.currentTimeMillis());
workflowCtx.setJobState(jobResource, TaskState.TIMED_OUT);
// Mark all INIT tasks as TASK_ABORTED
for (int pId : jobCtx.getPartitionSet()) {
if (jobCtx.getPartitionState(pId) == TaskPartitionState.INIT) {
jobCtx.setPartitionState(pId, TaskPartitionState.TASK_ABORTED);
}
}
_clusterStatusMonitor.updateJobCounters(jobCfg, TaskState.TIMED_OUT);
_rebalanceScheduler.removeScheduledRebalance(jobResource);
TaskUtil.cleanupJobIdealStateExtView(_manager.getHelixDataAccessor(), jobResource);
}
protected void failJob(String jobName, WorkflowContext workflowContext, JobContext jobContext,
WorkflowConfig workflowConfig, Map<String, JobConfig> jobConfigMap,
WorkflowControllerDataProvider dataProvider) {
markJobFailed(jobName, jobContext, workflowConfig, workflowContext, jobConfigMap, dataProvider);
// Mark all INIT tasks as TASK_ABORTED
for (int pId : jobContext.getPartitionSet()) {
if (jobContext.getPartitionState(pId) == TaskPartitionState.INIT) {
jobContext.setPartitionState(pId, TaskPartitionState.TASK_ABORTED);
}
}
_clusterStatusMonitor.updateJobCounters(jobConfigMap.get(jobName), TaskState.FAILED);
_rebalanceScheduler.removeScheduledRebalance(jobName);
TaskUtil.cleanupJobIdealStateExtView(_manager.getHelixDataAccessor(), jobName);
}
// Compute the real assignment from the theoretical calculation, with throttling applied
// This is the actual assigning part
protected void handleAdditionalTaskAssignment(
Map<String, SortedSet<Integer>> currentInstanceToTaskAssignments,
Set<String> excludedInstances, String jobResource, CurrentStateOutput currStateOutput,
JobContext jobCtx, final JobConfig jobCfg, final WorkflowConfig workflowConfig,
WorkflowContext workflowCtx, final WorkflowControllerDataProvider cache,
Map<String, Set<Integer>> assignedPartitions, Map<Integer, PartitionAssignment> paMap,
Set<Integer> skippedPartitions, TaskAssignmentCalculator taskAssignmentCal,
Set<Integer> allPartitions, final long currentTime, Collection<String> liveInstances) {
// See if there was a LiveInstance change, and cache LiveInstances from this iteration of the
// pipeline
boolean existsLiveInstanceOrCurrentStateOrMessageChange =
cache.getExistsLiveInstanceOrCurrentStateOrMessageChange();
// The excludeSet contains the set of task partitions that must be excluded from consideration
// when making any new assignments.
// This includes all completed, failed, delayed, and already assigned partitions.
Set<Integer> excludeSet = Sets.newTreeSet();
// Add all assigned partitions to excludeSet
for (Set<Integer> assignedSet : assignedPartitions.values()) {
excludeSet.addAll(assignedSet);
}
addCompletedTasks(excludeSet, jobCtx, allPartitions);
addPartitionsReachedMaximumRetries(excludeSet, jobCtx, allPartitions, jobCfg);
excludeSet.addAll(skippedPartitions);
Set<Integer> partitionsWithDelay = TaskUtil.getNonReadyPartitions(jobCtx, currentTime);
excludeSet.addAll(partitionsWithDelay);
// The following filters the tasks before passing them to the assigner
// Only feed in tasks that need to be assigned (contexts that are null, STOPPED, TIMED_OUT,
// TASK_ERROR, or DROPPED)
Set<Integer> filteredTaskPartitionNumbers = filterTasks(allPartitions, jobCtx, liveInstances);
// Remove all excludeSet tasks to be safe because some STOPPED tasks have already been
// re-started (excludeSet includes already-assigned partitions). Tasks that have reached their
// retry limit (see addPartitionsReachedMaximumRetries above) will be removed as well
filteredTaskPartitionNumbers.removeAll(excludeSet);
Set<Integer> partitionsToRetryOnLiveInstanceChangeForTargetedJob = new HashSet<>();
// If the job is a targeted job, in case of live instance change, we need to assign
// non-terminal tasks so that they could be re-scheduled
if (!TaskUtil.isGenericTaskJob(jobCfg)
&& existsLiveInstanceOrCurrentStateOrMessageChangeChange) {
// This job is a targeted job, so FixedAssignmentCalculator will be used
// There has been a live instance change. Must re-add incomplete task partitions to be
// re-assigned and re-scheduled
for (int partitionNum : allPartitions) {
TaskPartitionState taskPartitionState = jobCtx.getPartitionState(partitionNum);
if (isTaskNotInTerminalState(taskPartitionState)
&& !partitionsWithDelay.contains(partitionNum)) {
// Some targeted tasks may have timed-out due to Participants (instances) not being
// live, so we give tasks like these another try
// If some of these tasks are already scheduled and running, they will be dropped as
// well
// Also, do not include partitions with delay that are not ready to be assigned and
// scheduled
partitionsToRetryOnLiveInstanceChangeForTargetedJob.add(partitionNum);
}
}
}
filteredTaskPartitionNumbers.addAll(partitionsToRetryOnLiveInstanceChangeForTargetedJob);
// The actual assignment is computed here
// Get instance->[partition, ...] mappings for the target resource.
Map<String, SortedSet<Integer>> tgtPartitionAssignments =
taskAssignmentCal.getTaskAssignment(currStateOutput, liveInstances, jobCfg, jobCtx,
workflowConfig, workflowCtx, filteredTaskPartitionNumbers, cache.getIdealStates());
if (!TaskUtil.isGenericTaskJob(jobCfg) && jobCfg.isRebalanceRunningTask()) {
// TODO: Revisit the logic for isRebalanceRunningTask() and valid use cases for it
// TODO: isRebalanceRunningTask() was originally put in place to allow users to move
// ("rebalance") long-running tasks, but there hasn't been a clear use case for this
// Previously, there was a bug in the condition above (it was || where it should have been &&)
dropRebalancedRunningTasks(tgtPartitionAssignments, currentInstanceToTaskAssignments, paMap,
jobCtx);
}
// If this is a targeted job and if there was a live instance change
if (!TaskUtil.isGenericTaskJob(jobCfg)
&& existsLiveInstanceOrCurrentStateOrMessageChangeChange) {
// Drop current jobs only if they are assigned to a different instance, regardless of
// the jobCfg.isRebalanceRunningTask() setting
dropRebalancedRunningTasks(tgtPartitionAssignments, currentInstanceToTaskAssignments, paMap,
jobCtx);
}
// Go through ALL instances and assign/throttle tasks accordingly
for (Map.Entry<String, SortedSet<Integer>> entry : currentInstanceToTaskAssignments.entrySet()) {
String instance = entry.getKey();
if (!tgtPartitionAssignments.containsKey(instance)) {
// There is no assignment made for this instance, so it is safe to skip
continue;
}
if (excludedInstances.contains(instance)) {
// There is a task assignment made for this instance, but for some reason, we cannot
// assign to this instance. So we must skip the actual scheduling, but we must also
// release the prematurely assigned tasks from AssignableInstance
if (!cache.getAssignableInstanceManager().getAssignableInstanceMap()
.containsKey(instance)) {
continue; // This should not happen; skip!
}
AssignableInstanceManager assignableInstanceManager = cache.getAssignableInstanceManager();
String quotaType = jobCfg.getJobType();
for (int partitionNum : tgtPartitionAssignments.get(instance)) {
// Get the TaskConfig for this partitionNumber
String taskId = getTaskId(jobCfg, jobCtx, partitionNum);
TaskConfig taskConfig = jobCfg.getTaskConfig(taskId);
assignableInstanceManager.release(instance, taskConfig, quotaType);
}
continue;
}
// 1. Throttled by job configuration: remaining slots equal the job's per-instance concurrency
// limit minus the number of task partitions already assigned to this instance
int jobCfgLimitation =
jobCfg.getNumConcurrentTasksPerInstance() - assignedPartitions.get(instance).size();
// 2. throttled by participant capacity
int participantCapacity = cache.getInstanceConfigMap().get(instance).getMaxConcurrentTask();
if (participantCapacity == InstanceConfig.MAX_CONCURRENT_TASK_NOT_SET) {
participantCapacity = cache.getClusterConfig().getMaxConcurrentTaskPerInstance();
}
int participantLimitation =
participantCapacity - cache.getParticipantActiveTaskCount(instance);
// New tasks to be assigned
int numToAssign = Math.min(jobCfgLimitation, participantLimitation);
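// Worked example (hypothetical values): with NumConcurrentTasksPerInstance = 10 and 7 tasks
// already assigned to this instance (jobCfgLimitation = 3), and MaxConcurrentTask = 40 with 38
// active tasks on the participant (participantLimitation = 2), numToAssign = min(3, 2) = 2.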
if (LOG.isDebugEnabled()) {
LOG.debug(String.format(
"Throttle tasks to be assigned to instance %s using limitation: Job Concurrent Task(%d), "
+ "Participant Max Task(%d). Remaining capacity %d.",
instance, jobCfgLimitation, participantCapacity, numToAssign));
}
Set<Integer> throttledSet = new HashSet<>();
if (numToAssign > 0) {
List<Integer> nextPartitions = getNextPartitions(tgtPartitionAssignments.get(instance),
excludeSet, throttledSet, numToAssign);
for (Integer pId : nextPartitions) {
// The following is the actual scheduling of the tasks
String pName = pName(jobResource, pId);
paMap.put(pId, new PartitionAssignment(instance, TaskPartitionState.RUNNING.name()));
excludeSet.add(pId);
jobCtx.setAssignedParticipant(pId, instance);
jobCtx.setPartitionState(pId, TaskPartitionState.INIT);
final long currentTimestamp = System.currentTimeMillis();
jobCtx.setPartitionStartTime(pId, currentTimestamp);
if (jobCtx.getExecutionStartTime() == WorkflowContext.NOT_STARTED) {
// This means this is the very first task scheduled for this job
jobCtx.setExecutionStartTime(currentTimestamp);
reportSubmissionToScheduleDelay(cache, _clusterStatusMonitor, workflowConfig, jobCfg,
currentTimestamp);
}
// Increment the task attempt count at schedule time
jobCtx.incrementNumAttempts(pId);
if (LOG.isDebugEnabled()) {
LOG.debug(String.format("Setting task partition %s state to %s on instance %s.", pName,
TaskPartitionState.RUNNING, instance));
}
}
cache.setParticipantActiveTaskCount(instance,
cache.getParticipantActiveTaskCount(instance) + nextPartitions.size());
} else {
// No assignment was actually scheduled, so these assignments need to be released
// Put all assignments in throttledSet. Be sure to subtract excludeSet because it has already
// been applied when filtering the task partitions above (excludeSet may contain partitions
// that are currently running)
Set<Integer> throttledSetWithExcludeSet =
new HashSet<>(tgtPartitionAssignments.get(instance));
throttledSetWithExcludeSet.removeAll(excludeSet); // Remove excludeSet
throttledSet.addAll(throttledSetWithExcludeSet);
}
if (!throttledSet.isEmpty()) {
// Release the tasks in throttledSet because they weren't actually assigned
if (!cache.getAssignableInstanceManager().getAssignableInstanceMap()
.containsKey(instance)) {
continue;
}
AssignableInstanceManager assignableInstanceManager = cache.getAssignableInstanceManager();
String quotaType = jobCfg.getJobType();
for (int partitionNum : throttledSet) {
// Get the TaskConfig for this partitionNumber
String taskId = getTaskId(jobCfg, jobCtx, partitionNum);
TaskConfig taskConfig = jobCfg.getTaskConfig(taskId);
assignableInstanceManager.release(instance, taskConfig, quotaType);
}
LOG.debug(
throttledSet.size() + " tasks are ready but throttled when assigned to participant.");
}
}
}
protected void scheduleForNextTask(String job, JobContext jobCtx, long now) {
// Figure out the earliest schedulable time in the future of a non-complete job
boolean shouldSchedule = false;
long earliestTime = Long.MAX_VALUE;
for (int p : jobCtx.getPartitionSet()) {
long retryTime = jobCtx.getNextRetryTime(p);
TaskPartitionState state = jobCtx.getPartitionState(p);
state = (state != null) ? state : TaskPartitionState.INIT;
Set<TaskPartitionState> errorStates = Sets.newHashSet(TaskPartitionState.ERROR,
TaskPartitionState.TASK_ERROR, TaskPartitionState.TIMED_OUT);
if (errorStates.contains(state) && retryTime > now && retryTime < earliestTime) {
earliestTime = retryTime;
shouldSchedule = true;
}
}
// If any was found, then schedule it
if (shouldSchedule) {
long scheduledTime = _rebalanceScheduler.getRebalanceTime(job);
if (scheduledTime == -1 || earliestTime < scheduledTime) {
_rebalanceScheduler.scheduleRebalance(_manager, job, earliestTime);
}
}
}
// Add all partitions/tasks that cannot be retried. These tasks are:
// 1- Tasks in ABORTED or ERROR state.
// 2- Tasks that have just gone to TIMED_OUT, TASK_ERROR, or DROPPED state and have reached
// their maxNumberAttempts
// These tasks determine whether the job needs to be failed or not.
protected static void addGivenUpPartitions(Set<Integer> set, JobContext ctx,
Iterable<Integer> pIds, JobConfig cfg) {
for (Integer pId : pIds) {
if (isTaskGivenup(ctx, cfg, pId)) {
set.add(pId);
}
}
}
// Add all partitions that have reached their maxNumberAttempts. These tasks should not be
// considered for scheduling again.
protected static void addPartitionsReachedMaximumRetries(Set<Integer> set, JobContext ctx,
Iterable<Integer> pIds, JobConfig cfg) {
for (Integer pId : pIds) {
if (ctx.getPartitionNumAttempts(pId) >= cfg.getMaxAttemptsPerTask()) {
set.add(pId);
}
}
}
private static List<Integer> getNextPartitions(SortedSet<Integer> candidatePartitions,
Set<Integer> excluded, Set<Integer> throttled, int n) {
List<Integer> result = new ArrayList<>();
for (Integer pId : candidatePartitions) {
if (!excluded.contains(pId)) {
if (result.size() < n) {
result.add(pId);
} else {
throttled.add(pId);
}
}
}
return result;
}
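// For example (hypothetical values): candidates {1, 2, 3, 4} with excluded = {2} and n = 2
// yield result [1, 3] and throttled = {4}; excluded partitions are skipped outright rather
// than counted as throttled.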
private static void addCompletedTasks(Set<Integer> set, JobContext ctx, Iterable<Integer> pIds) {
for (Integer pId : pIds) {
TaskPartitionState state = ctx.getPartitionState(pId);
if (state == TaskPartitionState.COMPLETED) {
set.add(pId);
}
}
}
/**
* Returns a filtered set of tasks. To filter tasks in this context means to only allow tasks
* whose contexts are either null or in STOPPED, TIMED_OUT, TASK_ERROR, or DROPPED state, because
* only tasks whose contexts are in these states are eligible to be assigned or re-tried.
* Also, tasks in non-terminal states whose previously assigned instances are no longer
* LiveInstances are re-added so that they can be re-assigned.
* @param allPartitions
* @param jobContext
* @param liveInstances
* @return a filtered Set of task partition numbers
*/
private Set<Integer> filterTasks(Iterable<Integer> allPartitions, JobContext jobContext,
Collection<String> liveInstances) {
Set<Integer> filteredTasks = new HashSet<>();
for (int partitionNumber : allPartitions) {
TaskPartitionState state = jobContext.getPartitionState(partitionNumber);
// Allow tasks eligible for scheduling
if (state == null || state == TaskPartitionState.STOPPED
|| state == TaskPartitionState.TIMED_OUT || state == TaskPartitionState.TASK_ERROR
|| state == TaskPartitionState.DROPPED) {
filteredTasks.add(partitionNumber);
}
// Allow tasks whose assigned instances are no longer live for rescheduling
if (isTaskNotInTerminalState(state)) {
String assignedParticipant = jobContext.getAssignedParticipant(partitionNumber);
if (assignedParticipant != null && !liveInstances.contains(assignedParticipant)) {
// The assigned instance is no longer live, so mark it as DROPPED in the context
jobContext.setPartitionState(partitionNumber, TaskPartitionState.DROPPED);
filteredTasks.add(partitionNumber);
}
}
}
return filteredTasks;
}
/**
* Returns whether the task is not in a terminal state and could be re-scheduled.
* @param state
* @return
*/
private boolean isTaskNotInTerminalState(TaskPartitionState state) {
return state != TaskPartitionState.COMPLETED && state != TaskPartitionState.TASK_ABORTED
&& state != TaskPartitionState.DROPPED && state != TaskPartitionState.ERROR;
}
protected static boolean isTaskGivenup(JobContext ctx, JobConfig cfg, int pId) {
TaskPartitionState state = ctx.getPartitionState(pId);
if (state == TaskPartitionState.TASK_ABORTED || state == TaskPartitionState.ERROR) {
return true;
}
if (state == TaskPartitionState.TIMED_OUT || state == TaskPartitionState.TASK_ERROR
|| state == TaskPartitionState.DROPPED) {
return ctx.getPartitionNumAttempts(pId) >= cfg.getMaxAttemptsPerTask();
}
return false;
}
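// For example (hypothetical values): a TASK_ERROR task whose attempt count has reached
// MaxAttemptsPerTask is given up, while a TASK_ABORTED task is given up regardless of how many
// attempts it has made.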
/**
* If the new assignment differs from the previous assignment, drop the old running task if it is
* no longer assigned to the same instance, but do not remove it from excludeSet because the same
* task should not be assigned to the new instance right away.
* Also, only drop the task if the old and the new assignments both contain the partition (task)
* and they differ (because that means the task has been assigned to a different instance).
*/
private void dropRebalancedRunningTasks(Map<String, SortedSet<Integer>> newAssignment,
Map<String, SortedSet<Integer>> oldAssignment, Map<Integer, PartitionAssignment> paMap,
JobContext jobContext) {
for (String instance : oldAssignment.keySet()) {
for (int pId : oldAssignment.get(instance)) {
if (jobContext.getPartitionState(pId) == TaskPartitionState.RUNNING) {
// Check if the new assignment has this task on a different instance
boolean existsInNewAssignment = false;
for (Map.Entry<String, SortedSet<Integer>> entry : newAssignment.entrySet()) {
if (!entry.getKey().equals(instance) && entry.getValue().contains(pId)) {
// Found the partition number; new assignment has been made
existsInNewAssignment = true;
LOG.info(
"Currently running task partition number: {} is being dropped from instance: {} and will be newly assigned to instance: {}. This is due to a LiveInstance/CurrentState change, and because this is a targeted task.",
pId, instance, entry.getKey());
break;
}
}
if (existsInNewAssignment
&& instance.equals(jobContext.getAssignedParticipant(pId))
) {
// We need to drop this task in the old assignment
paMap.put(pId, new PartitionAssignment(instance, TaskPartitionState.DROPPED.name()));
jobContext.setPartitionState(pId, TaskPartitionState.DROPPED);
// Now it will be dropped and be rescheduled
}
}
}
}
}
protected void markJobComplete(final String jobName, final JobContext jobContext,
final WorkflowConfig workflowConfig, WorkflowContext workflowContext,
final Map<String, JobConfig> jobConfigMap,
final WorkflowControllerDataProvider dataProvider) {
finishJobInRuntimeJobDag(dataProvider.getTaskDataCache(), workflowConfig.getWorkflowId(),
jobName);
final long currentTime = System.currentTimeMillis();
workflowContext.setJobState(jobName, TaskState.COMPLETED);
jobContext.setFinishTime(currentTime);
if (isWorkflowFinished(workflowContext, workflowConfig, jobConfigMap, dataProvider)) {
workflowContext.setFinishTime(currentTime);
updateWorkflowMonitor(workflowContext, workflowConfig);
}
scheduleJobCleanUp(jobConfigMap.get(jobName), workflowConfig, currentTime);
// Job has completed successfully so report ControllerInducedDelay
JobConfig jobConfig = jobConfigMap.get(jobName);
if (jobConfig != null) {
reportControllerInducedDelay(dataProvider, _clusterStatusMonitor, workflowConfig, jobConfig,
currentTime);
}
}
protected void markJobFailed(String jobName, JobContext jobContext, WorkflowConfig workflowConfig,
WorkflowContext workflowContext, Map<String, JobConfig> jobConfigMap,
WorkflowControllerDataProvider clusterDataCache) {
finishJobInRuntimeJobDag(clusterDataCache.getTaskDataCache(), workflowConfig.getWorkflowId(),
jobName);
long currentTime = System.currentTimeMillis();
workflowContext.setJobState(jobName, TaskState.FAILED);
if (jobContext != null) {
jobContext.setFinishTime(currentTime);
}
if (isWorkflowFinished(workflowContext, workflowConfig, jobConfigMap, clusterDataCache)) {
workflowContext.setFinishTime(currentTime);
updateWorkflowMonitor(workflowContext, workflowConfig);
}
scheduleJobCleanUp(jobConfigMap.get(jobName), workflowConfig, currentTime);
}
protected void scheduleJobCleanUp(JobConfig jobConfig, WorkflowConfig workflowConfig,
long currentTime) {
long currentScheduledTime =
_rebalanceScheduler.getRebalanceTime(workflowConfig.getWorkflowId()) == -1 ? Long.MAX_VALUE
: _rebalanceScheduler.getRebalanceTime(workflowConfig.getWorkflowId());
if (currentTime + jobConfig.getExpiry() < currentScheduledTime) {
_rebalanceScheduler.scheduleRebalance(_manager, workflowConfig.getWorkflowId(),
currentTime + jobConfig.getExpiry());
}
}
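// For example (hypothetical values): if a job finishes at currentTime = 1000 ms and its expiry
// is 5000 ms, and no earlier rebalance is pending for the workflow, a cleanup rebalance is
// scheduled at t = 6000 ms so that the expired job can be purged.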
// Workflow related methods
/**
* Checks if the workflow has finished (either completed or failed).
* Sets the state in the workflow context accordingly.
* @param ctx Workflow context containing job states
* @param cfg Workflow config containing the set of jobs
* @return returns true if the workflow
* 1. completed (all jobs are {@link TaskState#COMPLETED})
* 2. failed (any job is {@link TaskState#FAILED})
* 3. workflow is {@link TaskState#TIMED_OUT}
* returns false otherwise.
*/
protected boolean isWorkflowFinished(WorkflowContext ctx, WorkflowConfig cfg,
Map<String, JobConfig> jobConfigMap, WorkflowControllerDataProvider clusterDataCache) {
boolean incomplete = false;
TaskState workflowState = ctx.getWorkflowState();
if (TaskState.TIMED_OUT.equals(workflowState)) {
// We don't update job state here as JobRebalancer will do it
return true;
}
// Check if failed job count is beyond threshold and if so, fail the workflow
// and abort in-progress jobs
int failedJobs = 0;
for (String job : cfg.getJobDag().getAllNodes()) {
TaskState jobState = ctx.getJobState(job);
if (jobState == TaskState.FAILED || jobState == TaskState.TIMED_OUT) {
failedJobs++;
if (!cfg.isJobQueue() && failedJobs > cfg.getFailureThreshold()) {
ctx.setWorkflowState(TaskState.FAILED);
LOG.info("Workflow {} reached the failure threshold, so setting its state to FAILED.",
cfg.getWorkflowId());
for (String jobToFail : cfg.getJobDag().getAllNodes()) {
if (ctx.getJobState(jobToFail) == TaskState.IN_PROGRESS) {
ctx.setJobState(jobToFail, TaskState.ABORTED);
// Skip reporting latency for aborted jobs since it does not accurately reflect job running time
if (_clusterStatusMonitor != null) {
_clusterStatusMonitor.updateJobCounters(jobConfigMap.get(jobToFail),
TaskState.ABORTED);
}
// Since the job is aborted, release resources occupied by it
// Otherwise, we run the risk of resource leak
if (clusterDataCache != null) {
AssignableInstanceManager assignableInstanceManager =
clusterDataCache.getAssignableInstanceManager();
JobConfig jobConfig = jobConfigMap.get(jobToFail);
String quotaType = jobConfig.getJobType();
Map<String, TaskConfig> taskConfigMap = jobConfig.getTaskConfigMap();
// Iterate over all tasks and release them
for (Map.Entry<String, TaskConfig> taskEntry : taskConfigMap.entrySet()) {
TaskConfig taskConfig = taskEntry.getValue();
for (String assignableInstanceName : assignableInstanceManager
.getAssignableInstanceNames()) {
assignableInstanceManager.release(assignableInstanceName, taskConfig,
quotaType);
}
}
}
}
}
return true;
}
}
if (jobState != TaskState.COMPLETED && jobState != TaskState.FAILED
&& jobState != TaskState.TIMED_OUT) {
incomplete = true;
}
}
if (!incomplete && cfg.isTerminable()) {
ctx.setWorkflowState(TaskState.COMPLETED);
return true;
}
return false;
}
protected void updateWorkflowMonitor(WorkflowContext context, WorkflowConfig config) {
if (_clusterStatusMonitor != null) {
_clusterStatusMonitor.updateWorkflowCounters(config, context.getWorkflowState(),
context.getFinishTime() - context.getStartTime());
}
}
// Common methods
protected Set<String> getExcludedInstances(String currentJobName, WorkflowConfig workflowCfg,
WorkflowContext workflowContext, WorkflowControllerDataProvider cache) {
Set<String> ret = new HashSet<>();
if (!workflowCfg.isAllowOverlapJobAssignment()) {
// Exclude all instances that have been assigned other jobs' tasks
for (String jobName : workflowCfg.getJobDag().getAllNodes()) {
if (jobName.equals(currentJobName)) {
continue;
}
JobContext jobContext = cache.getJobContext(jobName);
if (jobContext == null) {
continue;
}
// Also skip if the job is not currently running
// For example, if the job here is in a terminal state (such as ABORTED), then its tasks are
// practically not running, so we do not need to exclude instances that have tasks from dead
// jobs
TaskState jobState = workflowContext.getJobState(jobName);
if (jobState != TaskState.IN_PROGRESS) {
continue;
}
for (int pId : jobContext.getPartitionSet()) {
TaskPartitionState partitionState = jobContext.getPartitionState(pId);
if (partitionState == TaskPartitionState.INIT
|| partitionState == TaskPartitionState.RUNNING) {
ret.add(jobContext.getAssignedParticipant(pId));
}
}
}
}
return ret;
}
/**
* Schedule the rebalancer timer for task framework elements
* @param resourceId The resource id
* @param startTime The resource start time
* @param timeoutPeriod The resource timeout period. Will be -1 if it is not set.
*/
protected void scheduleRebalanceForTimeout(String resourceId, long startTime,
long timeoutPeriod) {
long nextTimeout = getTimeoutTime(startTime, timeoutPeriod);
long nextRebalanceTime = _rebalanceScheduler.getRebalanceTime(resourceId);
if (nextTimeout >= System.currentTimeMillis()
&& (nextRebalanceTime == TaskConstants.DEFAULT_NEVER_TIMEOUT
|| nextTimeout < nextRebalanceTime)) {
_rebalanceScheduler.scheduleRebalance(_manager, resourceId, nextTimeout);
}
}
/**
* Basic function to check whether task framework resources (workflow and job) have timed out
* @param startTime Resource's start time
* @param timeoutPeriod Resource's timeout period. Will be -1 if it is not set.
* @return
*/
protected boolean isTimeout(long startTime, long timeoutPeriod) {
long nextTimeout = getTimeoutTime(startTime, timeoutPeriod);
return nextTimeout != TaskConstants.DEFAULT_NEVER_TIMEOUT
&& nextTimeout <= System.currentTimeMillis();
}
private long getTimeoutTime(long startTime, long timeoutPeriod) {
return (timeoutPeriod == TaskConstants.DEFAULT_NEVER_TIMEOUT
|| timeoutPeriod > Long.MAX_VALUE - startTime)
// check long overflow
? TaskConstants.DEFAULT_NEVER_TIMEOUT
: startTime + timeoutPeriod;
}
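// Overflow guard example (hypothetical values): with startTime near Long.MAX_VALUE, naively
// computing startTime + timeoutPeriod would wrap around to a negative value; returning
// DEFAULT_NEVER_TIMEOUT in that case means the resource never times out.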
/**
* Set the ClusterStatusMonitor for metrics update
*/
public void setClusterStatusMonitor(ClusterStatusMonitor clusterStatusMonitor) {
_clusterStatusMonitor = clusterStatusMonitor;
}
/**
* Returns an appropriate TaskId depending on whether the job is targeted or not.
* @param jobCfg
* @param jobCtx
* @param partitionNum
* @return
*/
private String getTaskId(JobConfig jobCfg, JobContext jobCtx, int partitionNum) {
if (TaskUtil.isGenericTaskJob(jobCfg)) {
return jobCtx.getTaskIdForPartition(partitionNum);
}
// This is a targeted task
return pName(jobCfg.getJobId(), partitionNum);
}
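// For example (hypothetical ids): a generic job looks the task id up from the JobContext, while
// a targeted job with jobId "myJob" and partition 4 derives the id "myJob_4" via pName().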
/**
* Checks if the workflow has been stopped.
* In the case of a recurrent workflow template, we look at its TargetState.
* @param ctx Workflow context containing job states
* @param cfg Workflow config containing the set of jobs
* @return returns true if no job is {@link TaskState#IN_PROGRESS} or
* {@link TaskState#STOPPING} (i.e., the workflow is effectively stopped), false otherwise.
*/
protected boolean isWorkflowStopped(WorkflowContext ctx, WorkflowConfig cfg) {
if (cfg.isRecurring()) {
return cfg.getTargetState() == TargetState.STOP;
}
for (String job : cfg.getJobDag().getAllNodes()) {
TaskState jobState = ctx.getJobState(job);
if (jobState != null
&& (jobState.equals(TaskState.IN_PROGRESS) || jobState.equals(TaskState.STOPPING))) {
return false;
}
}
return true;
}
protected ResourceAssignment buildEmptyAssignment(String name,
CurrentStateOutput currStateOutput) {
ResourceAssignment assignment = new ResourceAssignment(name);
Set<Partition> partitions = currStateOutput.getCurrentStateMappedPartitions(name);
for (Partition partition : partitions) {
Map<String, String> currentStateMap = currStateOutput.getCurrentStateMap(name, partition);
Map<String, String> replicaMap = Maps.newHashMap();
for (String instanceName : currentStateMap.keySet()) {
replicaMap.put(instanceName, HelixDefinedState.DROPPED.toString());
}
assignment.addReplicaMap(partition, replicaMap);
}
return assignment;
}
/**
* Check all the dependencies of a job to determine whether the job is ready to be scheduled.
* @param job
* @param workflowCfg
* @param workflowCtx
* @return
*/
protected boolean isJobReadyToSchedule(String job, WorkflowConfig workflowCfg,
WorkflowContext workflowCtx, int incompleteAllCount, Map<String, JobConfig> jobConfigMap,
WorkflowControllerDataProvider clusterDataCache,
AssignableInstanceManager assignableInstanceManager) {
int notStartedCount = 0;
int failedOrTimeoutCount = 0;
int incompleteParentCount = 0;
JobConfig jobConfig = jobConfigMap.get(job);
if (jobConfig == null) {
LOG.error(String.format("The job config is missing for job %s", job));
return false;
}
String quotaType = TaskAssignmentCalculator.getQuotaType(workflowCfg, jobConfig);
if (quotaType == null || !assignableInstanceManager.hasQuotaType(quotaType)) {
quotaType = AssignableInstance.DEFAULT_QUOTA_TYPE;
}
if (!assignableInstanceManager.hasGlobalCapacity(quotaType)) {
LOG.info(String.format(
"Job %s not ready to schedule due to not having enough quota for quota type %s", job,
quotaType));
return false;
}
for (String parent : workflowCfg.getJobDag().getDirectParents(job)) {
TaskState jobState = workflowCtx.getJobState(parent);
if (jobState == null || jobState == TaskState.NOT_STARTED) {
++notStartedCount;
} else if (jobState == TaskState.FAILED || jobState == TaskState.TIMED_OUT) {
++failedOrTimeoutCount;
} else if (jobState != TaskState.COMPLETED) {
incompleteParentCount++;
}
}
// If there is any parent job not started, this job should not be scheduled
if (notStartedCount > 0) {
if (LOG.isDebugEnabled()) {
LOG.debug(String.format("Job %s is not ready to start, notStartedParent(s)=%d.", job,
notStartedCount));
}
return false;
}
// If a parent job has failed, schedule this job only when ignoring dependent
// job failures is enabled
if (failedOrTimeoutCount > 0 && !jobConfig.isIgnoreDependentJobFailure()) {
markJobFailed(job, null, workflowCfg, workflowCtx, jobConfigMap, clusterDataCache);
if (LOG.isDebugEnabled()) {
LOG.debug(String.format("Job %s is not ready to start, failedCount(s)=%d.", job,
failedOrTimeoutCount));
}
return false;
}
if (workflowCfg.isJobQueue()) {
// If the job comes from a JobQueue, it should apply the parallel job logic
if (incompleteAllCount >= workflowCfg.getParallelJobs()) {
if (LOG.isDebugEnabled()) {
LOG.debug(String.format("Job %s is not ready to schedule, inCompleteJobs(s)=%d.", job,
incompleteAllCount));
}
return false;
}
} else {
// If this job comes from a generic workflow, the job will not be scheduled until
// all of its direct parent jobs have finished
if (incompleteParentCount > 0) {
if (LOG.isDebugEnabled()) {
LOG.debug(String.format("Job %s is not ready to start, notFinishedParent(s)=%d.", job,
incompleteParentCount));
}
return false;
}
}
return true;
}
/**
* Check if a workflow is ready to schedule.
* @param workflowCfg the workflow to check
* @return true if the workflow is ready for schedule, false if not ready
*/
protected boolean isWorkflowReadyForSchedule(WorkflowConfig workflowCfg) {
Date startTime = workflowCfg.getStartTime();
// A workflow with a non-scheduled config or a start time in the past is ready to be scheduled.
return (startTime == null || startTime.getTime() <= System.currentTimeMillis());
}
public void updateBestPossibleStateOutput(String resource,
ResourceAssignment partitionStateAssignment, BestPossibleStateOutput output) {
// Use the internal MappingCalculator interface to compute the final assignment
// The next release will support rebalancers that compute the mapping from start to finish
for (Partition partition : partitionStateAssignment.getMappedPartitions()) {
Map<String, String> newStateMap = partitionStateAssignment.getReplicaMap(partition);
output.setState(resource, partition, newStateMap);
}
}
protected void finishJobInRuntimeJobDag(TaskDataCache clusterDataCache, String workflowName,
String jobName) {
RuntimeJobDag runtimeJobDag = clusterDataCache.getRuntimeJobDag(workflowName);
if (runtimeJobDag != null) {
runtimeJobDag.finishJob(jobName);
LOG.debug(
String.format("Finish job %s of workflow %s for runtime job DAG", jobName, workflowName));
} else {
LOG.warn(String.format("Failed to find runtime job DAG for workflow %s and job %s",
workflowName, jobName));
}
}
/**
* TODO: Move this logic to Task Framework metrics class for refactoring.
* Computes and passes on submissionToProcessDelay to the dynamic metric.
* @param dataProvider
* @param clusterStatusMonitor
* @param workflowConfig
* @param jobConfig
* @param currentTimestamp
*/
protected static void reportSubmissionToProcessDelay(BaseControllerDataProvider dataProvider,
final ClusterStatusMonitor clusterStatusMonitor, final WorkflowConfig workflowConfig,
final JobConfig jobConfig, final long currentTimestamp) {
AbstractBaseStage.asyncExecute(dataProvider.getAsyncTasksThreadPool(), () -> {
// Asynchronously update the appropriate JobMonitor
JobMonitor jobMonitor = clusterStatusMonitor
.getJobMonitor(TaskAssignmentCalculator.getQuotaType(workflowConfig, jobConfig));
if (jobMonitor == null) {
return null;
}
// Compute SubmissionToProcessDelay
long submissionToProcessDelay = currentTimestamp - jobConfig.getStat().getCreationTime();
jobMonitor.updateSubmissionToProcessDelayGauge(submissionToProcessDelay);
return null;
});
}
/**
* TODO: Move this logic to Task Framework metrics class for refactoring.
* Computes and passes on submissionToScheduleDelay to the dynamic metric.
* @param dataProvider
* @param clusterStatusMonitor
* @param workflowConfig
* @param jobConfig
* @param currentTimestamp
*/
private static void reportSubmissionToScheduleDelay(BaseControllerDataProvider dataProvider,
final ClusterStatusMonitor clusterStatusMonitor, final WorkflowConfig workflowConfig,
final JobConfig jobConfig, final long currentTimestamp) {
AbstractBaseStage.asyncExecute(dataProvider.getAsyncTasksThreadPool(), () -> {
// Asynchronously update the appropriate JobMonitor
JobMonitor jobMonitor = clusterStatusMonitor
.getJobMonitor(TaskAssignmentCalculator.getQuotaType(workflowConfig, jobConfig));
if (jobMonitor == null) {
return null;
}
// Compute SubmissionToScheduleDelay
long submissionToStartDelay = currentTimestamp - jobConfig.getStat().getCreationTime();
jobMonitor.updateSubmissionToScheduleDelayGauge(submissionToStartDelay);
return null;
});
}
/**
* TODO: Move this logic to Task Framework metrics class for refactoring.
* Computes and passes on controllerInducedDelay to the dynamic metric.
* @param dataProvider
* @param clusterStatusMonitor
* @param workflowConfig
* @param jobConfig
* @param currentTimestamp
*/
private static void reportControllerInducedDelay(BaseControllerDataProvider dataProvider,
final ClusterStatusMonitor clusterStatusMonitor, final WorkflowConfig workflowConfig,
final JobConfig jobConfig, final long currentTimestamp) {
AbstractBaseStage.asyncExecute(dataProvider.getAsyncTasksThreadPool(), () -> {
// Asynchronously update the appropriate JobMonitor
JobMonitor jobMonitor = clusterStatusMonitor
.getJobMonitor(TaskAssignmentCalculator.getQuotaType(workflowConfig, jobConfig));
if (jobMonitor == null) {
return null;
}
// Compute ControllerInducedDelay only if the workload is a test load
// NOTE: this metric cannot be computed for general user-submitted workloads because
// the actual runtime of the tasks vary, and there could exist multiple tasks per
// job
// NOTE: a test workload will have the "latency" field in the mapField of the
// JobConfig (taskConfig)
String firstTask = jobConfig.getTaskConfigMap().keySet().iterator().next();
if (jobConfig.getTaskConfig(firstTask).getConfigMap().containsKey(TASK_LATENCY_TAG)) {
long taskDuration =
Long.valueOf(jobConfig.getTaskConfig(firstTask).getConfigMap().get(TASK_LATENCY_TAG));
long controllerInducedDelay =
currentTimestamp - jobConfig.getStat().getCreationTime() - taskDuration;
jobMonitor.updateControllerInducedDelayGauge(controllerInducedDelay);
}
return null;
});
}
}