/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.yarn.appMaster;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.drill.yarn.appMaster.TaskLifecycleListener.Event;
import org.apache.drill.yarn.core.DoYUtil;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.proto.YarnServiceProtos.SchedulerResourceTypes;
/**
* Controls the Drill cluster by comparing the current cluster state against a
* desired state and taking corrective action to keep the cluster in the
* desired state. The cluster as a whole has a state, as does each task (node)
* within the cluster.
* <p>
* This class is designed to be unit testable. Testing the controller on a
* live cluster is tedious, so this class encapsulates the controller
* algorithm so that it can be driven by a simulated cluster.
* <p>
* This object is shared between threads, thus synchronized.
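* <p>
* A rough sketch of the call sequence a driver might use (illustrative only;
* {@code yarn} and {@code drillbitScheduler} are placeholders, exception
* handling is omitted, and the actual wiring lives elsewhere in the AM):
* <pre>{@code
* ClusterController controller = new ClusterControllerImpl(yarn);
* controller.registerScheduler(drillbitScheduler); // before YARN callbacks start
* controller.started();                            // move to LIVE
* controller.tick(System.currentTimeMillis());     // called on each timer pulse
* controller.shutDown();                           // request a graceful stop
* boolean ok = controller.waitForCompletion();     // block until ENDED or FAILED
* }</pre>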
*/
public class ClusterControllerImpl implements ClusterController {
/**
* Controller lifecycle state.
*/
public enum State {
/**
* Cluster is starting. Things are in a partially-built state. No tasks are
* started until the cluster moves to LIVE.
*/
START,
/**
* Normal operating state: the controller seeks to maintain the desired
* number of tasks.
*/
LIVE,
/**
* Controller is shutting down. Tasks are gracefully (where possible) ended;
* no new tasks are started. (That is, when we detect the exit of a task,
* the controller no longer immediately tries to start a replacement.
*/
ENDING,
/**
* The controller has shut down. All tasks and threads are stopped. The
* controller allows the main thread (which has been patiently waiting) to
* continue, allowing the AM itself to shut down. Thus, this is a very
* short-lived state.
*/
ENDED,
/**
* Something bad happened on start-up; the AM can't start and must shut
* down.
*/
FAILED
}
private final static int PRIORITY_OFFSET = 1;
private static final Log LOG = LogFactory.getLog(ClusterControllerImpl.class);
/**
* Signals the completion of the cluster run. The main program waits on this
* mutex until all tasks complete (batch) or the cluster is explicitly shut
* down (persistent tasks.)
*/
private Object completionMutex = new Object();
/**
* Maximum number of retries for each task launch.
*/
protected int maxRetries = 3;
/**
* Controller state.
*
* @see State
*/
State state = State.START;
/**
* Definition of the task types that can be run by this controller, along with
* the target task levels for each.
*/
private Map<String, SchedulerStateActions> taskPools = new HashMap<>();
/**
* List of task pools prioritized in the order in which tasks should start.
* DoY supports only one task pool at present. The idea is to later support
* multiple pools that represent, say, pool 1 as the minimum number of
* Drillbits to run at all times, with pool 2 as extra Drillbits to start up
* during peak demand.
* <p>
* The priority also gives rise to YARN request priorities which are the only
* tool the AM has to associate container grants with the requests to which
* they correspond.
*/
private List<SchedulerStateActions> prioritizedGroups = new ArrayList<>();
/**
* Cluster-wide set of container IDs granted by YARN; used to detect and
* skip duplicate container grants.
*/
private Set<ContainerId> allocatedContainers = new HashSet<>();
/**
* Cluster-wide map of active tasks. Allows lookup from container ID to task
* (and then from task to task type.)
*/
private Map<ContainerId, Task> activeContainers = new HashMap<>();
/**
* Tracks the tasks that have completed: either successfully (state == ENDED)
* or failed (state == FAILED). Eventually store this information elsewhere to
* avoid cluttering memory with historical data. Entries here are static
* copies, preserving the state at the time that the task completed.
*/
private List<Task> completedTasks = new LinkedList<>();
/**
* Wrapper around the YARN API. Abstracts the details of YARN operations.
*/
private final AMYarnFacade yarn;
/**
* Maximum number of new tasks to start on each "pulse" tick.
*/
private int maxRequestsPerTick = 2;
private int stopTimeoutMs = 10_000;
/**
* Time (in ms) between requests to YARN to get an updated list of the node
* "inventory".
*/
private int configPollPeriod = 60_000;
private long nextResourcePollTime;
/**
* List of nodes available in the cluster. Necessary as part of the process of
* ensuring that we run one Drillbit per node. (The YARN blacklist only half
* works for this purpose.)
*/
private NodeInventory nodeInventory;
private long lastFailureCheckTime;
private int failureCheckPeriodMs = 60_000;
private int taskCheckPeriodMs = 10_000;
private long lastTaskCheckTime;
/**
* To increase code modularity, add-ons (such as the ZK monitor) register as
* lifecycle listeners that are alerted to "interesting" lifecycle events.
*/
private List<TaskLifecycleListener> lifecycleListeners = new ArrayList<>();
/**
* Handy mechanism for setting properties on this controller that are
* available to plugins and UI without cluttering this class with member
* variables.
*/
private Map<String, Object> properties = new HashMap<>();
/**
* When enabled, allows the controller to check for failures that result in no
* Drillbits running. The controller will then automatically exit as no useful
* work can be done. Disable this to make debugging easier on a single-node
* cluster (lets you, say, start a "stray" Drillbit and see what happens
* without the AM exiting.)
*/
private boolean enableFailureCheck = true;
public ClusterControllerImpl(AMYarnFacade yarn) {
this.yarn = yarn;
}
@Override
public void enableFailureCheck(boolean flag) {
this.enableFailureCheck = flag;
}
/**
* Define a task type. Registration order is important: the controller starts
* tasks in the order that they are registered. Must happen before the YARN
* callbacks start.
*
* @param scheduler the scheduler that defines the new task type and its
* target number of tasks
*/
@Override
public void registerScheduler(Scheduler scheduler) {
assert !taskPools.containsKey(scheduler.getName());
scheduler.setPriority(taskPools.size() + PRIORITY_OFFSET);
SchedulerStateActions taskGroup = new SchedulerStateImpl(this, scheduler);
taskPools.put(taskGroup.getName(), taskGroup);
prioritizedGroups.add(taskGroup);
}
/**
* Called when the caller has completed start-up and the controller should
* become live.
*/
@Override
public synchronized void started() throws YarnFacadeException, AMException {
nodeInventory = new NodeInventory(yarn);
// Verify that no resource seeks a container larger than
// what YARN can provide. Ensures a graceful exit in this
// case.
Resource maxResource = yarn.getRegistrationResponse()
.getMaximumResourceCapability();
for (SchedulerStateActions group : prioritizedGroups) {
group.getScheduler().limitContainerSize(maxResource);
}
state = State.LIVE;
}
@Override
public synchronized void tick(long curTime) {
if (state == State.LIVE) {
adjustTasks(curTime);
requestContainers();
}
if (state == State.LIVE || state == State.ENDING) {
checkTasks(curTime);
}
}
/**
* Adjust the number of running tasks to match the desired level.
*
* @param curTime
*/
private void adjustTasks(long curTime) {
if (enableFailureCheck && getFreeNodeCount() == 0) {
checkForFailure(curTime);
}
if (state != State.LIVE) {
return;
}
for (SchedulerStateActions group : prioritizedGroups) {
group.adjustTasks();
}
}
/**
* Get the approximate number of free YARN nodes (those that can
* accept a task request.) Starts with the number of nodes from
* the node inventory, then subtracts any in-flight requests (which,
* by definition, do not yet have a node allocated.)
* <p>
* This approximation <b>does not</b> consider whether the node
* has sufficient resources to run a task; only whether the node
* itself exists.
* @return The approximate number of free YARN nodes.
*/
@Override
public int getFreeNodeCount() {
int count = nodeInventory.getFreeNodeCount();
for (SchedulerStateActions group : prioritizedGroups) {
count -= group.getRequestCount();
}
return Math.max(0, count);
}
/**
* Check if the controller is unable to run any tasks. If so, and the option
* is enabled, then automatically exit since no useful work can be done.
*
* @param curTime
*/
private void checkForFailure(long curTime) {
if (lastFailureCheckTime + failureCheckPeriodMs > curTime) {
return;
}
lastFailureCheckTime = curTime;
for (SchedulerStateActions group : prioritizedGroups) {
if (group.getTaskCount() > 0) {
return;
}
}
LOG.error(
"Application failure: no tasks are running and no nodes are available -- exiting.");
terminate(State.FAILED);
}
/**
* Periodically check tasks, handling any timeout issues.
*
* @param curTime
*/
private void checkTasks(long curTime) {
// Check periodically, not on every tick.
if (lastTaskCheckTime + taskCheckPeriodMs > curTime) {
return;
}
lastTaskCheckTime = curTime;
// Check for task timeouts in states that have a timeout.
EventContext context = new EventContext(this);
for (SchedulerStateActions group : prioritizedGroups) {
context.setGroup(group);
group.checkTasks(context, curTime);
}
}
/**
* Get an update from YARN on available resources.
*/
@Override
public void updateRMStatus() {
long curTime = System.currentTimeMillis();
if (nextResourcePollTime > curTime) {
return;
}
// yarnNodeCount = yarn.getNodeCount();
// LOG.info("YARN reports " + yarnNodeCount + " nodes.");
// Resource yarnResources = yarn.getResources();
// if (yarnResources != null) {
// LOG.info("YARN reports " + yarnResources.getMemory() + " MB, " +
// yarnResources.getVirtualCores()
// + " vcores available.");
// }
nextResourcePollTime = curTime + configPollPeriod;
}
/**
* Request any containers that have accumulated.
*/
private void requestContainers() {
EventContext context = new EventContext(this);
for (SchedulerStateActions group : prioritizedGroups) {
context.setGroup(group);
if (group.requestContainers(context, maxRequestsPerTick)) {
break;
}
}
}
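/**
* Handles containers newly granted by YARN. Grants already seen are skipped,
* grants on hosts that already run a task are rejected and released, and each
* remaining container is routed to the task pool identified by the request
* priority carried on the container.
*
* @param containers the containers just granted by YARN
*/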
@Override
public synchronized void containersAllocated(List<Container> containers) {
EventContext context = new EventContext(this);
for (Container container : containers) {
if (allocatedContainers.contains(container.getId())) {
continue;
}
// We should never get a container on a node in the blacklist we
// sent to YARN. If we do, something is wrong. Log the error and
// reject the container. Else, bad things happen further along as
// the tracking mechanisms assume one task per node.
String host = container.getNodeId().getHost();
if (nodeInventory.isInUse(host)) {
LOG.error( "Host is in use, but YARN allocated a container: " +
DoYUtil.labelContainer(container) + " - container rejected." );
yarn.releaseContainer(container);
continue;
}
// The container is fine.
allocatedContainers.add(container.getId());
int priority = container.getPriority().getPriority();
int offset = priority - PRIORITY_OFFSET;
if (offset < 0 || offset >= prioritizedGroups.size()) {
LOG.error("Container allocated with unknown priority " + DoYUtil.labelContainer(container));
continue;
}
context.setGroup(prioritizedGroups.get(offset));
context.group.containerAllocated(context, container);
}
}
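/**
* Handles notification that a container has started: routes the event into
* the owning task's state machine. Notifications for unknown containers
* (such as those that were rejected) are ignored.
*/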
@Override
public synchronized void containerStarted(ContainerId containerId) {
Task task = getTask(containerId);
if (task == null) {
return;
}
EventContext context = new EventContext(this, task);
context.getState().containerStarted(context);
LOG.trace("Container started: " + containerId);
}
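/**
* Handles a failure to launch a task in its container: routes the failure
* into the owning task's state machine so that the task can be cleaned up
* and, if appropriate, retried.
*/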
@Override
public synchronized void taskStartFailed(ContainerId containerId,
Throwable t) {
Task task = getTask(containerId);
if (task == null) {
return;
}
EventContext context = new EventContext(this, task);
context.getState().launchFailed(context, t);
}
private Task getTask(ContainerId containerId) {
return activeContainers.get(containerId);
}
@Override
public synchronized void containerStopped(ContainerId containerId) {
// Ignored because the node manager notification is very
// unreliable. Better to rely on the Resource Manager
// completion request.
// Task task = getTask(containerId);
// if (task == null) {
// return; }
// EventContext context = new EventContext(this, task);
// context.getState().containerStopped(context);
}
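/**
* Handles the Resource Manager's notification that containers have completed:
* routes each completion into the owning task's state machine, then checks
* whether the cluster run as a whole is now complete.
*/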
@Override
public synchronized void containersCompleted(List<ContainerStatus> statuses) {
EventContext context = new EventContext(this);
for (ContainerStatus status : statuses) {
Task task = getTask(status.getContainerId());
if (task == null) {
// Will occur if a container was allocated but rejected.
// Any other occurrence is unexpected and an error.
LOG.warn("Container completed but no associated task state: "
+ status.getContainerId());
continue;
}
context.setTask(task);
context.getState().containerCompleted(context, status);
}
checkStatus();
}
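/**
* Reports overall progress as the fraction of completed work summed across
* all schedulers. Returns 1 when there is no work to report, which also
* avoids a divide by zero.
*/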
@Override
public synchronized float getProgress() {
int numerator = 0;
int denominator = 0;
for (SchedulerStateActions group : taskPools.values()) {
Scheduler sched = group.getScheduler();
int[] progress = sched.getProgress();
numerator += progress[0];
denominator += progress[1];
}
if (denominator == 0) {
return 1;
}
return (float) numerator / (float) denominator;
}
@Override
public synchronized void stopTaskFailed(ContainerId containerId,
Throwable t) {
Task task = getTask(containerId);
if (task == null) {
return;
}
EventContext context = new EventContext(this, task);
context.getState().stopTaskFailed(context, t);
}
@Override
public synchronized void resizeDelta(int delta) {
// TODO: offer the delta to each scheduler in turn.
// For now, we support only one scheduler.
prioritizedGroups.get(0).getScheduler().change(delta);
}
@Override
public synchronized int resizeTo(int n) {
// TODO: distribute the requested size across schedulers in turn.
// For now, we support only one scheduler.
return prioritizedGroups.get(0).getScheduler().resize(n);
}
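/**
* Starts a graceful shutdown: moves the controller to the ENDING state, asks
* each task pool to shut down its tasks, and checks whether completion has
* already been reached.
*/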
@Override
public synchronized void shutDown() {
LOG.info("Shut down request received");
this.state = State.ENDING;
EventContext context = new EventContext(this);
for (SchedulerStateActions group : prioritizedGroups) {
group.shutDown(context);
}
checkStatus();
}
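/**
* Blocks the calling thread (the AM's main thread) until the cluster run
* completes: either all tasks finish (batch) or the cluster is explicitly
* shut down (persistent tasks). Logs a summary of YARN resources before
* waiting.
*
* @return true if the run ended in the ENDED state, false if it failed
*/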
@Override
public boolean waitForCompletion() {
start();
synchronized (completionMutex) {
try {
completionMutex.wait();
LOG.info("Controller shut down completed");
} catch (InterruptedException e) {
// Should not happen
}
}
return succeeded();
}
private void start() {
yarnReport();
}
private void yarnReport() {
RegisterApplicationMasterResponse response = yarn.getRegistrationResponse();
LOG.info("YARN queue: " + response.getQueue());
Resource resource = response.getMaximumResourceCapability();
LOG.info("YARN max resource: " + resource.getMemory() + " MB, "
+ resource.getVirtualCores() + " cores");
EnumSet<SchedulerResourceTypes> types = response
.getSchedulerResourceTypes();
StringBuilder buf = new StringBuilder();
String sep = "";
for (SchedulerResourceTypes type : types) {
buf.append(sep);
buf.append(type.toString());
sep = ", ";
}
LOG.info("YARN scheduler resource types: " + buf.toString());
}
/**
* Check for overall completion. We are done when we've either successfully
* run all tasks or given up on some of them; that is, when every task pool
* reports that it is done.
*/
private void checkStatus() {
if (state != State.ENDING) {
return;
}
for (SchedulerStateActions group : prioritizedGroups) {
if (!group.isDone()) {
return;
}
}
terminate(State.ENDED);
}
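/**
* Records the terminal state and wakes the thread blocked in
* {@link #waitForCompletion()}.
*/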
private void terminate(State state) {
this.state = state;
synchronized (completionMutex) {
completionMutex.notify();
}
}
public boolean isLive() {
return state == State.LIVE;
}
public boolean succeeded() {
return state == State.ENDED;
}
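/**
* Records the container-to-task mapping for a newly allocated task so that
* later YARN callbacks for the container can be routed back to the task.
*/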
public void containerAllocated(Task task) {
activeContainers.put(task.getContainerId(), task);
}
public AMYarnFacade getYarn() {
return yarn;
}
public void containerReleased(Task task) {
activeContainers.remove(task.getContainerId());
}
public void taskEnded(Task task) {
completedTasks.add(task);
}
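/**
* Records a failed attempt in the completed-task history before the task is
* retried. A static copy of the task is stored so the history preserves the
* state of the failed attempt.
*/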
public void taskRetried(Task task) {
Task copy = task.copy();
copy.disposition = Task.Disposition.RETRIED;
completedTasks.add(copy);
}
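/**
* Called when a task pool reports that all of its tasks have completed;
* re-checks whether the controller as a whole is now done.
*/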
public void taskGroupCompleted(SchedulerStateActions taskGroup) {
checkStatus();
}
public int getMaxRetries() {
return maxRetries;
}
public int getStopTimeoutMs() {
return stopTimeoutMs;
}
@Override
public synchronized void reserveHost(String hostName) {
nodeInventory.reserve(hostName);
}
@Override
public synchronized void releaseHost(String hostName) {
nodeInventory.release(hostName);
}
public NodeInventory getNodeInventory() {
return nodeInventory;
}
@Override
public void setProperty(String key, Object value) {
properties.put(key, value);
}
@Override
public Object getProperty(String key) {
return properties.get(key);
}
@Override
public void registerLifecycleListener(TaskLifecycleListener listener) {
lifecycleListeners.add(listener);
}
public void fireLifecycleChange(Event event, EventContext context) {
for (TaskLifecycleListener listener : lifecycleListeners) {
listener.stateChange(event, context);
}
}
@Override
public void setMaxRetries(int value) {
maxRetries = value;
}
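/**
* Returns the total number of tasks the cluster should be running: the sum
* of the target levels of all schedulers.
*/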
@Override
public int getTargetCount() {
int count = 0;
for (SchedulerStateActions group : prioritizedGroups) {
count += group.getScheduler().getTarget();
}
return count;
}
public State getState() {
return state;
}
@Override
public synchronized void visit(ControllerVisitor visitor) {
visitor.visit(this);
}
public List<SchedulerStateActions> getPools() {
return prioritizedGroups;
}
@Override
public synchronized void visitTasks(TaskVisitor visitor) {
for (SchedulerStateActions pool : prioritizedGroups) {
pool.visitTaskModels(visitor);
}
}
public List<Task> getHistory() {
return completedTasks;
}
@Override
public boolean isTaskLive(int id) {
for (SchedulerStateActions group : prioritizedGroups) {
Task task = group.getTask(id);
if (task != null) {
return task.isLive();
}
}
return false;
}
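/**
* Cancels the task with the given ID, if it exists, and reduces the owning
* scheduler's target size by one so that a replacement is not immediately
* started.
*
* @return true if the task was found and cancelled, false otherwise
*/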
@Override
public synchronized boolean cancelTask(int id) {
for (SchedulerStateActions group : prioritizedGroups) {
Task task = group.getTask(id);
if (task != null) {
group.cancel(task);
group.getScheduler().change(-1);
return true;
}
}
LOG.warn( "Requested to cancel task, but no task found: " + id );
return false;
}
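/**
* Records an acknowledgment that a task's completion has been noticed:
* forwards the ack into the task's state machine and, when a property key is
* given, removes that property from the task.
*/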
@Override
public synchronized void completionAck(Task task, String propertyKey) {
EventContext context = new EventContext(this);
context.setTask(task);
context.getState().completionAck(context);
if (propertyKey != null) {
task.properties.remove(propertyKey);
}
}
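/**
* Records an acknowledgment that a task's start has been noticed: optionally
* stores the given property value on the task, then forwards the start-ack
* event into the task's state machine.
*/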
@Override
public synchronized void startAck(Task task, String propertyKey,
Object value) {
if (propertyKey != null && value != null) {
task.properties.put(propertyKey, value);
}
EventContext context = new EventContext(this);
context.setTask(task);
context.getState().startAck(context);
}
@Override
public boolean supportsDiskResource() {
return getYarn().supportsDiskResource();
}
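/**
* Called when the task registry (such as the ZK monitor mentioned in the
* class comment) reports that it is down; the controller responds by
* shutting down the cluster.
*/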
@Override
public void registryDown() { shutDown(); }
}