drill-yarn/src/main/java/org/apache/drill/yarn/appMaster/AbstractDrillbitScheduler.java - drill - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.drill.yarn.appMaster;

 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.drill.yarn.zk.ZKRegistry;

 /**
  * Base class for schedulers (pools) for Drillbits. Derived classes implement
  * various policies for node selection. This class handles the common tasks such
  * as holding the Drillbit launch specification, providing Drillbit-specific
  * behaviors and so on.
  * <p>
  * The key purpose of this class is to abstract Drillbit-specific code from the
  * rest of the AM cluster controller. We do so for several reasons: ease of
  * testing (we can use mock tasks), ability to handle additional server types in
  * the future, and a way to keep each module focused on a single task (as the
  * controller and its state machine are complex enough without mixing in
  * Drillbit specifics).
  */

 public abstract class AbstractDrillbitScheduler
     extends PersistentTaskScheduler {
   /**
    * Task manager that supplies the Drillbit-specific behavior for this
    * scheduler. Ideally, this class would also provide a way to gracefully
    * shut down a Drillbit, but Drill has no API to do graceful shutdown in this
    * release. (The only graceful shutdown is by issuing a SIGTERM from the node
    * running the Drillbit, but YARN has no way to do this, despite active
    * discussions on several YARN JIRA entries.)
    */

   public class DrillbitManager extends AbstractTaskManager {
     /**
      * Allow only one concurrent container request by default so that the
      * node blacklist mechanism can keep the RM from allocating two
      * containers on the same node.
      *
      * @return the maximum number of concurrent allocations, always 1
      */

     @Override
     public int maxConcurrentAllocs() {
       return 1;
     }

     /**
      * Reserves, in the controller's node inventory, the container that was
      * just allocated to the task.
      *
      * @param context event context carrying the task and the controller
      */

     @Override
     public void allocated(EventContext context) {

       // One drillbit per node, so reserve the node
       // just allocated.

       context.controller.getNodeInventory().reserve(context.task.container);
     }

     /**
      * Releases the task's container in the node inventory (when the task
      * has one) and then analyzes why the task ended.
      *
      * @param context event context carrying the task and the controller
      */

     @Override
     public void completed(EventContext context) {
       // This method is called for all completed tasks, even those that
       // completed (were cancelled) before a container was allocated.
       // If we have no container, then we have nothing to tell the
       // node inventory.

       if (context.task.container != null) {
         context.controller.getNodeInventory().release(context.task.container);
       }
       analyzeResult(context);
     }

     /**
      * Reports whether the task is registered with the {@link ZKRegistry}
      * stored on the controller under
      * {@link ZKRegistry#CONTROLLER_PROPERTY}.
      *
      * @param context event context carrying the task and the controller
      * @return the result of {@code ZKRegistry.isRegistered()} for the task
      */

     @Override
     public boolean isLive(EventContext context) {
       // NOTE(review): assumes the registry property is always set; a
       // missing property would surface here as a NullPointerException.
       ZKRegistry reg = (ZKRegistry) context.controller.getProperty(ZKRegistry.CONTROLLER_PROPERTY);
       return reg.isRegistered(context.task);
     }

     /**
      * Analyze the result. Drillbits should not exit, but this one did. It might
      * be because we asked it to exit, which is fine. Otherwise, the exit is
      * unexpected and we should 1) provide the admin with an explanation, and 2)
      * prevent retries after a few tries.
      * <p>
      * A task that was not cancelled, never left the {@code NEW} tracking
      * state and has been tried at least twice is logged as an error, marked
      * cancelled, and its host is blacklisted in the node inventory.
      *
      * @param context event context carrying the completed task and the
      *          controller
      */

     private void analyzeResult(EventContext context) {
       Task task = context.task;

       // We cancelled the Drillbit ourselves, so the exit is expected
       // and there is nothing to analyze. (completed() has already
       // released the container, if there was one.)

       if (task.isCancelled()) {
         return;
       }

       // The Drillbit stopped on its own. Gather the exit status and
       // the run time; both are only used in the error report below.
       // The report labels the duration "secs.", so uptime() is
       // presumably in milliseconds.

       int exitCode = task.completionStatus.getExitStatus();
       long duration = task.uptime() / 1000;

       // Any tracking state other than NEW is treated as evidence that
       // the Drillbit registered in ZK.

       boolean registered = task.trackingState != Task.TrackingState.NEW;

       // A Drillbit that registered, or one that has been tried fewer
       // than two times, is left to the default retry policy.

       if (registered || task.getTryCount() < 2) {
         return;
       }

       // Seems to be a mis-configuration. The Drillbit never registered
       // in ZK (registered is always false from here on) and we've tried
       // twice now with no luck. Assume the node is bad.

       String hostName = task.getHostName();
       StringBuilder buf = new StringBuilder();
       buf.append(task.getLabel()).append(" on host ").append(hostName)
           .append(" failed with status ").append(exitCode).append(" after ")
           .append(duration).append(" secs. without ZK registration");

       // A run of under a minute points at a start-up problem.

       if (duration < 60) {
         buf.append(
             "\n    Probable configuration problem, check Drill log file on host ")
             .append(hostName).append(".");
       }
       LOG.error(buf.toString());

       // Flag the task as cancelled; per the method contract this is
       // what prevents further retries (presumably honored by the
       // retry logic elsewhere -- confirm).

       task.cancelled = true;

       // Mark the host as permanently blacklisted. Leave it
       // in YARN's blacklist.

       context.controller.getNodeInventory().blacklist(hostName);
     }
   }

   private static final Log LOG = LogFactory
       .getLog(AbstractDrillbitScheduler.class);

   /**
    * Creates the scheduler, flags it as tracked, and installs a
    * {@link DrillbitManager} as its task manager.
    *
    * @param type scheduler type, passed to the superclass
    * @param name scheduler name, passed to the superclass
    * @param quantity quantity, passed to the superclass
    */

   public AbstractDrillbitScheduler(String type, String name, int quantity) {
     super(type, name, quantity);
     isTracked = true;
     setTaskManager(new DrillbitManager());
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.drill.yarn.appMaster;

	import org.apache.commons.logging.Log;
	import org.apache.commons.logging.LogFactory;
	import org.apache.drill.yarn.zk.ZKRegistry;

	/**
	 * Base class for schedulers (pools) for Drillbits. Derived classes implement
	 * various policies for node selection. This class handles the common tasks such
	 * as holding the Drillbit launch specification, providing Drillbit-specific
	 * behaviors and so on.
	 * <p>
	 * The key purpose of this class is to abstract Drillbit-specific code from the
	 * rest of the AM cluster controller. We do so for several reasons: ease of
	 * testing (we can use mock tasks), ability to handle additional server types in
	 * the future, and a way to keep each module focused on a single task (as the
	 * controller and its state machine are complex enough without mixing in
	 * Drillbit specifics).
	 */

	public abstract class AbstractDrillbitScheduler
	extends PersistentTaskScheduler {
	/**
	* Interface to provide Drill-bit specific behavior. Ideally, this class would
	* provide the interface to gracefully shut down a Drillbit, but Drill has no
	* API to do graceful shutdown in this release. (The only graceful shutdown is
	* by issuing a SIGTERM from the node runing the Drillbit, but YARN has no way
	* to do this, despite active discussions on several YARN JIRA entries.
	*/

	public class DrillbitManager extends AbstractTaskManager {
	/**
	* Allow only one concurrent container request by default to ensure that the
	* node blacklist mechanism works to ensure that the RM does not allocate
	* two containers on the same node.
	*/

	@Override
	public int maxConcurrentAllocs() {
	return 1;
	}

	@Override
	public void allocated(EventContext context) {

	// One drillbit per node, so reserve the node
	// just allocated.

	context.controller.getNodeInventory().reserve(context.task.container);
	}

	@Override
	public void completed(EventContext context) {
	// This method is called for all completed tasks, even those that
	// completed (were cancelled) before a container was allocated.
	// If we have no container, then we have nothing to tell the
	// node inventory.

	if (context.task.container != null) {
	context.controller.getNodeInventory().release(context.task.container);
	}
	analyzeResult(context);
	}

	@Override
	public boolean isLive(EventContext context) {
	ZKRegistry reg = (ZKRegistry) context.controller.getProperty(ZKRegistry.CONTROLLER_PROPERTY);
	return reg.isRegistered(context.task);
	}

	/**
	* Analyze the result. Drillbits should not exit, but this one did. It might
	* be because we asked it to exit, which is fine. Otherwise, the exit is
	* unexpected and we should 1) provide the admin with an explanation, and 2)
	* prevent retries after a few tries.
	*
	* @param context
	*/

	private void analyzeResult(EventContext context) {
	Task task = context.task;

	// If we cancelled the Drill-bit, just unblacklist the
	// host so we can run another drillbit on it later.

	if (task.isCancelled()) {
	return;
	}

	// The Drill-bit stopped on its own.
	// Maybe the exit status will tell us something.

	int exitCode = task.completionStatus.getExitStatus();

	// We can also consider the runtime.

	long duration = task.uptime() / 1000;

	// The ZK state may also help.

	boolean registered = task.trackingState != Task.TrackingState.NEW;

	// If the exit code was 1, then the script probably found
	// an error. Only retry once.

	if (registered \|\| task.getTryCount() < 2) {

	// Use the default retry policy.

	return;
	}

	// Seems to be a mis-configuration. The Drill-bit exited quickly and
	// did not register in ZK. Also, we've tried twice now with no luck.
	// Assume the node is bad.

	String hostName = task.getHostName();
	StringBuilder buf = new StringBuilder();
	buf.append(task.getLabel()).append(" on host ").append(hostName)
	.append(" failed with status ").append(exitCode).append(" after ")
	.append(duration).append(" secs. with");
	if (!registered) {
	buf.append("out");
	}
	buf.append(" ZK registration");
	if (duration < 60 && !registered) {
	buf.append(
	"\n Probable configuration problem, check Drill log file on host ")
	.append(hostName).append(".");
	}
	LOG.error(buf.toString());
	task.cancelled = true;

	// Mark the host as permanently blacklisted. Leave it
	// in YARN's blacklist.

	context.controller.getNodeInventory().blacklist(hostName);
	}
	}

	private static final Log LOG = LogFactory
	.getLog(AbstractDrillbitScheduler.class);

	public AbstractDrillbitScheduler(String type, String name, int quantity) {
	super(type, name, quantity);
	isTracked = true;
	setTaskManager(new DrillbitManager());
	}
	}