/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.amaterasu.executor.execution.actions.runners.spark.PySpark

import java.io.{File, PrintWriter, StringWriter}
import java.util

import org.apache.amaterasu.common.configuration.ClusterConfig
import org.apache.amaterasu.common.execution.actions.Notifier
import org.apache.amaterasu.common.execution.dependencies.{PythonDependencies, PythonPackage}
import org.apache.amaterasu.common.logging.Logging
import org.apache.amaterasu.common.runtime.Environment
import org.apache.amaterasu.sdk.AmaterasuRunner
import org.apache.spark.SparkEnv
import org.apache.spark.sql.SparkSession

import scala.sys.process.Process
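
/**
 * Runs PySpark action sources: each source is handed to the shared PySparkEntryPoint
 * execution queue, and the results for that action are read back from its result queue
 * and reported through the Notifier.
 */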
class PySparkRunner extends AmaterasuRunner with Logging {

  var proc: Process = _
  var notifier: Notifier = _

  override def getIdentifier: String = "pyspark"

  override def executeSource(actionSource: String, actionName: String, exports: util.Map[String, String]): Unit = {
    interpretSources(actionSource, actionName, exports)
  }
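
  /**
   * Queues the source for execution by the Python interpreter process and blocks,
   * consuming this action's results until a completion (or error) result arrives.
   */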
  def interpretSources(source: String, actionName: String, exports: util.Map[String, String]): Unit = {

    PySparkEntryPoint.getExecutionQueue.setForExec((source, actionName, exports))
    val resQueue = PySparkEntryPoint.getResultQueue(actionName)

    notifier.info(s"================= started action $actionName =================")

    var res: PySparkResult = null
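
    // Drain the result queue until the completion marker for this action arrives;
    // an error result is reported through the notifier and aborts the action.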
    do {
      res = resQueue.getNext()
      res.resultType match {
        case ResultType.success =>
          notifier.success(res.statement)
        case ResultType.error =>
          notifier.error(res.statement, res.message)
          throw new Exception(res.message)
        case ResultType.completion =>
          notifier.info(s"================= finished action $actionName =================")
      }
    } while (res != null && res.resultType != ResultType.completion)
  }
}
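
/**
 * Companion factory: prepares the Python environment (installing Anaconda-managed
 * dependencies when any are declared), starts PySparkEntryPoint and launches the
 * spark_intp.py interpreter script through spark-submit.
 */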
object PySparkRunner {

  def apply(env: Environment,
            jobId: String,
            notifier: Notifier,
            spark: SparkSession,
            pypath: String,
            pyDeps: PythonDependencies,
            config: ClusterConfig): PySparkRunner = {

    //TODO: can we make this less ugly?
    var pysparkPython = "/usr/bin/python"
    if (pyDeps != null &&
        pyDeps.packages.nonEmpty) {
      loadPythonDependencies(pyDeps, notifier)
      pysparkPython = "miniconda/bin/python"
    }
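
    // Create the runner and start the shared entry point; the port it exposes is
    // passed to spark_intp.py on the command line.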
    val result = new PySparkRunner

    PySparkEntryPoint.start(spark, jobId, env, SparkEnv.get)
    val port = PySparkEntryPoint.getPort

    var intpPath = ""
    if (env.configuration.contains("cwd")) {
      val cwd = new File(env.configuration("cwd"))
      intpPath = s"${cwd.getAbsolutePath}/spark_intp.py" // This is to support the test environment
    } else {
      intpPath = s"spark_intp.py"
    }

    var pysparkPath = ""
    if (env.configuration.contains("pysparkPath")) {
      pysparkPath = env.configuration("pysparkPath")
    } else {
      pysparkPath = s"${config.spark.home}/bin/spark-submit"
    }
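
    // Launch spark_intp.py through spark-submit (or the configured pysparkPath), with
    // PYSPARK_PYTHON pointing at the Miniconda interpreter when dependencies were installed.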
    val proc = Process(Seq(pysparkPath, intpPath, port.toString), None,
      "PYTHONPATH" -> pypath,
      "PYSPARK_PYTHON" -> pysparkPython,
      "PYTHONHASHSEED" -> 0.toString) #> System.out

    proc.run()

    result.notifier = notifier
    result
  }

  /**
   * Installs the required Python dependencies.
   * We need two packages to make PySpark work with the customer's scripts:
   * 1. py4j - supplied by Spark, used for communication between the Python and Java runtimes.
   * 2. codegen - used to dynamically parse and convert the customer's scripts into executable Python code objects.
   *
   * Currently we only know how to install packages using Anaconda. The reason is third-party OS libraries
   * (e.g. libevent): Anaconda can automatically resolve the OS libraries a Python package requires and
   * install them alongside it.
   *
   * TODO - figure out if we really want to support pip directly, or if Anaconda is enough.
   *
   * @param deps     All of the customer-supplied Python dependencies; currently read from job-repo/deps/python.yml.
   * @param notifier Used to report environment-setup progress and failures.
   */
  private def loadPythonDependencies(deps: PythonDependencies, notifier: Notifier): Unit = {
    notifier.info("loading anaconda env")
    installAnacondaOnNode()
    val codegenPackage = PythonPackage("codegen", channel = Option("auto"))
    installAnacondaPackage(codegenPackage)
    try {
      deps.packages.foreach(pack => {
        pack.index.getOrElse("anaconda").toLowerCase match {
          case "anaconda" => installAnacondaPackage(pack)
          // case "pypi" => installPyPiPackage(pack) TODO: See if we can support this
        }
      })
    }
    catch {
      case rte: RuntimeException =>
        val sw = new StringWriter
        rte.printStackTrace(new PrintWriter(sw))
        notifier.error("", s"Failed to activate environment (runtime) - cause: ${rte.getCause}, message: ${rte.getMessage}, Stack: \n${sw.toString}")
      case e: Exception =>
        val sw = new StringWriter
        e.printStackTrace(new PrintWriter(sw))
        notifier.error("", s"Failed to activate environment (other) - type: ${e.getClass.getName}, cause: ${e.getCause}, message: ${e.getMessage}, Stack: \n${sw.toString}")
    }
  }

  /**
   * Installs a single Python package using Anaconda.
   * Anaconda works with multiple channels (in effect, repositories).
   * Normally, if a channel isn't specified, Anaconda fetches the package from the default conda channel.
   * We need channels because sometimes the required package doesn't exist on the default channel.
   *
   * @param pythonPackage A package definition parsed from the python.yml dependency file.
   */
  private def installAnacondaPackage(pythonPackage: PythonPackage): Unit = {
    val channel = pythonPackage.channel.getOrElse("anaconda")
    // Run conda through the node-local Miniconda installation, blocking until the install completes.
    if (channel == "anaconda") {
      Process(Seq("bash", "-c", s"$$PWD/miniconda/bin/python -m conda install -y ${pythonPackage.packageId}")).!
    } else {
      Process(Seq("bash", "-c", s"$$PWD/miniconda/bin/python -m conda install -y -c $channel ${pythonPackage.packageId}")).!
    }
  }

  /**
   * Installs Anaconda and then links it with the local spark that was installed on the executor.
   */
  private def installAnacondaOnNode(): Unit = {
    // These commands run in the executor's working directory, which is expected to contain
    // the Miniconda installer and the Spark 2.2.1 distribution.
    Process(Seq("bash", "-c", "sh Miniconda2-latest-Linux-x86_64.sh -b -p $PWD/miniconda")).!
    Process(Seq("bash", "-c", "$PWD/miniconda/bin/python -m conda install -y conda-build")).!
    Process(Seq("bash", "-c", "ln -s $PWD/spark-2.2.1-bin-hadoop2.7/python/pyspark $PWD/miniconda/pkgs/pyspark")).!
  }
}