#!/usr/bin/python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
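#
# Launcher for Scrunch jobs: given a Scala or Java source file defining an
# object that extends PipelineApp, this script compiles it against the
# Scrunch distribution jars, packages it into a jar, and submits it to
# Hadoop. The usage implied by the argument handling below is:
#   <this script> <JobFile.(scala|java) | fully.qualified.JobName> [job args...]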
import glob
import os
import re
import shutil
import subprocess
import sys
# Configuration in script
##############################################################
if not "SCALA_HOME" in os.environ:
sys.stderr.write("Environment variable SCALA_HOME must be set\n")
sys.exit(1)
SCALA_LIB = os.path.join(os.environ["SCALA_HOME"], "lib")
if not "HADOOP_HOME" in os.environ:
sys.stderr.write("Environment variable HADOOP_HOME must be set\n")
sys.exit(1)
HADOOP_HOME = os.environ["HADOOP_HOME"]
HADOOP_JARS = ":".join(glob.glob(os.path.join(HADOOP_HOME, "*.jar")))
# Get the absolute path of the original (non-symlink) file.
if os.path.islink(__file__):
    ORIGINAL_FILE = os.readlink(__file__)
else:
    ORIGINAL_FILE = __file__
DIST_ROOT = os.path.abspath(os.path.join(os.path.dirname(ORIGINAL_FILE), ".."))
LIB_DIR = os.path.join(DIST_ROOT, "lib")  # Dir with all Scrunch dependencies.
TMPDIR = "/tmp"
BUILDDIR = TMPDIR + "/script-build"
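# Invoke the Scala compiler's main class directly from java, with the Scala
# library and compiler jars on the classpath.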
COMPILE_CMD = "java -cp %s/scala-library.jar:%s/scala-compiler.jar -Dscala.home=%s scala.tools.nsc.Main" % (SCALA_LIB, SCALA_LIB, SCALA_LIB)
##############################################################
argv = sys.argv[1:]
if len(argv) < 1:
    sys.stderr.write("ERROR: insufficient args.\n")
    sys.exit(1)
JOBFILE = argv.pop(0)
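# Return the source extension ("scala" or "java") of JOBFILE, or None when
# the argument is not a source file (e.g. a fully-qualified class name).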
def file_type():
    m = re.search(r'\.(scala|java)$', JOBFILE)
    if m:
        return m.group(1)
    return None
def is_file():
    return file_type() is not None
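# Regexes for the package declaration, an object extending PipelineApp,
# and the source-file extension.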
PACK_RE = r'package ([^;]+)'
OBJECT_RE = r'object\s+([^\s(]+).*(extends|with)\s+PipelineApp.*'
EXTENSION_RE = r'(.*)\.(scala|java)$'
# Get the name of the job from the file.
# The rule is: the last PipelineApp object in the file, or the one whose
# name matches the filename.
def get_job_name(file):
    package = ""
    job = None
    default = None
    match = re.search(EXTENSION_RE, file)
    if match:
        default = match.group(1)
        for s in open(file, "r"):
            mp = re.search(PACK_RE, s)
            mo = re.search(OBJECT_RE, s)
            if mp:
                package = mp.group(1).strip() + "."
            elif mo:
                if not job or not default or job.lower() != default.lower():
                    # use either the last object, or the one with the same name as the file
                    job = mo.group(1)
        if not job:
            raise Exception("Could not find job name")
        return "%s%s" % (package, job)
    else:
        return file
LIB_PATH = os.path.abspath(LIB_DIR)
if not os.path.exists(LIB_PATH):
    sys.stderr.write("Scrunch distribution lib directory not found; run mvn package to construct a distribution to run examples from.\n")
    sys.exit(1)
LIB_JARS = glob.glob(os.path.join(LIB_PATH, "*.jar"))
LIB_CP = ":".join(LIB_JARS)
JOBPATH = os.path.abspath(JOBFILE)
JOB = get_job_name(JOBFILE)
JOBJAR = JOB + ".jar"
JOBJARPATH = os.path.join(TMPDIR, JOBJAR)
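# The job jar needs rebuilding if it does not exist or is older than the
# job source file.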
def needs_rebuild():
    return not os.path.exists(JOBJARPATH) or os.stat(JOBJARPATH).st_mtime < os.stat(JOBPATH).st_mtime
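# Compile the job source into BUILDDIR and package the resulting classes
# into a jar at JOBJARPATH; the temporary build directory is removed when done.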
def build_job_jar():
    sys.stderr.write("compiling " + JOBFILE + "\n")
    if os.path.exists(BUILDDIR):
        shutil.rmtree(BUILDDIR)
    os.makedirs(BUILDDIR)
    cmd = "%s -classpath %s:%s -d %s %s" % (COMPILE_CMD, LIB_CP, HADOOP_JARS, BUILDDIR, JOBFILE)
    print(cmd)
    if subprocess.call(cmd, shell=True):
        shutil.rmtree(BUILDDIR)
        sys.exit(1)
    jar_cmd = "jar cf %s -C %s ." % (JOBJARPATH, BUILDDIR)
    subprocess.call(jar_cmd, shell=True)
    shutil.rmtree(BUILDDIR)
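# Build the shell command that submits the job jar through bin/hadoop;
# HADOOP_CLASSPATH is passed as an environment prefix so the hadoop
# process actually sees it.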
def hadoop_command():
    return "HADOOP_CLASSPATH=%s %s/bin/hadoop jar %s %s %s" % (LIB_CP, HADOOP_HOME, JOBJARPATH, JOB, " ".join(argv))
if is_file() and needs_rebuild():
    build_job_jar()
SHELL_COMMAND = hadoop_command()
print(SHELL_COMMAND)
os.system(SHELL_COMMAND)