uima-ducc-examples/src/main/scripts/prepare - uima-ducc - Git at Google

 #!/usr/bin/env python
 # -----------------------------------------------------------------------
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 # -----------------------------------------------------------------------

 #
 # This script reads a test configuration and updates the test job descriptions
 # accordingly.
 #
 # The job desciprions contain a collection of "elapsed times", one for each
 # workitem.  Each AE sleeps for this time to simulate compuing.  The driver
 # program, "runducc" submits the jobs accordng to a schedule generated by
 # this program.
 #
 # Each simulated job contains a simulated "start time" based on actual
 # usage in a development cluster.  This is used to calculate a total elapsed
 # time for the run.  The test configuration specifies a comprssion rate and
 # spread, used thus:
 #    Each actual elapsesd time for the job is divided by the compression rate.
 #    The set of jobs is submitted with random spacing over the time specified
 #       by the spread.
 # Knowing the original time span for the jobs, it is possible to execute a
 # compressed run somewhat similar to the original.
 #
 # The configuration file also allows weighted-random selection of job memories,
 # job classes, and service assignments (for running service tests).
 #
 import os
 import sys
 import getopt
 import random

 DUCC_HOME = os.path.abspath(__file__ + '/../../..')
 sys.path.append(DUCC_HOME + '/admin')

 from ducc_util  import DuccUtil
 from properties import Properties
 from ducc       import Ducc

 class Prepare(DuccUtil):

     def usage(self, msg):
         if ( msg != None ):
             print msg

         print 'Usage:'
         print '   prepare.py <properties>'
         sys.exit(1)

     def error(self, *str):
         print ' '.join(str)
         sys.exit(1)

     def toint(self, props, name, dflt=None):
         val = props.get(name)
         if ( val == None ):
             if ( dflt == None ):
                 self.error("Missing required property", name)
             else:
                 return dflt

         try:
             return int(val)
         except:
             self.error("Property", name, "is not an int:", val)

     def toboolean(self, props, name, dflt=None):
         if ( str == None ):
             return deflt
         return str in ['t', 'T', 'true', 'True', 'TRUE','y', 'Y', 'yes', 'Yes', 'YES']

     def toarray(self, props, name):
         '''
         Read 'name' from props, whose value is blank delimeted strings.  Each string is
         used as an associative index 'ndx' into a map.  Construct the index
         'name.ndx' for each index, look up the value in the props file, and return a
         map (dictionary) of the values.
         e.g the properties below this map: {'37': '50', '28': '50'}
            job.memory                  = 28 37
            job.memory.28               = 50
            job.memory.37               = 50

         '''
         stem = props.get(name)
         if ( stem == None ):
             self.error('No such property:', name)
         vals = stem.split()

         ret = {}
         for val in vals:
             d = props.get(name + '.' + val)
             if ( d == None ):
                 self.error('No such property:', d)
             ret[val] = d

         return ret

     def show(self, vals, tag):
         tally = {}
         for v in vals:
             if ( tally.has_key(v) ):
                 tally[v] = tally[v] + 1
             else:
                 tally[v] = 1

         for k in tally.keys():
             print '%10s tally: %12s --> %d' % (tag, k, tally[k])

     def distributeParameters(self, parmset, count, tag):
         '''
         Randomly assign the values in the memory set to the jobs.  Otherwise use the supplied values.

         parmset is a map where the keys are the set of memory values to set, and the values
                 are the weights used to distribute the memories among the jobs.

         count   is the number of things we need to distribute the stuff in parmset over
         '''


         denom = 0
         for v in parmset.values():
             denom = denom + int(v)

         ndx = 0
         tmp = []
         ovfl = []

         # set up an array (list) with the target values according the their configured distribution
         for k in parmset.keys():
             num = float(parmset[k])
             ovfl.append(k)
             val = ((num / denom) * count)
             bound = int(round(val))
             for j in range(0, bound):
                 tmp.append(k)
                 ndx = ndx + 1

         # deal with leftovers (non-integral solution to the loop above)
         if ( ndx < count ):
             while ( ndx < count ):
                 x = ovfl[ random.randint(0, (len(ovfl)-1)) ]
                 tmp.append(x)
                 ndx = ndx + 1

         # gotta love python - now randomly shuffle the values
         random.shuffle(tmp)
         self.show(tmp, tag)
         return tmp

     def writeControlFile(self, allfiles):
         '''
         allfiles  is a list of tuples.  Each tuple is (submittod, filename).  The list is
                   sorted by tod so all we have to do is bop through it picking up files in the
                   right order.  We calculate the "spread" - the approximate elapsed time of the test
                   from the submit tod and compression and write the control file accordingly.
         '''
         size = len(allfiles)
         files_per_interval = self.spread / size
         range = files_per_interval * 2
         total = 0
         print 'Size', size, 'files-per-interval', files_per_interval, 'range', range

         outf = open(self.testdir + '/job.ctl', 'w')
         for (tod, fname) in allfiles:
             outf.write('s -c ' + str(self.compression) + ' ' + fname + '\n')
             delay = random.randint(0, range)
             outf.write('[sleep ' + str(delay) + 'S]\n\n')
             total += delay

         outf.close()
         print 'Spread:', self.spread, 'actual:', total

     def run(self):
         print 'Prepare starts...'

         if ( self.randomseed == 'TOD' ):
             random.seed()
         else:
             random.seed(int(self.randomseed))

         files = os.listdir(self.srcdir)
         count = len(files)
         mem_assignments = self.distributeParameters(self.memory, count, 'memory')
         class_assignments = self.distributeParameters(self.classes, count, 'class')
         if ( self.services != None ):
             service_assignments = self.distributeParameters(self.services, count, 'services')

         if ( not os.path.exists(self.destdir) ):
             os.makedirs(self.destdir)

         ndx = 0
         allfiles = []
         for f in files:
             props = Properties()
             props.load(self.srcdir + '/' + f)
             cls_assignment = class_assignments[ndx]

             mem_override = self.maxmem[cls_assignment]
             if ( mem_override == None ):
                 props.put('memory', mem_assignments[ndx])
             else:
                 props.put('memory', mem_override)

             props.put('class' , cls_assignment)
             if ( self.services != None ):
                 props.put('services', service_assignments[ndx])

             process_override = self.maxproc[cls_assignment]
             if ( process_override != None ):                    # force override of max_processes
                 props.put('machines', process_override)

             type_override = self.type[cls_assignment]
             if ( type_override != None ):                    # force override of max_processes
                 props.put('type', 'reserve')

             fname = self.destdir + '/' + f
             allfiles.append( (int(props.get('tod')), fname) )
             props.write(self.destdir + '/' + f)
             ndx = ndx + 1

         self.writeControlFile(sorted(allfiles))

     def from_keys(self, props, base, stem):
         '''
            Given a base dictionary, "base", use its keyset to form a property from the
            stem and look for it in props.

            Assign None as the value if not found.
         '''
         ret = {}
         for bk in base.keys():
             k = stem + '.' + bk
             print 'LOOKING FOR', k
             if ( props.has_key(k) ):
                 ret[bk] = props.get(k)
             else:
                 ret[bk] = None
         return ret


     def main(self, argv):

         if ( len(argv) < 1 ):
             self.usage("Missing 'prepare' properties.")
         inprops = argv[0]

         if inprops in ('-h', '-?', '--help', '-help'):
             self.usage(None)

         self.testdir = os.path.dirname(inprops)
         props = Properties()
         props.load(inprops)

         self.srcdir       = self.testdir + '/' + props.get('src.dir')
         self.destdir      = self.testdir + '/' + props.get('dest.dir')
         self.spread       = self.toint    (props, 'submission.spread')
         self.compression  = self.toint    (props, 'compression' , 1)
         self.classes      = self.toarray  (props, 'scheduling.classes')
         self.maxproc      = self.from_keys(props, self.classes, 'scheduling.maxproc')
         self.maxmem       = self.from_keys(props, self.classes, 'scheduling.maxmem')
         self.type         = self.from_keys(props, self.classes, 'scheduling.type')
         self.memory       = self.toarray  (props, 'job.memory')
         self.randomseed    = props.get('random.seed')

         if ( props.get('job.services') == None ):
             self.services = None
         else:
             self.services     = self.toarray  (props, 'job.services')

         print 'Running with'
         print '   properties           :', inprops
         print '   testdir              :', self.testdir
         print '   srcdir               :', self.srcdir
         print '   destdir              :', self.destdir
         print '   spread               :', self.spread
         print '   compression          :', self.compression
         print '   classes              :', self.classes
         print '   maxproc              :', self.maxproc
         print '   maxmem               :', self.maxmem
         print '   type                 :', self.type
         print '   memory               :', self.memory
         print '   services             :', self.services
         print '   randomseed           :', self.randomseed

         self.run()

 if __name__ == "__main__":
     prepare = Prepare()
     prepare.main(sys.argv[1:])
	#!/usr/bin/env python
	# -----------------------------------------------------------------------
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.
	# -----------------------------------------------------------------------

	#
	# This script reads a test configuration and updates the test job descriptions
	# accordingly.
	#
	# The job desciprions contain a collection of "elapsed times", one for each
	# workitem. Each AE sleeps for this time to simulate compuing. The driver
	# program, "runducc" submits the jobs accordng to a schedule generated by
	# this program.
	#
	# Each simulated job contains a simulated "start time" based on actual
	# usage in a development cluster. This is used to calculate a total elapsed
	# time for the run. The test configuration specifies a comprssion rate and
	# spread, used thus:
	# Each actual elapsesd time for the job is divided by the compression rate.
	# The set of jobs is submitted with random spacing over the time specified
	# by the spread.
	# Knowing the original time span for the jobs, it is possible to execute a
	# compressed run somewhat similar to the original.
	#
	# The configuration file also allows weighted-random selection of job memories,
	# job classes, and service assignments (for running service tests).
	#
	import os
	import sys
	import getopt
	import random

	DUCC_HOME = os.path.abspath(__file__ + '/../../..')
	sys.path.append(DUCC_HOME + '/admin')

	from ducc_util import DuccUtil
	from properties import Properties
	from ducc import Ducc

	class Prepare(DuccUtil):

	def usage(self, msg):
	if ( msg != None ):
	print msg

	print 'Usage:'
	print ' prepare.py <properties>'
	sys.exit(1)

	def error(self, *str):
	print ' '.join(str)
	sys.exit(1)

	def toint(self, props, name, dflt=None):
	val = props.get(name)
	if ( val == None ):
	if ( dflt == None ):
	self.error("Missing required property", name)
	else:
	return dflt

	try:
	return int(val)
	except:
	self.error("Property", name, "is not an int:", val)

	def toboolean(self, props, name, dflt=None):
	if ( str == None ):
	return deflt
	return str in ['t', 'T', 'true', 'True', 'TRUE','y', 'Y', 'yes', 'Yes', 'YES']

	def toarray(self, props, name):
	'''
	Read 'name' from props, whose value is blank delimeted strings. Each string is
	used as an associative index 'ndx' into a map. Construct the index
	'name.ndx' for each index, look up the value in the props file, and return a
	map (dictionary) of the values.
	e.g the properties below this map: {'37': '50', '28': '50'}
	job.memory = 28 37
	job.memory.28 = 50
	job.memory.37 = 50

	'''
	stem = props.get(name)
	if ( stem == None ):
	self.error('No such property:', name)
	vals = stem.split()

	ret = {}
	for val in vals:
	d = props.get(name + '.' + val)
	if ( d == None ):
	self.error('No such property:', d)
	ret[val] = d

	return ret

	def show(self, vals, tag):
	tally = {}
	for v in vals:
	if ( tally.has_key(v) ):
	tally[v] = tally[v] + 1
	else:
	tally[v] = 1

	for k in tally.keys():
	print '%10s tally: %12s --> %d' % (tag, k, tally[k])

	def distributeParameters(self, parmset, count, tag):
	'''
	Randomly assign the values in the memory set to the jobs. Otherwise use the supplied values.

	parmset is a map where the keys are the set of memory values to set, and the values
	are the weights used to distribute the memories among the jobs.

	count is the number of things we need to distribute the stuff in parmset over
	'''


	denom = 0
	for v in parmset.values():
	denom = denom + int(v)

	ndx = 0
	tmp = []
	ovfl = []

	# set up an array (list) with the target values according the their configured distribution
	for k in parmset.keys():
	num = float(parmset[k])
	ovfl.append(k)
	val = ((num / denom) * count)
	bound = int(round(val))
	for j in range(0, bound):
	tmp.append(k)
	ndx = ndx + 1

	# deal with leftovers (non-integral solution to the loop above)
	if ( ndx < count ):
	while ( ndx < count ):
	x = ovfl[ random.randint(0, (len(ovfl)-1)) ]
	tmp.append(x)
	ndx = ndx + 1

	# gotta love python - now randomly shuffle the values
	random.shuffle(tmp)
	self.show(tmp, tag)
	return tmp

	def writeControlFile(self, allfiles):
	'''
	allfiles is a list of tuples. Each tuple is (submittod, filename). The list is
	sorted by tod so all we have to do is bop through it picking up files in the
	right order. We calculate the "spread" - the approximate elapsed time of the test
	from the submit tod and compression and write the control file accordingly.
	'''
	size = len(allfiles)
	files_per_interval = self.spread / size
	range = files_per_interval * 2
	total = 0
	print 'Size', size, 'files-per-interval', files_per_interval, 'range', range

	outf = open(self.testdir + '/job.ctl', 'w')
	for (tod, fname) in allfiles:
	outf.write('s -c ' + str(self.compression) + ' ' + fname + '\n')
	delay = random.randint(0, range)
	outf.write('[sleep ' + str(delay) + 'S]\n\n')
	total += delay

	outf.close()
	print 'Spread:', self.spread, 'actual:', total

	def run(self):
	print 'Prepare starts...'

	if ( self.randomseed == 'TOD' ):
	random.seed()
	else:
	random.seed(int(self.randomseed))

	files = os.listdir(self.srcdir)
	count = len(files)
	mem_assignments = self.distributeParameters(self.memory, count, 'memory')
	class_assignments = self.distributeParameters(self.classes, count, 'class')
	if ( self.services != None ):
	service_assignments = self.distributeParameters(self.services, count, 'services')

	if ( not os.path.exists(self.destdir) ):
	os.makedirs(self.destdir)

	ndx = 0
	allfiles = []
	for f in files:
	props = Properties()
	props.load(self.srcdir + '/' + f)
	cls_assignment = class_assignments[ndx]

	mem_override = self.maxmem[cls_assignment]
	if ( mem_override == None ):
	props.put('memory', mem_assignments[ndx])
	else:
	props.put('memory', mem_override)

	props.put('class' , cls_assignment)
	if ( self.services != None ):
	props.put('services', service_assignments[ndx])

	process_override = self.maxproc[cls_assignment]
	if ( process_override != None ): # force override of max_processes
	props.put('machines', process_override)

	type_override = self.type[cls_assignment]
	if ( type_override != None ): # force override of max_processes
	props.put('type', 'reserve')

	fname = self.destdir + '/' + f
	allfiles.append( (int(props.get('tod')), fname) )
	props.write(self.destdir + '/' + f)
	ndx = ndx + 1

	self.writeControlFile(sorted(allfiles))

	def from_keys(self, props, base, stem):
	'''
	Given a base dictionary, "base", use its keyset to form a property from the
	stem and look for it in props.

	Assign None as the value if not found.
	'''
	ret = {}
	for bk in base.keys():
	k = stem + '.' + bk
	print 'LOOKING FOR', k
	if ( props.has_key(k) ):
	ret[bk] = props.get(k)
	else:
	ret[bk] = None
	return ret


	def main(self, argv):

	if ( len(argv) < 1 ):
	self.usage("Missing 'prepare' properties.")
	inprops = argv[0]

	if inprops in ('-h', '-?', '--help', '-help'):
	self.usage(None)

	self.testdir = os.path.dirname(inprops)
	props = Properties()
	props.load(inprops)

	self.srcdir = self.testdir + '/' + props.get('src.dir')
	self.destdir = self.testdir + '/' + props.get('dest.dir')
	self.spread = self.toint (props, 'submission.spread')
	self.compression = self.toint (props, 'compression' , 1)
	self.classes = self.toarray (props, 'scheduling.classes')
	self.maxproc = self.from_keys(props, self.classes, 'scheduling.maxproc')
	self.maxmem = self.from_keys(props, self.classes, 'scheduling.maxmem')
	self.type = self.from_keys(props, self.classes, 'scheduling.type')
	self.memory = self.toarray (props, 'job.memory')
	self.randomseed = props.get('random.seed')

	if ( props.get('job.services') == None ):
	self.services = None
	else:
	self.services = self.toarray (props, 'job.services')

	print 'Running with'
	print ' properties :', inprops
	print ' testdir :', self.testdir
	print ' srcdir :', self.srcdir
	print ' destdir :', self.destdir
	print ' spread :', self.spread
	print ' compression :', self.compression
	print ' classes :', self.classes
	print ' maxproc :', self.maxproc
	print ' maxmem :', self.maxmem
	print ' type :', self.type
	print ' memory :', self.memory
	print ' services :', self.services
	print ' randomseed :', self.randomseed

	self.run()

	if __name__ == "__main__":
	prepare = Prepare()
	prepare.main(sys.argv[1:])