| #!/usr/bin/env python3 |
| """ |
| gpcheckperf -- Check the hardware for Apache Cloudberry |
| |
| Usage: gpcheckperf <options> |
| |
| --version : print version information |
| -d segdir : where segdir is a directory on which to test I/O performance. |
| Multiple -d flags may be specified for multiple directories |
| on each host. These will typically be your Cloudberry primary |
| (and/or mirror) data directories. |
| e.g. -d /dbfast1 -d /dbfast2 |
| -? : print this help text |
| -v : enable verbose |
| -V : enable very verbose |
| -D : print per host disk stats on disk read/write tests |
| -r flags : where flags specifies the tests to run [default=dsN]. |
| Supported test flags are: |
| d: runs only disk tests |
| s: runs only stream benchmark |
| n: runs only netperf tests (serial) |
| N: runs only netperf tests (parallel) |
| M: runs only netperf tests (full matrix) |
| -B size : where size specifies the block size for disk performance tests |
| [default=32KB] e.g. 1KB, 4MB |
| -S size : where size specifies the file size for disk performance tests |
| [default=2X Memory] e.g. 500MB, 16GB |
| -h host : the host to connect to (multiple -h is okay) |
| -f file : a file listing all hosts to connect to |
| --duration : how long to run network test (default 15 seconds) |
| --netperf : use netperf instead of gpnetbenchServer/gpnetbenchClient |
| --buffer-size : the size of the send buffer in kilobytes ( default 8 kilobytes) |
| """ |
| |
| import datetime |
| import os |
| import sys |
| import re |
| |
| sys.path.append(sys.path[0] + '/lib') |
| try: |
| import getopt, math, io, stat, subprocess, signal |
| from gppylib.gpparseopts import OptParser |
| from gppylib.util import ssh_utils |
| from functools import reduce |
| except ImportError as e: |
| sys.exit('Error: unable to import module: ' + str(e)) |
| |
# GPHOME locates the gpssh/gpsync executables this script invokes
# ($GPHOME/bin/...), so refuse to start without it.
GPHOME = os.getenv('GPHOME')
if GPHOME is None:
    sys.exit('Please set GPHOME environment variable')

# The remote scratch directories are named gpcheckperf_$USER, so an empty or
# whitespace-only USER is as unusable as an unset one.
USER = os.getenv('USER')
if USER is None or not USER.strip():
    sys.exit('Please set USER environment variable')
| |
| |
def usage(exitarg):
    """Print the help text and terminate the program.

    exitarg is handed straight to sys.exit(); callers in this file pass 0
    for a plain help request or an error-message string for a usage error.
    """
    parser = OptParser()
    try:
        parser.print_help()
    except Exception:
        # Fall back to the module docstring when the parser cannot produce
        # help.  Only ordinary errors are caught so that Ctrl-C and
        # SystemExit are not silently swallowed.
        print(__doc__)
    sys.exit(exitarg)
| |
| |
class Global():
    """Holder for process-wide settings shared by every function below."""

    # File name of this script; shown by --version.
    script_name = os.path.basename(__file__)

    # Option table, pre-filled with defaults and overwritten by
    # parseCommandLine().  Keys are the literal command-line switches plus a
    # few derived entries ('--net', '--netserver', '--netclient').
    opt = {
        '-d': [],
        '-D': False,
        '-v': False,
        '-V': False,
        '-r': '',
        '-B': 1024 * 32,
        '-S': 0,
        '-h': [],
        '-f': None,
        '--duration': 15,
        '--net': None,
        '--netserver': 'gpnetbenchServer',
        '--netclient': 'gpnetbenchClient',
        '--buffer-size': 0,
    }


GV = Global()
| |
| |
def killall(procname):
    """Return a shell snippet that kills procname and always succeeds.

    pkill is tried first with only the first 15 characters of the name
    (presumably because the kernel truncates process names to that length --
    TODO confirm); killall -9 with the full name is the fallback.  All output
    is discarded and the trailing '|| true' forces a zero exit status.
    """
    short_name = procname[:15]
    template = '(pkill %s || killall -9 %s) > /dev/null 2>&1 || true'
    return template % (short_name, procname)
| |
| |
def strcmd(cmd):
    """Render an argument list as one space-separated string for display.

    Any argument containing a space is wrapped in single quotes so the
    printed command stays readable.  An empty list yields ''.  The result is
    for logging only; it is not a shell-safe quoting.
    """
    return ' '.join("'" + arg + "'" if ' ' in arg else arg for arg in cmd)
| |
| |
def gpssh(cmd, verbose):
    """Run cmd on the target hosts through $GPHOME/bin/gpssh.

    The hosts come from the host file in GV.opt['-f'] when one was given,
    otherwise from every entry of GV.opt['-h'].  verbose adds gpssh's own -v
    flag; the command line is echoed locally when GV.opt['-v'] is set.

    Returns (ok, out): ok is True when gpssh exited with status 0, out is the
    combined stdout/stderr decoded to text.
    """
    argv = ['%s/bin/gpssh' % GPHOME]
    if verbose:
        argv.append('-v')

    hostfile = GV.opt['-f']
    if hostfile:
        argv += ['-f', hostfile]
    else:
        for host in GV.opt['-h']:
            argv += ['-h', host]
    argv.append(cmd)

    if GV.opt['-v']:
        print('[Info]', strcmd(argv))

    child = subprocess.Popen(argv, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    output = child.stdout.read(-1).decode()
    status = child.wait()
    return status == 0, output
| |
| |
def gpsync(src, dst):
    """Copy src to dst on the target hosts through $GPHOME/bin/gpsync.

    The hosts come from the host file in GV.opt['-f'] when one was given,
    otherwise from every entry of GV.opt['-h'].  gpsync's -v flag is added in
    very-verbose mode (GV.opt['-V']); the command line is echoed locally when
    GV.opt['-v'] is set.

    Returns (ok, out): ok is True when gpsync exited with status 0, out is
    the combined stdout/stderr decoded to text (matching gpssh() and run(),
    so callers can embed it in messages without a b'...' wrapper).
    """
    c = ['%s/bin/gpsync' % GPHOME]
    if GV.opt['-V']:
        c.append('-v')
    if GV.opt['-f']:
        c.append('-f')
        c.append(GV.opt['-f'])
    else:
        for h in GV.opt['-h']:
            c.append('-h')
            c.append(h)
    c.append(src)
    c.append(dst)
    if GV.opt['-v']:
        print('[Info]', strcmd(c))
    p = subprocess.Popen(c, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    out = p.stdout.read(-1).decode()
    rc = p.wait()
    return not rc, out
| |
| |
def getPlatform():
    """Classify sys.platform as 'linux', 'darwin', or '?' for anything else."""
    for known in ('linux', 'darwin'):
        if known in sys.platform:
            return known
    return '?'
| |
| |
def getMemory():
    """Return the local machine's physical memory in bytes, or None.

    Linux: parses the MemTotal line of /proc/meminfo (expected to end in
    '<number> kB').  Darwin: parses the second field of
    '/usr/sbin/sysctl hw.physmem'.  None is returned when the platform is
    unsupported, the command fails, the output cannot be parsed, or the
    reported value is zero.
    """
    platform = getPlatform()

    if platform == 'linux':
        rc, out = run("sh -c 'cat /proc/meminfo | grep MemTotal'")
        if rc != 0:
            return None
        fields = out.split()
        try:
            val = int(fields[-2])
        except (IndexError, ValueError):
            # Unexpected output: report "unknown" instead of crashing.
            return None
        if fields[-1] == 'kB':
            return val * 1024 if val else None
        return None

    if platform == 'darwin':
        rc, out = run("/usr/sbin/sysctl hw.physmem")
        if rc != 0:
            return None
        fields = out.split()
        try:
            val = int(fields[1])
        except (IndexError, ValueError):
            return None
        return val if val else None

    return None
| |
| |
def parseMemorySize(line):
    """Convert a size such as '32KB', '4M' or '16gb' into a byte count.

    The text is stripped and upper-cased, an optional trailing 'B' is
    dropped, and a trailing K, M or G multiplies the number by 1024, 1024**2
    or 1024**3.  A bare number is taken as bytes.

    On unparsable input an error line is printed and the program exits with
    status -1.
    """
    units = {'K': 1024, 'M': 1024 * 1024, 'G': 1024 * 1024 * 1024}
    try:
        text = line.strip().upper()
        if text.endswith('B'):
            text = text[:-1]
        factor = units.get(text[-1:], 1)
        if factor > 1:
            text = text[:-1]
        return int(text) * factor
    except (AttributeError, ValueError) as err:
        # str() is required: concatenating the exception object itself
        # raises TypeError and hides the real problem behind a traceback.
        print('Error: ' + str(err))
        sys.exit(-1)
| |
| |
def parseDuration(val):
    """Convert a duration such as '15', '10s', '2m', '1h' or '1d' to seconds.

    The text is stripped and upper-cased and an optional trailing 'S' is
    removed; a remaining trailing M, H or D scales the number by 60, 3600 or
    86400.  On unparsable input the exception is printed and the program
    exits with status -1.
    """
    multipliers = {'M': 60, 'H': 3600, 'D': 86400}
    try:
        text = val.strip().upper()
        if text.endswith('S'):
            text = text[:-1]
        scale = multipliers.get(text[-1:], 1)
        digits = text[:-1] if scale > 1 else text
        return int(digits) * scale
    except Exception as err:
        print("exception: %s" % err)
        sys.exit(-1)
| |
| |
def print_version():
    """Print the script name with its revision tag and exit with status 0."""
    banner = '%s version $Revision$' % GV.script_name
    print(banner)
    sys.exit(0)
| |
| |
def parseCommandLine():
    """Parse sys.argv into GV.opt, validate it, and fill in defaults.

    Invalid combinations end the program through usage() or sys.exit().
    After a successful return:
      * GV.opt['-r'] is non-empty ('dsN' when no -r was given);
      * GV.opt['--net'] is None, 'netperf', 'parallel' or 'matrix';
      * GV.opt['-S'] is the file size in bytes PER -d directory (the
        requested size, or twice the local memory, divided by the number of
        directories);
      * trailing '/' characters are stripped from the -d directories.
    The command line is echoed once parsing succeeds.
    """
    try:
        (options, args) = getopt.getopt(sys.argv[1:], '?vVDd:r:B:S:p:h:f:',
                                        ['duration=', 'version', 'netperf', 'buffer-size='])
    except getopt.GetoptError as e:
        usage('Error: ' + str(e))   # usage() never returns

    for (switch, val) in options:
        if switch == '-?':
            usage(0)
        elif switch in ('-v', '-V', '-D'):
            GV.opt[switch] = True
        elif switch in ('-d', '-h'):
            GV.opt[switch].append(val)
        elif switch in ('-r', '-f'):
            GV.opt[switch] = val
        elif switch in ('-B', '-S'):
            GV.opt[switch] = parseMemorySize(val)
        elif switch == '--duration':
            GV.opt[switch] = parseDuration(val)
        elif switch == '--version':
            print_version()
        elif switch == '--netperf':
            GV.opt['--netserver'] = 'netserver'
            GV.opt['--netclient'] = 'netperf'
        elif switch == '--buffer-size':
            try:
                GV.opt[switch] = int(val)
            except ValueError:
                usage('Error: --buffer-size must be an integer number of kilobytes')
        # '-p' is accepted by the option string but has no effect.

    # run default tests (if not specified)
    if GV.opt['-r'] == '':
        GV.opt['-r'] = 'dsN'

    # parse netperf test type, error if more than one specified
    netcount = 0
    for flag, mode in (('n', 'netperf'), ('N', 'parallel'), ('M', 'matrix')):
        if flag in GV.opt['-r']:
            GV.opt['--net'] = mode
            netcount += 1
    if netcount > 1:
        usage('Error: please only specify one network test at a time (-r [n|M|N])')

    # very verbose implies verbose
    if GV.opt['-V']:
        GV.opt['-v'] = True

    if len(GV.opt['-d']) == 0:
        usage('Error: please specify at least one segdir (-d switch)')

    if not (0 <= GV.opt['-B'] and 0 <= GV.opt['-S']):
        usage('Error: please enter valid memory sizes for -B or -S switches')

    # exactly one of -h / -f must be supplied
    hf = bool(GV.opt['-h']) + bool(GV.opt['-f'])
    if hf != 1:
        usage('Error: please specify at least one of -h or -f args, but not both')

    if (GV.opt['-B']) > 1024 * 1024:
        usage('Error: maximum size for -B parameter is 1MB')

    # Split the total file size evenly across the -d directories.  Integer
    # division keeps the value a whole number of bytes, which is how it is
    # later formatted (%d) on the multidd command line.
    if GV.opt['-S'] == 0:
        system_mem_size = getMemory()
        if system_mem_size is not None:
            GV.opt['-S'] = 2 * system_mem_size // len(GV.opt['-d'])
        else:
            sys.exit('[Error] could not get system memory size. Instead, you can use the -S option to provide the file size value')
    else:
        GV.opt['-S'] //= len(GV.opt['-d'])

    if GV.opt['--duration'] <= 0:
        GV.opt['--duration'] = 15
        print('[INFO] Invalid network duration specified. Using default (15 seconds)')

    if GV.opt['--netclient'].find('netperf') >= 0:
        if GV.opt['--buffer-size']:
            print('[Warning] --buffer-size option will be ignored when the --netperf option is enabled')
    else:
        if GV.opt['--buffer-size'] <= 0:
            print('[INFO] --buffer-size value is not specified or invalid. Using default (8 kilobytes)')
            GV.opt['--buffer-size'] = 8

    # strip the last '/' from the dir
    dd = []
    for d in GV.opt['-d']:
        if d and d[-1] == '/':
            d = d[:-1]
        dd.append(d)
    GV.opt['-d'] = dd

    print(strcmd(sys.argv))
| |
| |
def run(cmd):
    """Run cmd through the local shell and capture its output.

    Returns (status, out): status is the command's exit status (0 means
    success) and out is the combined stdout/stderr decoded to text.  In
    verbose mode the command and its output are echoed.  If the output looks
    like an ssh public-key rejection, a hint is printed and the program
    exits with status 1.
    """
    verbose = GV.opt['-v']
    if verbose:
        print('[Info]', cmd)

    child = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
    out = child.stdout.read(-1).decode()
    ok = child.wait()

    if verbose and out:
        print(out)

    key_rejected = bool(out) and 'Permission denied' in out and 'publickey' in out
    if key_rejected:
        if not verbose:
            print(out)
        print('Please use gpssh-exkeys to exchange keys ...')
        sys.exit(1)

    return ok, out
| |
| |
# Set once the teardown has run so that repeated calls become no-ops.
runTeardown_torndown = False


def runTeardown():
    """Remove the remote scratch directories and stop the net server.

    Best effort: any failure of either remote command is ignored.  Runs at
    most once per process, guarded by runTeardown_torndown.
    """
    global runTeardown_torndown
    if runTeardown_torndown:
        return

    if GV.opt['-v']:
        for banner_line in ('--------------------', ' TEARDOWN', '--------------------'):
            print(banner_line)

    # One gpcheckperf_$USER directory per -d entry ($USER expands remotely).
    dirs = ''.join(' %s/gpcheckperf_$USER' % d for d in GV.opt['-d'])
    try:
        gpssh('rm -rf ' + dirs, GV.opt['-V'])
    except:
        pass

    # The server process only exists when a network test was selected.
    try:
        if GV.opt['--net']:
            gpssh(killall(GV.opt['--netserver']), GV.opt['-V'])
    except:
        pass

    runTeardown_torndown = True
| |
| |
def runSetup():
    """Check the remote hosts and create fresh scratch directories.

    Verifies that python3 can be started on every host, then recreates
    <segdir>/gpcheckperf_$USER under each -d directory.  On any failure,
    including the sys.exit() calls below, runTeardown() is invoked before
    the exception continues to propagate.
    """
    verbose = GV.opt['-v']
    if verbose:
        print('--------------------')
        print(' SETUP %s' % datetime.datetime.now().isoformat())
        print('--------------------')

    try:
        # Verify python3 is accessible
        if verbose:
            print('[Info] verify python3 interpreter exists')
        (ok, out) = gpssh('python3 -c print', GV.opt['-V'])
        if not ok:
            if not verbose:
                print(out)
            sys.exit("[Error] unable to find python3 interpreter on some hosts\n"
                     + " verify PATH variables on the hosts")

        # (re)create the scratch directory under every segdir
        if verbose:
            print('[Info] making gpcheckperf directory on all hosts ... ')
        dirs = ''.join(' %s/gpcheckperf_$USER' % d for d in GV.opt['-d'])

        cmd = 'rm -rf %s ; mkdir -p %s' % (dirs, dirs)
        (ok, out) = gpssh(cmd, GV.opt['-V'])
        if not ok:
            print('failed gpssh: %s' % out)
            sys.exit("[Error] unable to make gpcheckperf directory. \n"
                     + " command failed: " + cmd)

    except BaseException:
        # BaseException so that SystemExit and Ctrl-C also trigger cleanup.
        runTeardown()
        raise
| |
| |
def copyExecOver(fname):
    """Copy the helper executable fname to every host and return its path.

    The local file is taken from <script dir>/lib/<fname> and must be an
    executable regular file.  It is copied with gpsync into the scratch
    directory under the first -d directory and made readable/executable for
    everyone.  Any failure terminates the program.

    Returns the remote path, which still contains the literal '$USER' for
    the remote shell to expand.
    """
    first_dir = GV.opt['-d'][0]
    target = '%s/gpcheckperf_$USER/%s' % (first_dir, fname)
    path = '%s/lib/%s' % (sys.path[0], fname)

    if not os.path.isfile(path):
        print("binary file not found: %s" % path)
        sys.exit(1)

    if GV.opt['-v']:
        print('[Info] copy local %s to remote %s' % (path, target))

    # validate the local file before shipping it
    try:
        info = os.stat(path)
    except:
        sys.exit('[Error] cannot stat file ' + path)
    if not stat.S_ISREG(info.st_mode):
        sys.exit('[Error] invalid file ' + path)
    if not os.access(path, os.X_OK):
        sys.exit('[Exit] file not executable: ' + path)

    copied, out = gpsync(path, '=:%s' % target)
    if not copied:
        sys.exit('[Error] command failed: gpsync %s =:%s with output: %s' % (path, target, out))

    # chmod +x file
    chmodded, out = gpssh('chmod a+rx %s' % target, GV.opt['-V'])
    if not chmodded:
        sys.exit('[Error] command failed: chmod a+rx %s with output: %s' % (target, out))

    return target
| |
def parseMultiDDResult(out):
    """Parse gpssh output of 'time -p multidd ...' into per-host results.

    Every relevant line looks like '[host] <text>'.  Two kinds are used:
      * '... multidd total bytes <n>' -- bytes transferred on that host;
      * 'real <seconds>'              -- elapsed time reported by time -p
        (a decimal comma is accepted; a value without a fraction is too).

    Returns {host: (MB_per_sec, seconds, bytes)}.  Elapsed times below 0.1s
    are clamped to 0.1 to avoid division by zero.  Byte counts are tracked
    per host so interleaved output from several hosts is attributed
    correctly.  A value that is not a number terminates the program.
    """
    result = {}
    bytes_by_host = {}
    for line in io.StringIO(out):
        close = line.find(']')
        if close == -1:
            continue            # not a '[host] ...' line
        host = line[1:close]
        text = line[close + 2:]

        if 'multidd total bytes ' in text:
            last = text.split()[-1]
            m = re.match(r'\d+', last)
            if m is None:
                sys.exit('[Error] expected %s to be a number' % last)
            bytes_by_host[host] = int(m.group(0))
            continue

        if 'real' in text:
            fields = text.split()
            if len(fields) < 2:
                continue        # no value after the 'real' token
            secs_text = fields[1].replace(',', '.')
            m = re.match(r'\d+(?:\.\d+)?', secs_text)
            if m is None:
                sys.exit('[Error] expected %s to be a floating point number' % secs_text)
            elapsed = max(float(m.group(0)), 0.1)   # avoid division by zero
            nbytes = bytes_by_host.get(host, 0)
            result[host] = (nbytes / elapsed / 1024 / 1024, elapsed, nbytes)

    return result
| |
| |
def runDiskWriteTest(multidd):
    """Run the disk write test on all hosts and return the parsed results.

    multidd is the remote path of the multidd executable.  For every -d
    directory it is told to copy /dev/zero into
    <dir>/gpcheckperf_$USER/ddfile, using the -B block size and -S file size
    when they are non-zero.  The whole command is timed with 'time -p'.

    Returns the dictionary produced by parseMultiDDResult(); exits the
    program if the remote command fails.
    """
    print()     # blank line before the banner
    print('--------------------')
    print('-- DISK WRITE TEST')
    print('--------------------')

    cmd = 'time -p ' + multidd
    for d in GV.opt['-d']:
        cmd = cmd + (' -i /dev/zero -o %s/gpcheckperf_$USER/ddfile' % d)
    if GV.opt['-B']:
        cmd = cmd + (' -B %d' % GV.opt['-B'])
    if GV.opt['-S']:
        cmd = cmd + (' -S %d' % GV.opt['-S'])
    (ok, out) = gpssh(cmd, GV.opt['-V'])
    if not ok:
        sys.exit('[Error] command failed: %s with output: %s' % (cmd, out))
    return parseMultiDDResult(out)
| |
| |
def runDiskReadTest(multidd):
    """Run the disk read test on all hosts and return the parsed results.

    multidd is the remote path of the multidd executable.  For every -d
    directory it is told to read <dir>/gpcheckperf_$USER/ddfile into
    /dev/null, using the -B block size and -S file size when they are
    non-zero.  The whole command is timed with 'time -p'.

    Returns the dictionary produced by parseMultiDDResult(); exits the
    program if the remote command fails.
    """
    print()     # blank line before the banner
    print('--------------------')
    print('-- DISK READ TEST')
    print('--------------------')

    cmd = 'time -p ' + multidd
    for d in GV.opt['-d']:
        cmd = cmd + (' -o /dev/null -i %s/gpcheckperf_$USER/ddfile' % d)
    if GV.opt['-B']:
        cmd = cmd + (' -B %d' % GV.opt['-B'])
    if GV.opt['-S']:
        cmd = cmd + (' -S %d' % GV.opt['-S'])
    (ok, out) = gpssh(cmd, GV.opt['-V'])
    if not ok:
        sys.exit('[Error] command failed: %s with output: %s' % (cmd, out))
    return parseMultiDDResult(out)
| |
| |
def runStreamTest():
    """Run the 'stream' benchmark on all hosts.

    The stream executable is copied to every host and run through gpssh.
    From each host's output the line starting with 'Copy:' is used; its
    second field is stored as that host's rate.

    Returns {host: (rate, 0, 0)} so the result has the same shape as the
    disk test results; exits the program if the remote command fails.
    """
    print()     # blank line before the banner
    print('--------------------')
    print('-- STREAM TEST')
    print('--------------------')

    cmd = copyExecOver('stream')
    (ok, out) = gpssh(cmd, GV.opt['-V'])
    if not ok:
        sys.exit('[Error] command failed: %s with output: %s' % (cmd, out))
    out = io.StringIO(out)
    result = {}
    for line in out:
        i = line.find(']')
        if i == -1:
            continue            # not a '[host] ...' line
        o = line[i + 2:]
        if o.startswith('Copy:'):
            h = line[1:i]
            o = o.split()
            result[h] = (float(o[1]), 0, 0)
    return result
| |
| |
def startNetServer():
    """Start the network benchmark server on every host.

    The server executable is copied over, then up to five start attempts are
    made: port 23000 first, 12 higher after each failure.  Before every
    attempt any server left running is killed.

    Returns the port the server was started on, or 0 if all attempts failed.
    """
    server = GV.opt['--netserver']
    rmtPath = copyExecOver(server)

    port = 23000
    for attempt in range(5):
        if attempt:
            print('[Warning] retrying with port %d' % port)
        gpssh(killall(server), GV.opt['-V'])

        started, _ = gpssh('%s -p %d > /dev/null 2>&1' % (rmtPath, port), GV.opt['-V'])
        if started:
            return port

        if GV.opt['-v']:
            print('[Warning] start netserver with port %d failed' % port)
        port += 12

    return 0
| |
| |
def killProc(proc):
    """Send SIGKILL to proc and return its exit status.

    A falsy proc (e.g. None) yields 0.  A failure to deliver the signal is
    ignored; the function still waits for the process.
    """
    if not proc:
        return 0
    try:
        os.kill(proc.pid, signal.SIGKILL)
    except OSError:
        pass
    return proc.wait()
| |
| |
def spawnNetperfTestBetween(x, y, netperf_path, netserver_port, sec=5):
    """Start (without waiting for) a network test from host x to host y.

    An ssh to x runs the client at netperf_path against the server on y at
    netserver_port for sec seconds.  The real netperf/netserver pair gets
    TCP_STREAM options; otherwise the gpnetbench-style options with the
    configured --buffer-size are used.

    Returns (proc, x, y) where proc is the subprocess.Popen of the ssh, or
    (None, None, None) if no process was created.
    """
    uses_netperf = GV.opt['--netclient'] == 'netperf' and GV.opt['--netserver'] == 'netserver'
    if uses_netperf:
        remote_cmd = ('%s -H %s -p %d -t TCP_STREAM -l %s -f M -P 0 '
                      % (netperf_path, y, netserver_port, sec))
    else:
        remote_cmd = ('%s -H %s -p %d -l %s -P 0 -b %s'
                      % (netperf_path, y, netserver_port, sec, GV.opt['--buffer-size']))

    argv = ['ssh',
            '-o', 'BatchMode yes',
            '-o', 'StrictHostKeyChecking no',
            x, remote_cmd]

    proc = None
    try:
        if GV.opt['-v'] or GV.opt['-V']:
            print('[Info]', strcmd(argv))
        proc = subprocess.Popen(argv, stdout=subprocess.PIPE)
    except KeyboardInterrupt:
        killProc(proc)
        raise

    if proc:
        return proc, x, y

    print('[Warning] netperf failed on %s -> %s' % (x, y))
    return None, None, None
| |
| |
def reapNetperfTest(proc, x, y):
    """Wait for a test started by spawnNetperfTestBetween() and parse it.

    proc is the ssh subprocess, x and y the source and destination hosts.
    The first output line made of exactly five numeric fields is the result.

    Returns [x, y, f1, f2, f3, f4, f5] (the five fields as floats) on
    success, or [] when the process failed or produced no usable line.
    Exits the program with a hint if ssh reported a public-key rejection.
    """
    ok = True
    out = None
    if proc:
        try:
            # communicate() drains the pipe while waiting; calling wait()
            # before reading a PIPE can deadlock once the pipe fills up.
            raw, _ = proc.communicate()
            out = raw.decode() if raw is not None else ''
            # The process has already been reaped here, so its exit status
            # is used directly instead of signalling a stale pid.
            ok = proc.returncode == 0
        except KeyboardInterrupt:
            raise
        except Exception:
            ok = False

    if GV.opt['-v'] and out:
        print('[Info]', out)
    if out and out.find('Permission denied') >= 0 and out.find('publickey') >= 0:
        if not GV.opt['-v']:
            print(out)
        print('[Error] Please use gpssh-exkeys to exchange keys ...')
        sys.exit(1)

    if not ok:
        print('[Warning] netperf failed on %s -> %s' % (x, y))
        return []

    for line in io.StringIO(out or ''):
        fields = line.split()
        if len(fields) != 5:
            continue
        try:
            values = [float(f) for f in fields]
        except ValueError:
            continue            # five words, but not a result line
        if GV.opt['-v']:
            print('[Info] %s -> %s : %s' % (x, y, str(fields)))
        return [x, y] + values

    print('[Warning] netperf failed on %s -> %s (invalid result)' % (x, y))
    print('[Info]', out)
    return []
| |
| |
def runNetperfTestBetween(x, y, netperf_path, netserver_port):
    """Run one network test from x to y and wait for its result.

    Uses the configured --duration.  Returns what reapNetperfTest() returns,
    or [] if the test process could not be started.
    """
    proc, src, dst = spawnNetperfTestBetween(x, y, netperf_path, netserver_port,
                                             GV.opt['--duration'])
    return reapNetperfTest(proc, src, dst) if proc else []
| |
| |
def setupNetPerfTest():
    """Prepare a network test: resolve the hosts and start the server.

    The host list is built from the -h entries and, when given, the -f host
    file.  With at least two hosts the server is started on all of them and
    the client executable is copied over.

    Returns (client_path, hosts, server_port), or (None, None, None) when
    there is only one host or the server could not be started.  Exits via
    usage() when no host is known.
    """
    print()     # blank line before the banner
    print('-------------------')
    print('-- NETPERF TEST')
    print('-------------------')

    hostlist = ssh_utils.HostList()
    for h in GV.opt['-h']:
        hostlist.add(h)
    if GV.opt['-f']:
        hostlist.parseFile(GV.opt['-f'])

    h = hostlist.get()
    if len(h) == 0:
        usage('Error: missing hosts in -h and/or -f arguments')

    if len(h) == 1:
        print('[Warning] single host only - abandon netperf test')
        return (None, None, None)

    # start netserver
    netserver_port = startNetServer()
    if netserver_port == 0:
        print('[Error] unable to start netserver ... abort netperf test')
        return (None, None, None)

    netperf = copyExecOver(GV.opt['--netclient'])
    return (netperf, h, netserver_port)
| |
| |
def runNetPerfTest():
    """Run the serial network test, one direction at a time.

    Hosts are paired as (0,1), (2,3), ...; with an odd number of hosts the
    last one is paired with host 0 at the end.  Each pair is tested in both
    directions, one test after another.

    Returns the list of successful results, or None when setup failed.
    """
    (netperf, hostlist, netserver_port) = setupNetPerfTest()
    if not netperf:
        return None

    # make sure we have even number of hosts
    hosts = hostlist[:]
    odd = hosts.pop() if len(hosts) % 2 else None

    pairs = [(hosts[i - 1], hosts[i]) for i in range(1, len(hosts), 2)]
    if odd:
        # the left-over host is tested against host 0
        pairs.append((odd, hosts[0]))

    result = []
    for first, second in pairs:
        for src, dst in ((first, second), (second, first)):
            res = runNetperfTestBetween(src, dst, netperf, netserver_port)
            if res and len(res) >= 6:
                result.append(res)

    return result
| |
| |
def runNetPerfTestParallel():
    """Run the parallel network test in two rounds.

    Hosts are paired as (0,1), (2,3), ...; with an odd number of hosts,
    host 0 is appended again to complete the last pair.  Round one starts
    every pair's first -> second test at once and collects the results;
    round two does the same in the opposite direction.

    Returns the list of successful results, or None when setup failed.
    """
    (netperf, hostlist, netserver_port) = setupNetPerfTest()
    if not netperf:
        return None

    # make sure we have even number of hosts
    hosts = hostlist[:]
    if len(hosts) % 2:
        hosts.append(hosts[0])
    pairs = list(zip(hosts[0::2], hosts[1::2]))

    result = []
    for forward in (True, False):
        # spawn the whole round first ...
        procList = []
        for first, second in pairs:
            src, dst = (first, second) if forward else (second, first)
            spawned = spawnNetperfTestBetween(src, dst, netperf, netserver_port,
                                              GV.opt['--duration'])
            if spawned[0]:
                procList.append(spawned)

        # ... then collect it
        for (proc, src, dst) in procList:
            res = reapNetperfTest(proc, src, dst)
            if res and len(res) >= 6:
                result.append(res)

    return result
| |
| |
def get_host_map(hostlist):
    '''
    Returns a seglist dictionary that maps segment name to hostname and
    a uniqhosts dictionary that maps each hostname to one segment name.

    For example:
    seglist =
        seglist['sdw1-1'] = apollo1
        seglist['sdw1-2'] = apollo1
        seglist['sdw2-1'] = apollo2
        seglist['sdw2-2'] = apollo2
    uniqhosts =
        uniqhosts['apollo1'] = 'sdw1-1'
        uniqhosts['apollo2'] = 'sdw2-1'

    Both dictionaries hold text (str) keys and values so that they can be
    compared with each other.  Raises Exception if gpssh fails; exits the
    program with status -1 if an individual ssh fails.
    '''
    seglist = dict()    # segment list
    uniqhosts = dict()  # unique host list

    # Get hostnames using non-verbose mode since verbose output makes parsing
    # difficult with extra lines, for example:
    #   Using delaybeforesend 0.05 and prompt_validation_timeout 1.0
    #   [Reset ...]
    #   [INFO] login sdw2
    #   [sdw2] sdw2
    #   [INFO] completed successfully
    #   [Cleanup...]
    rc, out = gpssh('hostname', False)

    if not rc:
        raise Exception('Encountered error running hostname')

    # Sample output:
    #   [sdw1] sdw1
    #   [sdw2] sdw2

    # get unique hostname list
    for line in out.splitlines():
        fields = line.translate(str.maketrans('', '', '[]')).split()
        if len(fields) != 2:
            continue            # blank or unexpected line
        seg, host = fields
        uniqhosts[host] = seg

    # get list of segments associated with each host (can't use gpssh since it de-dupes hosts)
    for host in hostlist:
        cmd = ['ssh', '-o', 'BatchMode yes', '-o', 'StrictHostKeyChecking no', host, 'hostname']

        proc = None
        try:
            if GV.opt['-v']:
                print('[Info]', strcmd(cmd))
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            # Decode: the pipe yields bytes, and a bytes hostname would never
            # compare equal to the str hostnames stored in uniqhosts.
            out = proc.stdout.read(-1).decode()
            rc = proc.wait()
            if rc:
                raise Exception('ssh error with the following command:\n%s with output: %s' % (cmd, out))
            seglist[host] = out.strip()
        except KeyboardInterrupt:
            killProc(proc)
            raise
        except Exception as err:
            print(err)
            sys.exit(-1)

    return seglist, uniqhosts
| |
| |
def runNetPerfTestMatrix():
    '''
    Runs full matrix network test where each host connects with
    every link on every other host.
    Returns performance results and a dictionary mapping segment_name
    to hostname; returns (None, None) when the network setup failed.
    '''
    (netperf, hostlist, netserver_port) = setupNetPerfTest()
    if not netperf:
        return None, None

    # seglist[segname] = hostname, uniqhosts[hostname] = one segment name
    seglist, uniqhosts = get_host_map(hostlist)

    # spawn netperf from every host to every segment living on another host
    procList = []
    for host in uniqhosts:
        for segment in seglist:
            if seglist[segment] != host:
                spawned = spawnNetperfTestBetween(host, segment, netperf, netserver_port,
                                                  GV.opt['--duration'])
                if spawned[0]:
                    procList.append(spawned)

    # collect the results in the order the tests were started
    result = []
    for (proc, srcHost, dstHost) in procList:
        res = reapNetperfTest(proc, srcHost, dstHost)
        if res and len(res) >= 6:
            result.append(res)

    return result, seglist
| |
| |
def printMatrixResult(result, seglist):
    """Print the report of the full matrix network test.

    result is a list of rows whose item 0 is the sending host, item 1 the
    receiving segment name and item 6 the measured rate.  seglist maps a
    segment name to its hostname and is used to add up receive rates per
    host.  Prints per-host Tx and Rx totals, every link in verbose mode, and
    a sum/min/max/avg/median summary (skipped when result is empty).
    """
    print('Full matrix netperf bandwidth test')

    # sum up Rx/Tx rate for each host
    netTx = dict()
    netRx = dict()
    for h in result:
        rate = float(h[6])
        netTx[h[0]] = netTx.get(h[0], 0.0) + rate

        # netRx requires that we lookup the hostname for a given segment name
        rx_host = seglist[h[1]]
        netRx[rx_host] = netRx.get(rx_host, 0.0) + rate

    # print Tx rates
    print("\nPer host transfer rates")
    for host in netTx:
        print('%s Tx rate: %.2f' % (host, netTx[host]))
    print()

    # print Rx rates
    print("Per host receive rates")
    for host in netRx:
        print('%s Rx rate: %.2f' % (host, netRx[host]))
    print()

    # print per link results in verbose mode
    if GV.opt['-v']:
        for h in result:
            print('%s -> %s = %f' % (h[0], h[1], h[6]))

    rates = [float(r[6]) for r in result]
    if not rates:
        return                  # nothing to summarise

    # Builtins instead of the 'a and b or c' idiom, which picks the wrong
    # operand when a rate is 0.0.
    total = sum(rates)
    lowest = min(rates)
    highest = max(rates)
    average = total / len(rates)
    median = sorted(rates)[len(rates) // 2]

    print('')
    print('Summary:')
    print('sum = %.2f MB/sec' % total)
    print('min = %.2f MB/sec' % lowest)
    print('max = %.2f MB/sec' % highest)
    print('avg = %.2f MB/sec' % average)
    print('median = %.2f MB/sec' % median)
    print('')
| |
| |
def printNetResult(result):
    """Print the report of the serial or parallel network test.

    result is a list of rows whose item 0 is the source host, item 1 the
    destination host and item 6 the measured rate.  Prints every link, a
    sum/min/max/avg/median summary, and a warning for each link slower than
    90% of the fastest one.  With an empty result only the title is printed.
    """
    print('Netperf bisection bandwidth test')
    for h in result:
        print('%s -> %s = %f' % (h[0], h[1], h[6]))

    rates = [float(r[6]) for r in result]
    if not rates:
        return                  # nothing to summarise

    # Builtins instead of the 'a and b or c' idiom, which picks the wrong
    # operand when a rate is 0.0.
    total = sum(rates)
    lowest = min(rates)
    highest = max(rates)
    average = total / len(rates)
    median = sorted(rates)[len(rates) // 2]

    print('')
    print('Summary:')
    print('sum = %.2f MB/sec' % total)
    print('min = %.2f MB/sec' % lowest)
    print('max = %.2f MB/sec' % highest)
    print('avg = %.2f MB/sec' % average)
    print('median = %.2f MB/sec' % median)
    print('')

    limit = highest * 0.9
    for r in result:
        if r[6] < limit:
            print('[Warning] connection between %s and %s is no good' % (r[0], r[1]))
| |
| |
def printResult(title, result):
    """Print totals and extremes for a per-host result dictionary.

    result maps host -> (MB_per_sec, seconds, bytes).  Average time and total
    bytes are printed only when they are positive.  The per-host breakdown
    is added when the -D option is set.
    """
    totTime = totBytes = totMBPS = 0
    slowest, fastest = 99999999, 0
    slowHost = fastHost = None

    for host, (mbps, secs, nbytes) in result.items():
        totTime += secs
        totBytes += nbytes
        totMBPS += mbps
        if mbps < slowest:
            slowest, slowHost = mbps, host
        if mbps > fastest:
            fastest, fastHost = mbps, host

    avgTime = totTime / len(result)

    print('')
    if totTime > 0:
        print(' %s avg time (sec): %3.2f' % (title, avgTime))
    if totBytes > 0:
        print(' %s tot bytes: %d' % (title, totBytes))
    print(' %s tot bandwidth (MB/s): %3.2f' % (title, totMBPS))
    print(' %s min bandwidth (MB/s): %3.2f [%s]' % (title, slowest, slowHost))
    print(' %s max bandwidth (MB/s): %3.2f [%s]' % (title, fastest, fastHost))
    if GV.opt['-D']:
        print(' -- per host bandwidth --')
        for host, (mbps, _secs, _nbytes) in result.items():
            print(' %s bandwidth (MB/s): %3.2f [%s]' % (title, mbps, host))
    print('')
| |
| |
def main():
    """Entry point: parse options, run the selected tests, print the report.

    The remote teardown and the result report run in a finally block, so the
    scratch directories and the net server are cleaned up, and whatever was
    measured is reported, even when a test aborts with an error, sys.exit()
    or Ctrl-C.  A keyboard interrupt ends the program with a short message
    instead of a traceback.
    """
    try:
        parseCommandLine()
        runSetup()
        diskWriteResult = diskReadResult = streamResult = netResult = None
        seglist = None
        try:
            if GV.opt['-r'].find('d') >= 0:
                print('[Warning] Using %d bytes for disk performance test. This might take some time' % (GV.opt['-S'] * len(GV.opt['-d'])))
                multidd = copyExecOver('multidd')
                diskWriteResult = runDiskWriteTest(multidd)
                diskReadResult = runDiskReadTest(multidd)

            if GV.opt['-r'].find('s') >= 0:
                streamResult = runStreamTest()

            if GV.opt['--net'] == 'netperf':
                netResult = runNetPerfTest()
            elif GV.opt['--net'] == 'parallel':
                netResult = runNetPerfTestParallel()
            elif GV.opt['--net'] == 'matrix':
                netResult, seglist = runNetPerfTestMatrix()

        finally:
            # Clean up first (runTeardown is idempotent and ignores its own
            # failures), then report whatever results were collected.
            runTeardown()

            print('')
            print('====================')
            print('== RESULT %s' % datetime.datetime.now().isoformat())
            print('====================')

            if diskWriteResult:
                printResult('disk write', diskWriteResult)

            if diskReadResult:
                printResult('disk read', diskReadResult)

            if streamResult:
                printResult('stream', streamResult)

            if netResult and GV.opt['--net'] == 'matrix':
                printMatrixResult(netResult, seglist)
            elif netResult and GV.opt['--net']:
                printNetResult(netResult)

    except KeyboardInterrupt:
        print('[Abort] Keyboard Interrupt ...')


if __name__ == '__main__':
    main()