UIMA-6103 Always use start_component for agents and check for ssh errors
git-svn-id: https://svn.apache.org/repos/asf/uima/uima-ducc/trunk@1864026 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/src/main/admin/ducc_util.py b/src/main/admin/ducc_util.py
index 901b868..b7f8024 100644
--- a/src/main/admin/ducc_util.py
+++ b/src/main/admin/ducc_util.py
@@ -556,7 +556,7 @@
cmd = '/bin/hostname'
if(node == 'localhost'):
req = self.get_hostname()
- ssh_cmd = 'ssh -q -o BatchMode=yes -o ConnectTimeout=10'+' '+node+" "+cmd
+ ssh_cmd = 'ssh -o BatchMode=yes -o ConnectTimeout=10'+' '+node+" "+cmd
resp = self.popen(ssh_cmd)
lines = resp.readlines()
if(len(lines)== 1):
@@ -564,21 +564,19 @@
line = line.strip();
rsp = line.split('.')[0]
if(req == rsp):
- is_operational = True;
- if(not is_operational):
- if(verbosity):
- print 'ssh not operational - unexpected results'
- print ssh_cmd
- for line in lines:
- print line
- return is_operational
+ return True;
+ if(verbosity):
+ print 'ssh not operational - unexpected results from:', ssh_cmd
+ for line in lines:
+ print '>>>>>',line
+ return False
# like popen, only it spawns via ssh
# Skip use of ssh?
# NOTE: Current callers always have do_wait True
def ssh(self, host, do_wait, *CMD):
cmd = ' '.join(CMD)
- # Some callers quote the string which is OK for ssh but not the direct call
+ # Some callers quote the string which is OK for ssh but not the direct call
if cmd[0] == "'" and cmd[-1] == "'":
cmd = cmd[1:len(cmd)-2]
if ( do_wait ):
diff --git a/src/main/admin/start_ducc b/src/main/admin/start_ducc
index 90a6040..6b308b0 100644
--- a/src/main/admin/start_ducc
+++ b/src/main/admin/start_ducc
@@ -69,15 +69,15 @@
def start_component(self, args):
- ducc, component, or_parms = args
+ component, or_parms = args
msgs = []
-
- node = self.ducc_properties.get('ducc.head')
-
+
com = component
if ( com.find('@') >= 0 ):
com, node = com.split('@')
-
+ else:
+ node = self.ducc_properties.get('ducc.head')
+
if (com in self.local_components):
node = self.localhost
@@ -87,9 +87,11 @@
if (com in ['db', 'database']):
msgs.append(('Unmanaged component', component))
return msgs
-
+
+ if com == 'ag':
+ com = 'agent'
if ((com in self.default_components) or ( com == 'agent')) :
- msgs.append((node, 'Starting', com))
+ msgs.append(('Starting', com, 'on', node))
else:
msgs.append(('Unrecognized component', component))
return msgs
@@ -102,11 +104,16 @@
if ( node == 'local' ):
node = self.localhost
+ else:
+ if not self.ssh_operational(node):
+ msgs.append(('>>>>> ERROR - cannot ssh to', node))
+ return msgs
lines = self.ssh(node, True, "'", self.DUCC_HOME + '/admin/ducc.py', '-c', com, '-b', or_parms, '-d', str(time.time()), '--nodup', "'")
# we'll capture anything that the python shell spews because it may be useful, and then drop the
# pipe when we see a PID message
+ pid = None
while 1:
line = lines.readline().strip()
if ( not line ):
@@ -114,13 +121,24 @@
#msgs.append(('[]', line))
if ( line.startswith('PID') ):
toks = line.split(' ') # get the PID
- msgs.append((' PID', toks[1]))
- #self.pids_daemons.put(com + '@' + node, toks[1])
+ pid = toks[1]
lines.close()
break
if ( line.startswith('WARN') ):
msgs.append((' ', line))
-
+ pid = '?'
+
+ sshmsgs = self.ssh_ok(node, line )
+ if ( sshmsgs != None ):
+ for m in sshmsgs:
+ print '[S]', m
+
+ if not pid:
+ msgs.append(('ERROR - failed to start', com, 'on', node))
+ return msgs;
+ if pid != '?':
+ msgs.append(('Started', com, 'on', node, 'with PID', pid))
+
#if ( com in self.default_components ): # tracks where the management processes are
# self.pidlock.acquire()
# self.pids_daemons.put(com, com + '@' + node)
@@ -130,48 +148,6 @@
return msgs
- def start_one_agent(self, args):
-
- host = args[0]
- msgs = []
- spacer = ' '
- msgs.append((host, ""))
- lines = self.ssh(host, True, "'", self.DUCC_HOME + '/admin/ducc.py', '-c' 'agent', '-b', '-d', str(time.time()), '--nodup', "'")
- for line in lines:
- line = line.strip()
- # print '[]', host, line
- # msgs.append(('[l]', line))
- if ( line.startswith('PID') ):
- toks = line.split(' ')
- pid = toks[1]
- #self.pidlock.acquire()
- #self.pids_agents.put('agent@' + host, pid)
- #self.pidlock.release()
-
- lines.close()
- msgs.append((spacer, 'DUCC Agent started PID', pid))
- break
-
- if ( 'tty' in line ):
- # ssh junk if mesg is set
- continue
-
- toks = line.split()
-
- sshmsgs = self.ssh_ok(host, line )
- if ( sshmsgs != None ):
- for m in sshmsgs:
- print '[S]', m
-
- if ( toks[0] == 'NOTOK' ):
- msgs.append((spacer, 'NOTOK Not started:', ' '.join(toks[1:])))
- else:
- msgs.append((spacer, line))
-
- self.db_acct_start(host,'agent')
-
- return msgs
-
def verify_required_directories(self):
for dir in ('history', 'state', 'logs'):
d = self.DUCC_HOME + '/' + dir
@@ -205,7 +181,7 @@
print " component name with a destination node, use the notation component@nodename."
print " Multiple components may be specified:"
print ""
- print " start_ducc -c sm -c pm -c rm@node1 -c or@node2 -c agent@remote1 -c agent@remote2"
+ print " start_ducc -c sm -c pm -c rm@node1 -c or@node2 -c agent@remote1 -c ag@remote2"
print ""
print " Components include:"
print " rm - resource manager"
@@ -213,7 +189,7 @@
print " pm - process manager"
print " sm - services manager"
print " ws - web server"
- print " agent - node agent"
+ print " ag or agent - node agent"
print ' head = { or, pm, rm, sm, ws, db, broker }'
print ""
print " --nothreading"
@@ -408,8 +384,7 @@
for com in components:
if ( com in ('or') ):
try:
- self.threadpool.invoke(self.start_component, ducc, com, or_parms)
- #self.start_component(ducc, com, or_parms)
+ self.threadpool.invoke(self.start_component, com, or_parms)
except:
self.threadpool.quit()
print sys.exc_info()[0], "DUCC may not be started correctly."
@@ -420,12 +395,13 @@
if(self.is_reliable_backup()):
print '********** "backup" head node -> not starting agents'
else:
- print "Starting", n_nodes, "agents"
+ if n_nodes > 0:
+ print "Starting", n_nodes, "agents"
for (nodefile, nodelist) in nodes.items():
print '********** Starting agents from file', nodefile
try:
for node in nodelist:
- self.threadpool.invoke(self.start_one_agent, node)
+ self.threadpool.invoke(self.start_component, 'agent@'+node, None)
except:
self.threadpool.quit()
print sys.exc_info()[0], "DUCC may not be started correctly."
@@ -439,8 +415,7 @@
pass # already started
else:
try:
- self.threadpool.invoke(self.start_component, ducc, com, or_parms)
- #self.start_component(ducc, com, or_parms)
+ self.threadpool.invoke(self.start_component, com, or_parms)
except:
self.threadpool.quit()
print sys.exc_info()[0], "DUCC may not be started correctly."