blob: 99d080234f2c54e764adf0da3a6a0002a00ead57 [file] [log] [blame]
#!/usr/bin/env python
# -----------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# -----------------------------------------------------------------------
import os
import sys
from time import time
import getopt
import signal
from ducc_util import DuccUtil
from properties import Properties
from local_hooks import verify_slave_node
from local_hooks import verify_master_node
#from ducc_util import ThreadWorker
from ducc_util import ThreadPool
class CheckDucc(DuccUtil):
def __init__(self):
DuccUtil.__init__(self)
self.badnodes = []
def validate(self, checkdate):
verify_slave_node(checkdate, self.ducc_properties)
self.check_clock_skew(checkdate)
self.verify_jvm()
self.verify_limits()
(viable, elevated, safe) = self.verify_duccling()
self.duccling_ok(viable, elevated, safe)
if ( not safe or not viable ):
print 'NOTOK ducc_ling is not installed correctly.'
return
def verify_database(self):
if ( self.db_bypass == True ):
return True
ret = self.db_alive(1)
if ( ret ):
print 'The database is running'
else:
print 'The database is not running'
def verify_activemq(self):
if ( self.is_amq_active() ):
print 'ActiveMQ is found listening at', self.broker_protocol + "://" + self.broker_host + ':' + self.broker_port
return True
return False
def check_node(self, args):
messages = []
spacer = ' '
node = args[0]
messages.append((' '))
messages.append(('Checking', node, '...'))
if(self.ssh_operational(node)):
text = "ssh is operational to "+node
#print text
else:
text = "ssh is NOT operational to "+node
print text
messages.append((spacer, text))
return messages
response = self.find_ducc_process(node) # a tuple, (True|False, proclist)
if ( not response[0] ):
messages.append((spacer, "No response."))
return messages
proclist = response[1] # a list of tuples, tuple is (component, pid, user)
if ( len(proclist) > 0 ):
for proc in proclist:
component = proc[0]
pid = proc[1]
found_user = proc[2]
signal = self.kill_signal
if(self.is_reliable_backup()):
if ( component == 'agent' ):
continue
if ( component == 'orchestrator' ):
component = 'or'
if ( component == 'database' ):
if ( signal != None ):
if ( self.kill_db9 == False ):
signal = '-QUIT'
process_id = found_user + ' ' + component + '@' + node + ' PID ' + pid
if ( signal != None ) :
if ( self.user != found_user ):
messages.append((spacer, "Not killing someone else's process.", process_id))
elif ( component == 'unknown-java' ):
messages.append((spacer, 'Not killing non-ducc process', process_id))
else:
messages.append((spacer, 'Killing (' + signal + ')', process_id))
self.kill_process(node, proc, signal)
if ( component == 'agent' ):
self.pids_agents.delete(pid)
else:
self.pids_daemons.delete(pid)
process_changes = True
else:
messages.append((spacer, 'Found', process_id))
full_name = component + '@' + node
if ( component == 'agent' ):
self.pids_agents.put(full_name, pid)
else:
if ( component in self.default_components ):
self.pids_daemons.put(full_name, pid)
self.pids_daemons.put(component, full_name)
else:
messages.append((spacer, 'no processes found.'))
if ( self.kill_signal == None ):
response = "Node health checks return."
lines = self.ssh(node, True, self.DUCC_HOME + "/admin/check_ducc", "-x", str(int(time())))
while 1:
line = lines.readline()
if ( 'signal' in line ):
response = "Node health did not complete: " + line
self.badnodes.append(node)
# these next two filter junk if 'mesg' is running in a shell rc
if ( 'stdin: is not a tty' in line ):
continue
if ( 'mesg' in line ):
continue
if ( not line ):
break
line = line.strip()
messages.append((spacer, line))
#messages.append((spacer, '[]', line))
messages.append((spacer, response))
return messages
def signalHandler(self, signum, frame):
print "-------- Caught signal", signum, "--------"
if ( len(self.badnodes) != 0 ):
print "Health checks on these nodes did not return:"
for n in self.badnodes:
print n,
print ''
sys.exit(1)
def usage(self, msg):
if ( msg != None ):
print msg
print "Usage:"
print " check_ducc [options]"
print " If no options are given this is the equivalent of:"
print ""
print " check_ducc -n ../resources/ducc.nodes"
print ""
print " For reliable DUCC agents will not be killed from backup head node. "
print ""
print " Broker will not be killed when ducc.broker.automanage = false. "
print " Database will not be killed when ducc.database.automanage = false. "
print ""
print "Options:"
print " -n --nodelist nodefile"
print " Check for agents on the nodes in nodefile. This option may be specified multiple time"
print " for multiple nodefiles. The 'local' node is always checked"
print ""
print " -c --configuration"
print " Do basic sanity checking on the configuration only. Note that configuration checking is always"
print " performed with most options. The [-c, --configuration] option does ONLY configuration checking."
print ""
print " -k --kill"
print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This"
print " uses kill -KILL (-9) for all daemons, except database which uses -QUIT (3),"
print " and only kills processes owned by the invoking user."
print ""
print " --db-9"
print " Use signal -KILL (-9) to kill database, rather than the default -QUIT (-3)"
print ""
print " -i --int"
print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This"
print " uses kill -INT (-2) and only kills processes owned by the invoking user."
print ""
print " -q --quit"
print " Force-kill any DUCC process you find on a node (if normal stop_ducc isn't working. This"
print " uses kill -QUIT (-3) and only kills processes owned by the invoking user."
print ""
print " -p --pids"
print " Rewrite the PID file. The PID file is always rewritten if any changes to processes are made. Sometimes"
print " the PID file needs rebuilding. This option causes the file to be rebuilt regardless of"
print " changes."
print ""
print " -x localdate"
print " Validate the local installation, called via ssh usually. The date is the date on the calling machine."
print ""
print " --nothreading"
print " Disable multithreaded operation if it would otherwise be used"
print ""
print " -v --verbose"
print " If specified, print the validated configuration to the console."
print ""
print " -? prints this message."
sys.exit(1)
def main(self, argv):
try:
opts, args = getopt.getopt(argv, 'cikn:opqx:h?v', ['configuration', 'nodelist=', 'int', 'quit', 'kill', 'db-9', 'pids', 'verbose', 'nothreading', ])
except:
self.usage("Invalid arguments " + ' '.join(argv))
nodefiles = []
self.user = os.environ['LOGNAME']
self.kill_signal = None
self.kill_db9 = False
redo_pids = False
process_changes = False
do_validate = False
checkdate = 0
config_only = False
verbose = False
for ( o, a ) in opts:
if o in ('-c', '--configuration'):
config_only = True
elif o in ('-n', '--nodelist'):
nodefiles.append(a)
elif o in ('-i', '--int'):
if ( self.kill_signal != None ):
print 'Conflicting kill signals: -INT and', self.kill_signal
return
self.kill_signal = '-INT'
elif o in ('-q', '--quit'):
if ( self.kill_signal != None ):
print 'Conflicting kill signals: -QUIT and', self.kill_signal
return
self.kill_signal = '-QUIT'
elif o in ('-k', '--kill'):
if ( self.kill_signal != None ):
print 'Conflicting kill signals: -KILL and', self.kill_signal
return
self.kill_signal = '-KILL'
elif o in ('--db-9'):
self.kill_db9 = True
elif o in ( '--nothreading' ):
self.disable_threading()
elif o in ('-p', '--pids'):
redo_pids = True
elif o in ('-x'):
# intended to be called recursively from check_ducc, NOT from the command line
do_validate = True
checkdate = float(a)
elif o in ('-v', '--verbose'):
verbose = True
elif o in ('-h', '-?', '--help'):
self.usage(None)
else:
print 'badarg', a
usage('bad arg: ' + a)
if not self.installed():
print "Head node is not initialized. Have you run ducc_post_install?"
return
self.check_properties()
if ( do_validate ):
# if validating, ONLY validate, called via ssh usually
self.validate(checkdate)
return
# When called directly must be from the head node
self.verify_head()
self.set_duccling_version()
os.system('cat ' + self.DUCC_HOME + '/state/duccling.version')
# not -x option, do this only on local node
env = self.show_ducc_environment()
for e in env:
print e
jvm = self.ducc_properties.get('ducc.jvm')
if ( jvm == None ):
print 'WARN: ducc.jvm is not specified in ducc.properties. Default is simply "java" which may not work on all nodes.'
if ( not verify_master_node(self.ducc_properties) ):
print 'FAIL: Cannot verify master mode'
return
if ( not self.verify_activemq() ):
print 'ActiveMQ broker is not running on', self.broker_protocol + "://" + self.broker_host + ':' + self.broker_port
self.verify_database()
# init the PID file
if(not self.is_reliable_backup()):
self.pids_agents = Properties()
self.pids_agents.load_if_exists(self.pid_file_agents)
self.pids_daemons = Properties()
self.pids_daemons.load_if_exists(self.pid_file_daemons)
# read the nodelists
if ( len(nodefiles) == 0 ):
nodefiles = self.default_nodefiles
check_nodepools = True
else:
# if using other than the fully configured set of nodes we can't reliably check nodepools
# because anything other than the full set of nodes may be missing something
check_nodepools = False
nodes = {}
n_nodes = 0
for nf in nodefiles:
n_nodes, nodes = self.read_nodefile(nf, nodes)
#
# add in the local host if needed, and the webserver node
#
localnodes = []
if ( not self.localhost in nodes ):
localnodes.append(self.localhost)
if ( not (self.webserver_node in ['localhost', self.localhost, None]) ):
localnodes.append(self.webserver_node)
if ( len(localnodes) > 0 ):
nodes['local'] = localnodes
self.verify_jvm()
if ( config_only ):
if ( nodefiles != self.default_nodefiles):
print "NOTOK: Config check only works with full, default nodefile:", self.default_nodefiles
return
if self.verify_class_configuration(nodefiles[0], verbose):
print "OK: Class configuration checked"
else:
print "NOTOK: Errors in class or node configuration."
return
# checking starts here
print "Checking", n_nodes, "nodes"
self.threadpool = ThreadPool(n_nodes + 5) # more for the head processes
checked = {}
signal.signal(signal.SIGINT, self.signalHandler)
try:
for (nodefile, nodelist) in nodes.items():
if ( nodelist == None ):
# loading the nodes prints the necessary message
continue
for node in nodelist:
if ( checked.has_key(node) ):
continue
checked[node] = node
self.threadpool.invoke(self.check_node, node)
# check backup head node(s)
for node in self.get_head_node_list():
if(not node in checked):
checked[node] = node
self.threadpool.invoke(self.check_node, node)
except:
self.threadpool.quit()
print sys.exc_info()[0], "Exiting."
sys.exit(1)
self.threadpool.quit()
if ( self.kill_signal != None ):
if(self.automanage_broker):
print 'Stopping broker'
self.stop_broker()
else:
print 'Not stopping broker - not automanaged'
if(self.automanage_database):
print 'Stopping database'
self.db_stop()
else:
print 'Not stopping database - not automanaged'
if(not self.is_reliable_backup()):
if ( len(self.pids_agents) == 0):
if ( os.path.exists(self.pid_file_agents) ):
os.remove(self.pid_file_agents)
elif (process_changes or redo_pids):
self.pids_agents.write(self.pid_file_agents)
if ( len(self.pids_daemons) == 0):
if ( os.path.exists(self.pid_file_daemons) ):
os.remove(self.pid_file_daemons)
elif (process_changes or redo_pids):
self.pids_daemons.write(self.pid_file_daemons)
if __name__ == "__main__":
    # Script entry point: hand the command-line arguments, minus the
    # program name, to a fresh checker.
    CheckDucc().main(sys.argv[1:])