| #!/usr/bin/env python |
| # ----------------------------------------------------------------------- |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # ----------------------------------------------------------------------- |
| |
| |
| import os |
| import sys |
| from time import time |
| import getopt |
| import signal |
| |
| from ducc_util import DuccUtil |
| from properties import Properties |
| from local_hooks import verify_slave_node |
| from local_hooks import verify_master_node |
| |
| #from ducc_util import ThreadWorker |
| from ducc_util import ThreadPool |
| |
class CheckDucc(DuccUtil):
    """Command-line checker for a DUCC installation.

    When run normally (see main) it verifies the local configuration, the
    broker and the database, then checks every configured node in a thread
    pool.  When run with ``-x`` it only validates the local node; that mode
    is what check_node invokes on each remote node over ssh.

    Every print call in this class passes exactly one parenthesized string,
    so the output is the same whether ``print`` is the Python 2 statement or
    the Python 3 function.

    NOTE: the kill options (-k, -i, -q, --db-9) and the PID-file option (-p)
    are disabled in this version, so main() always leaves ``kill_signal`` as
    None.  The kill handling in check_node is kept for callers that set
    ``kill_signal`` themselves.
    """

    def __init__(self):
        """Initialise the DuccUtil base (with merge=True) and the bad-node list."""
        DuccUtil.__init__(self, merge=True)
        # Nodes whose remote health check reported a signal; printed by
        # signalHandler when the run is interrupted.
        self.badnodes = []

    def validate(self, checkdate):
        """Run the local-node validation steps (the ``-x`` mode).

        checkdate -- time stamp supplied by the caller; it is handed to the
                     slave-node hook and to the clock-skew check.

        Prints a NOTOK line when verify_duccling reports that ducc_ling is
        not both viable and safe.  Returns None.
        """
        verify_slave_node(checkdate, self.ducc_properties)
        self.check_clock_skew(checkdate)
        self.verify_jvm()
        self.verify_limits()
        (viable, elevated, safe) = self.verify_duccling()
        self.duccling_ok(viable, elevated, safe)
        if not (safe and viable):
            print('NOTOK ducc_ling is not installed correctly.')

    def verify_database(self):
        """Report whether the database is running.

        Returns True when the database check is bypassed or db_alive(1)
        succeeds, False otherwise.  (Previously the non-bypass paths fell
        through and returned None.)
        """
        # Equality test kept deliberately: only a value equal to True
        # bypasses the check, exactly as before.
        if self.db_bypass == True:  # noqa: E712
            return True

        if self.db_alive(1):
            print('The database is running')
            return True

        print('The database is not running')
        return False

    def _broker_url(self):
        """Return the broker address as protocol://host:port."""
        return self.broker_protocol + "://" + self.broker_host + ':' + self.broker_port

    def verify_activemq(self):
        """Return True (and print where) if is_amq_active() succeeds, else False."""
        if self.is_amq_active():
            print('ActiveMQ is found listening at ' + self._broker_url())
            return True
        return False

    def check_node(self, args):
        """Check a single node and return the report messages for it.

        args -- sequence whose first element is the node name.

        Returns a list whose entries are either a plain string or a tuple of
        strings.  The list ends early when ssh to the node is not operational
        or when find_ducc_process reports no response.
        """
        spacer = ' '
        node = args[0]

        # The first entry is a plain one-character string (the original
        # wrote (' '), which is not a tuple); kept unchanged because the
        # consumer of these messages is outside this file.
        messages = [' ', ('Checking', node, '...')]

        if not self.ssh_operational(node):
            text = "ssh is NOT operational to " + node
            print(text)
            messages.append((spacer, text))
            return messages

        response = self.find_ducc_process(node)  # a tuple, (True|False, proclist)
        if not response[0]:
            messages.append((spacer, "No response."))
            return messages

        proclist = response[1]  # a list of tuples, tuple is (component, pid, user)
        if len(proclist) > 0:
            self._report_processes(node, proclist, messages, spacer)
        else:
            messages.append((spacer, 'no processes found.'))

        # The remote health check only runs when no kill was requested.
        if self.kill_signal is None:
            self._run_health_check(node, messages, spacer)

        return messages

    def _report_processes(self, node, proclist, messages, spacer):
        """Append a 'Found' or kill-related message for each process on node."""
        # Asked once per node instead of once per process; presumably the
        # answer cannot change during a run -- confirm against DuccUtil.
        skip_agents = self.is_reliable_backup()

        for proc in proclist:
            component = proc[0]
            pid = proc[1]
            found_user = proc[2]

            if skip_agents and component == 'agent':
                continue

            if component == 'orchestrator':
                component = 'or'

            # Named kill_sig so the imported 'signal' module is not shadowed.
            kill_sig = self.kill_signal
            if component == 'database' and kill_sig is not None and not self.kill_db9:
                kill_sig = '-QUIT'

            process_id = found_user + ' ' + component + '@' + node + ' PID ' + pid

            if kill_sig is None:
                messages.append((spacer, 'Found', process_id))
            elif self.user != found_user:
                messages.append((spacer, "Not killing someone else's process.", process_id))
            elif component == 'unknown-java':
                messages.append((spacer, 'Not killing non-ducc process', process_id))
            else:
                messages.append((spacer, 'Killing (' + kill_sig + ')', process_id))
                self.kill_process(node, proc, kill_sig)
                # Nothing in this file loads the PID registries, so only
                # update one if the attribute actually exists.
                if component == 'agent':
                    registry = getattr(self, 'pids_agents', None)
                else:
                    registry = getattr(self, 'pids_daemons', None)
                if registry is not None:
                    registry.delete(pid)

    def _run_health_check(self, node, messages, spacer):
        """Run ``check_ducc -x <now>`` on node via ssh and append its output."""
        response = "Node health checks return."
        lines = self.ssh(node, True, self.DUCC_HOME + "/admin/check_ducc", "-x", str(int(time())))
        while True:
            line = lines.readline()
            if not line:
                break

            if 'signal' in line:
                response = "Node health did not complete: " + line
                # Record each bad node once, however many lines mention a signal.
                if node not in self.badnodes:
                    self.badnodes.append(node)

            # these next two filter junk if 'mesg' is running in a shell rc
            if 'stdin: is not a tty' in line:
                continue
            if 'mesg' in line:
                continue

            messages.append((spacer, line.strip()))

        messages.append((spacer, response))

    def signalHandler(self, signum, frame):
        """Signal handler: list nodes whose health check failed, then exit(1).

        signum -- number of the signal that was caught (printed).
        frame  -- current stack frame; unused.
        """
        print("-------- Caught signal " + str(signum) + " --------")
        if self.badnodes:
            print("Health checks on these nodes did not return:")
            print(' '.join(str(n) for n in self.badnodes))
        sys.exit(1)

    def usage(self, msg):
        """Print msg (unless it is None) followed by the usage text, then exit(1)."""
        if msg is not None:
            print(msg)

        lines = (
            "Usage:",
            " check_ducc [options]",
            " If no options are given this is the equivalent of:",
            "",
            " check_ducc -n ../resources/ducc.nodes",
            "",
            "Options:",
            " -n --nodelist nodefile",
            " Check for agents on the nodes in nodefile. This option may be specified multiple time",
            " for multiple nodefiles. The 'local' node is always checked",
            "",
            " -c --configuration",
            " Do basic sanity checking on the configuration only. Note that configuration checking is always",
            " performed with most options. The [-c, --configuration] option does ONLY configuration checking.",
            "",
            " -x localdate",
            " Validate the local installation, called via ssh usually. The date is the date on the calling machine.",
            "",
            " --nothreading",
            " Disable multithreaded operation if it would otherwise be used",
            "",
            " -v --verbose",
            " If specified, print the validated configuration to the console.",
            "",
            " -? prints this message.",
        )
        for line in lines:
            print(line)
        sys.exit(1)

    def main(self, argv):
        """Parse argv (without the program name) and run the requested checks.

        With ``-x`` only the local validation runs.  With ``-c`` only the
        configuration is checked.  Otherwise every node from the nodefiles,
        plus the local, webserver and head nodes, is checked through a thread
        pool.  Invalid options print the usage text and exit with status 1.
        """
        try:
            # 'help' is registered so the '--help' branch below is reachable.
            opts, _ = getopt.getopt(
                argv, 'cn:x:h?v',
                ['configuration', 'nodelist=', 'verbose', 'nothreading', 'help'])
        except getopt.GetoptError:
            self.usage("Invalid arguments " + ' '.join(argv))

        nodefiles = []
        self.user = os.environ['LOGNAME']
        self.kill_signal = None
        self.kill_db9 = False
        do_validate = False
        checkdate = 0
        config_only = False
        verbose = False

        for (o, a) in opts:
            if o in ('-c', '--configuration'):
                config_only = True
            elif o in ('-n', '--nodelist'):
                nodefiles.append(a)
            elif o == '--nothreading':
                self.disable_threading()
            elif o == '-x':
                # intended to be called recursively from check_ducc, NOT from the command line
                do_validate = True
                try:
                    checkdate = float(a)
                except ValueError:
                    self.usage('Invalid date for -x: ' + a)
            elif o in ('-v', '--verbose'):
                verbose = True
            elif o in ('-h', '-?', '--help'):
                self.usage(None)
            else:
                self.usage('bad arg: ' + o)

        if not self.installed():
            print("Head node is not initialized. Have you run ducc_post_install?")
            return

        self.check_properties()

        if do_validate:
            # if validating, ONLY validate, called via ssh usually
            self.validate(checkdate)
            return

        # When called directly must be from the head node
        self.verify_head()

        self.set_duccling_version()

        os.system('cat ' + self.DUCC_HOME + '/state/duccling.version')
        # not -x option, do this only on local node
        for e in self.show_ducc_environment():
            print(e)

        if self.ducc_properties.get('ducc.jvm') is None:
            print('WARN: ducc.jvm is not specified in ducc.properties. Default is simply "java" which may not work on all nodes.')

        if not verify_master_node(self.ducc_properties):
            print('FAIL: Cannot verify master mode')
            return

        if not self.verify_activemq():
            print('ActiveMQ broker is not running on ' + self._broker_url())

        self.verify_database()

        # read the nodelists
        if len(nodefiles) == 0:
            nodefiles = self.default_nodefiles

        nodes = {}
        n_nodes = 0
        for nf in nodefiles:
            # n_nodes is whatever the last read_nodefile call returns --
            # presumably a running total; confirm against DuccUtil.
            n_nodes, nodes = self.read_nodefile(nf, nodes)

        #
        # add in the local host if needed, and the webserver node
        #
        localnodes = []
        if self.localhost not in nodes:
            localnodes.append(self.localhost)

        if self.webserver_node not in ('localhost', self.localhost, None):
            localnodes.append(self.webserver_node)

        if len(localnodes) > 0:
            nodes['local'] = localnodes

        self.verify_jvm()

        if config_only:
            if nodefiles != self.default_nodefiles:
                print("NOTOK: Config check only works with full, default nodefile: %s" % (self.default_nodefiles,))
                return
            if self.verify_class_configuration(nodefiles[0], verbose):
                print("OK: Class configuration checked")
            else:
                print("NOTOK: Errors in class or node configuration.")
            return

        # checking starts here
        print("Checking %s nodes" % (n_nodes,))
        self.threadpool = ThreadPool(n_nodes + 5)  # more for the head processes
        checked = set()

        signal.signal(signal.SIGINT, self.signalHandler)

        try:
            for nodelist in nodes.values():
                if nodelist is None:
                    # loading the nodes prints the necessary message
                    continue
                for node in nodelist:
                    if node in checked:
                        continue
                    checked.add(node)
                    self.threadpool.invoke(self.check_node, node)
            # check backup head node(s)
            for node in self.get_head_node_list():
                if node not in checked:
                    checked.add(node)
                    self.threadpool.invoke(self.check_node, node)
        except BaseException:
            # Deliberately broad: the SIGINT handler exits via SystemExit,
            # and the pool must be shut down on that path as well.
            self.threadpool.quit()
            print(str(sys.exc_info()[0]) + " Exiting.")
            sys.exit(1)

        self.threadpool.quit()
| |
if __name__ == "__main__":
    # Script entry point: build the checker and hand it the command-line
    # arguments, leaving out the program name.
    CheckDucc().main(sys.argv[1:])
| |