blob: dd04c7d056a50f7640546da612dccfe4f2ccee14 [file] [log] [blame]
#!/usr/bin/env python
# -----------------------------------------------------------------------
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# -----------------------------------------------------------------------
import os
import sys
from time import time
import getopt
import signal
import socket
from ducc_util import DuccUtil
from properties import Properties
from local_hooks import verify_slave_node
from local_hooks import verify_master_node
#from ducc_util import ThreadWorker
from ducc_util import ThreadPool
class CheckDucc(DuccUtil):
def __init__(self):
DuccUtil.__init__(self,merge=True)
self.badnodes = []
def validate(self, checkdate):
if (not self.is_head_node()):
verify_slave_node(checkdate, self.ducc_properties)
self.check_clock_skew(checkdate)
self.verify_jvm()
self.verify_limits()
(viable, elevated, safe) = self.verify_duccling()
self.duccling_ok(viable, elevated, safe)
if ( not safe or not viable ):
print 'NOTOK ducc_ling is not installed correctly.'
return
def verify_database(self):
if ( self.db_bypass == True ):
return True
ret = self.db_alive(1)
if ( ret ):
print 'The database is running'
else:
print 'The database is not running'
def verify_activemq(self):
if ( self.is_amq_active() ):
print 'ActiveMQ is found listening at', self.broker_protocol + "://" + self.broker_host + ':' + self.broker_port
return True
return False
def check_node(self, args):
messages = []
spacer = ' '
node = args[0]
messages.append((' '))
messages.append(('Checking', node, '...'))
if(self.ssh_operational(node)):
text = "ssh is operational to "+node
#print text
else:
text = "ssh is NOT operational to "+node
print text
messages.append((spacer, text))
return messages
response = self.find_ducc_process(node) # a tuple, (True|False, proclist)
if ( not response[0] ):
messages.append((spacer, "No response."))
return messages
proclist = response[1] # a list of tuples, tuple is (component, pid, user)
if ( len(proclist) > 0 ):
for proc in proclist:
component = proc[0]
pid = proc[1]
found_user = proc[2]
signal = self.kill_signal
if ( component == 'orchestrator' ):
component = 'or'
if ( component == 'database' and not self.automanage_database ):
automan = " (NOT auto-managed)"
else:
automan = ""
process_id = found_user + ' ' + component + '@' + node + ' PID ' + pid + automan
messages.append((spacer, 'Found', process_id))
else:
messages.append((spacer, 'no processes found.'))
if ( self.kill_signal == None ):
response = "Node health checks return."
lines = self.ssh(node, True, self.DUCC_HOME + "/admin/check_ducc", "-x", str(int(time())))
while 1:
line = lines.readline()
if ( 'signal' in line ):
response = "Node health did not complete: " + line
self.badnodes.append(node)
# these next two filter junk if 'mesg' is running in a shell rc
if ( 'stdin: is not a tty' in line ):
continue
if ( 'mesg' in line ):
continue
if ( not line ):
break
line = line.strip()
messages.append((spacer, line))
#messages.append((spacer, '[]', line))
messages.append((spacer, response))
return messages
def signalHandler(self, signum, frame):
print "-------- Caught signal", signum, "--------"
if ( len(self.badnodes) != 0 ):
print "Health checks on these nodes did not return:"
for n in self.badnodes:
print n,
print ''
sys.exit(1)
def usage(self, msg):
if ( msg != None ):
print msg
print "Usage:"
print " check_ducc [options]"
print " If no options are given this is the equivalent of:"
print ""
print " check_ducc -n ../resources/ducc.nodes"
print ""
print "Options:"
print " -n --nodelist nodefile"
print " Check for agents on the nodes in nodefile. This option may be specified multiple time"
print " for multiple nodefiles. The head node(s) are always checked"
print ""
print " --localonly"
print " Check only this head node (used when updating a single head node on a local filesystem)"
print ""
print " -c --configuration"
print " Do basic sanity checking on the configuration only. Note that configuration checking is always"
print " performed with most options. The [-c, --configuration] option does ONLY configuration checking."
print ""
print " -x localdate"
print " Validate the local installation, called via ssh usually. The date is the date on the calling machine."
print ""
print " --nothreading"
print " Disable multithreaded operation if it would otherwise be used"
print ""
print " -v --verbose"
print " If specified, print the validated configuration to the console."
print ""
print " -? prints this message."
sys.exit(1)
def main(self, argv):
try:
opts, args = getopt.getopt(argv, 'cn:x:h?v', ['configuration', 'nodelist=', 'verbose', 'nothreading', 'localonly' ])
except:
self.usage("Invalid arguments " + ' '.join(argv))
nodefiles = []
self.user = os.environ['LOGNAME']
self.kill_signal = None # Kill disabled ... now handled by stop_ducc
process_changes = False
do_validate = False
checkdate = 0
config_only = False
verbose = False
local_only = False
for ( o, a ) in opts:
if o in ('-c', '--configuration'):
config_only = True
elif o in ('-n', '--nodelist'):
nodefiles.append(a)
elif o in ( '--localonly' ):
local_only = True
elif o in ( '--nothreading' ):
self.disable_threading()
elif o in ('-x'):
# intended to be called recursively from check_ducc, NOT from the command line
do_validate = True
checkdate = float(a)
elif o in ('-v', '--verbose'):
verbose = True
elif o in ('-h', '-?', '--help'):
self.usage(None)
else:
print 'badarg', a
usage('bad arg: ' + a)
if ( local_only ):
if ( len(nodefiles) > 0 ):
print "NOTOK: Cannot specify nodefiles with --localonly"
return
if not self.installed():
print "Head node is not initialized. Have you run ducc_post_install?"
return
self.check_properties()
if ( do_validate ):
# if validating, ONLY validate, called via ssh usually
self.validate(checkdate)
return
# When called directly must be from the head node
self.verify_head()
self.set_duccling_version()
os.system('cat ' + self.DUCC_HOME + '/state/duccling.version')
# not -x option, do this only on local node
env = self.show_ducc_environment()
for e in env:
print e
jvm = self.ducc_properties.get('ducc.jvm')
if ( jvm == None ):
print 'WARN: ducc.jvm is not specified in ducc.properties. Default is simply "java" which may not work on all nodes.'
if (self.is_head_node()):
if ( not verify_master_node(self.ducc_properties) ):
print 'FAIL: Cannot verify master mode'
return
if ( not self.verify_activemq() ):
print 'ActiveMQ broker is not running on', self.broker_protocol + "://" + self.broker_host + ':' + self.broker_port
self.verify_database()
# Create the nodelists
nodesmap = {}
n_nodes = 0
if ( local_only ):
# Include just this head node
nodesmap['head'] = [ self.localhost ]
n_nodes = 1
else:
# Load the specified or default nodefiles
if ( len(nodefiles) == 0 ):
nodefiles = self.default_nodefiles
for nf in nodefiles:
n, nodesmap = self.read_nodefile(nf, nodesmap)
n_nodes += n
# Include all the head node(s)
nodesmap['head'] = self.head_nodes
n_nodes += len(self.head_nodes)
self.verify_jvm()
if ( config_only ):
if ( nodefiles != self.default_nodefiles):
print "NOTOK: Config check only works with full, default nodefile:", self.default_nodefiles
return
if self.verify_class_configuration(nodefiles[0], verbose):
print "OK: Class configuration checked"
else:
print "NOTOK: Errors in class or node configuration."
return
# checking starts here - reduce any full names to the short names without the domain
print "Checking", n_nodes, "nodes"
self.threadpool = ThreadPool(n_nodes) # n_nodes is >= number of unique nodes
checked = {}
signal.signal(signal.SIGINT, self.signalHandler)
try:
for (nodefile, nodelist) in nodesmap.items():
if ( nodelist == None ):
# loading the nodes prints the necessary message
continue
for node in nodelist:
node = node.split('.')[0]
if ( checked.has_key(node) ):
continue
checked[node] = node
self.threadpool.invoke(self.check_node, node)
except:
self.threadpool.quit()
print sys.exc_info()[0], "Exiting."
sys.exit(1)
self.threadpool.quit()
if __name__ == "__main__":
    # Script entry point: hand everything after the program name to the checker.
    cli_args = sys.argv[1:]
    checker = CheckDucc()
    checker.main(cli_args)