#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
'''
USAGE: gpcheck_hostdump [--appliance] [--linux] [--solaris] [--hawq] [--yarn] --hadoop hadoop_home_path --sysctl comma_separated_sysctl_options
where --appliance dumps an appliance server
where --linux dumps a generic Linux server
where --solaris dumps a generic Solaris server
where --hawq additionally collects HAWQ configuration from $GPHOME/etc
where --yarn additionally collects YARN client/site configuration
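example (host type and sysctl names are illustrative): gpcheck_hostdump --linux --hadoop hadoop_home_path --sysctl kernel.shmmax,kernel.sem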
'''
import os, sys, re, tempfile, subprocess, pickle, glob, stat, time, getpass
from xml.dom import minidom
try:
    from optparse import Option, OptionParser
    from gppylib.gpparseopts import OptParser, OptChecker
    from gppylib.gpcheckutil import ApplianceOutputData, GenericLinuxOutputData, GenericSolarisOutputData
    from gppylib.gpcheckutil import chkconfig, omreport, grubconf, mounts, GpMount, inittab, ntp
    from gppylib.gpcheckutil import securetty, ioschedulers, blockdev, bcu, rclocal, sysctl, limitsconf, limitsconf_entry, uname, connectemc, diskusage, diskusage_entry, hdfs, hawq, machine
    from gppylib.gpcheckutil import solaris_etc_system, solaris_etc_project, solaris_etc_user_attr
except ImportError, e:
    sys.exit('Cannot import modules. Please check that you have sourced greenplum_path.sh. Detail: ' + str(e))
# Global Variables
output = None
options = None
def removeComments(line):
    words = line.split("#")
    if len(words) < 2:
        return line
    return words[0]
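# returns populated chkconfig object: run-level on/off states for each service and on/off
# states for xinetd services, parsed from `/sbin/chkconfig --list`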
def collectChkconfigData():
    data = chkconfig()
    p = subprocess.Popen("/sbin/chkconfig --list", shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    data.errormsg = result[1].strip()
    if p.returncode:
        return data
    startxInet = False
    for line in result[0].splitlines():
        if not startxInet:
            if re.search("xinetd based services", line):
                startxInet = True
            else:
                words = line.split()
                if len(words) != 8:
                    continue
                levels = dict()
                for i in range(1,8):
                    state = words[i].split(":")
                    if state[1] == 'on':
                        levels[state[0]] = True
                    else:
                        levels[state[0]] = False
                data.services[words[0]] = levels
        else:
            words = line.split()
            if len(words) != 2:
                continue
            name = words[0].split(":")
            if words[1] == 'on':
                data.xinetd[name[0]] = True
            else:
                data.xinetd[name[0]] = False
    return data
# argument is a reference to instantiated omreport object
def omReportOmVersion(data):
    # Obtain Dell OpenManage software version
    p = subprocess.Popen("omreport system version", shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    data.omversion_errormsg = result[1].strip()
    if p.returncode:
        return
    nextLineisOurs = False
    for line in result[0].splitlines():
        words = line.split(":")
        if len(words) != 2:
            continue
        if nextLineisOurs:
            data.omversion = words[1].strip()
            break
        label = words[1].strip()
        if label == 'Dell Server Administrator':
            nextLineisOurs = True
# argument is a reference to instantiated omreport object
def omReportStorageController(data):
    p = subprocess.Popen("omreport storage controller", shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    data.controller_errormsg = result[1].strip()
    if p.returncode:
        return
    for line in result[0].splitlines():
        words = line.split(":")
        if len(words) != 2:
            continue
        data.controller[words[0].strip()] = words[1].strip()
# argument is a reference to instantiated omreport object
def omReportStorageVdisk(data):
    p = subprocess.Popen("omreport storage vdisk controller=0", shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    data.vdisks_errormsg = result[1].strip()
    if p.returncode:
        return
    counter = -1
    for line in result[0].splitlines():
        if re.search("ID\s*:", line):
            counter += 1
            data.vdisks.append(dict())
        words = line.split(":")
        if len(words) < 2:
            continue
        data.vdisks[counter][words[0].strip()] = ":".join(words[1:]).strip()
# argument is a reference to instantiated omreport object
def omReportChassisRemoteaccess(data):
    p = subprocess.Popen("omreport chassis remoteaccess", shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    if p.returncode:
        # the remote access command on omreport is flaky... retry it once
        p = subprocess.Popen("omreport chassis remoteaccess", shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
        result = p.communicate()
    data.remoteaccess_errormsg = result[1].strip()
    if p.returncode:
        data.remoteaccess_errormsg = "%s\nreturn code from omreport %d" % (data.remoteaccess_errormsg, p.returncode)
        return
    attribute = None
    for line in result[0].splitlines():
        if not attribute:
            if re.search("Attribute :", line):
                words = line.strip().split("Attribute :")
                attribute = words[1].strip()
        else:
            if re.search("Value :", line):
                words = line.strip().split("Value :")
                data.remoteaccess[attribute] = words[1].strip()
                attribute = None
# argument is a reference to instantiated omreport object
def omReportChassisBios(data):
    p = subprocess.Popen("omreport chassis bios", shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    data.bios_errormsg = result[1].strip()
    if p.returncode:
        return
    for line in result[0].splitlines():
        words = line.split(":")
        if len(words) != 2:
            continue
        data.bios[words[0].strip()] = words[1].strip()
# argument is a reference to instantiated omreport object
def omReportChassisBiossetup(data):
    p = subprocess.Popen("omreport chassis biossetup", shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    data.biossetup_errormsg = result[1].strip()
    if p.returncode:
        return
    attribute = None
    for line in result[0].splitlines():
        if not attribute:
            if re.search("Attribute :", line):
                words = line.strip().split("Attribute :")
                attribute = words[1].strip()
        else:
            if re.search("Setting :", line):
                words = line.strip().split("Setting :")
                data.biossetup[attribute] = words[1].strip()
                attribute = None
# argument is a reference to instantiated omreport object
def omReportSystemAlertlog(data):
    p = subprocess.Popen("omreport system alertlog", shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    data.alerts_errormsg = result[1].strip()
    if p.returncode:
        return
    openAlert = False
    for line in result[0].splitlines():
        words = line.split(":")
        if len(words) < 2:
            openAlert = False
            continue
        if len(words) != 2 and len(words) != 4:
            continue
        key = words[0].strip()
        if len(words) == 4:
            if words[0].strip() != 'Date and Time':
                continue
            value = ":".join(words[1:])
        else:
            value = words[1].strip()
        if key == 'Severity':
            openAlert = False
            # this is not an OK alert ... add it to the list and start adding attributes
            if value != 'Ok':
                workingDictionary = dict()
                data.alerts.append(workingDictionary)
                openAlert = True
        if openAlert:
            # add key/value pair to the last alert in the list
            data.alerts[-1][key] = value
# argument is a reference to instantiated omreport object
def omReportChassisBiossetupBootorder(data):
    p = subprocess.Popen("omreport chassis biossetup attribute=bootorder", shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    data.bootorder_errormsg = result[1].strip()
    if p.returncode:
        return
    for line in result[0].splitlines():
        if re.search("BIOS Hard Disk Sequence", line):
            break
        if re.search("Device Name :", line):
            words = line.strip().split("Device Name :")
            data.bootorder.append(words[1].strip())
# returns populated omreport object
def collectOmReports():
    data = omreport()
    omReportOmVersion(data)
    omReportChassisBios(data)
    omReportChassisBiossetup(data)
    omReportChassisBiossetupBootorder(data)
    omReportChassisRemoteaccess(data)
    omReportStorageVdisk(data)
    omReportStorageController(data)
    omReportSystemAlertlog(data)
    return data
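# returns populated bcu object: BIOS and firmware versions parsed from `bcu adapter --query 1`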
def collectBcu():
    data = bcu()
    p = subprocess.Popen("bcu adapter --query 1", shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    data.errormsg = result[1].strip()
    if p.returncode:
        return data
    for line in result[0].splitlines():
        if re.search("bios version:", line):
            words = line.strip().split("bios version:")
            if len(words) == 2:
                data.biosversion = words[1].strip()
        if re.search("fw version:", line):
            words = line.strip().split("fw version:")
            if len(words) == 2:
                data.firmware = words[1].strip()
    return data
# returns populated grubconf object
def collectGrubConf():
    data = grubconf()
    try:
        for line in open("/boot/grub/grub.conf", "r"):
            line = removeComments(line)
            if re.search("^serial ", line):
                data.serial_declaration = True
            if re.search("^terminal ", line):
                data.terminal_declaration = True
            if re.search("kernel", line):
                if re.search(" console=ttyS1,", line):
                    data.ttyS1_declaration = True
    except Exception, e:
        data.errormsg = e.__str__()
    return data
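# returns populated uname object holding the kernel release reported by `uname -r`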
def collectUname():
    data = uname()
    p = subprocess.Popen("uname -r", shell = True,
                         stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    if p.returncode:
        data.errormsg = result[1].strip()
    else:
        data.output = result[0].strip()
    return data
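# returns populated machine object: CPU core count from /proc/cpuinfo and total memory in MB from /proc/meminfo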
def collectCPUandMemoryInfo():
    data = machine()
    # cpu core count
    p = subprocess.Popen('grep "^processor" /proc/cpuinfo | wc -l', shell = True,
                         stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    if p.returncode:
        data.errormsg = result[1].strip()
    else:
        data.total_cpucores = int(result[0].strip())
    # total memory
    p = subprocess.Popen("head -n 1 /proc/meminfo | awk '{print $2}'", shell = True,
                         stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    if p.returncode:
        data.errormsg = result[1].strip()
    else:
        memory_in_KB = int(result[0].strip())
        data.memory_in_MB = memory_in_KB / 1024
    return data
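# returns populated hawq object with properties read from $GPHOME/etc XML files, or None when --hawq was not given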
def collectHAWQ():
    if not options.hawq:
        return None
    data = hawq()
    hawq_config_dir = os.environ.get('GPHOME')
    if hawq_config_dir is None:
        print "GPHOME is not set. Please export GPHOME first. Exiting."
        sys.exit(1)
    hdfs_client_file = os.path.join(hawq_config_dir, "etc/hdfs-client.xml")
    yarn_client_file = os.path.join(hawq_config_dir, "etc/yarn-client.xml")
    hawq_site_file = os.path.join(hawq_config_dir, "etc/hawq-site.xml")
    # collect HAWQ site config (hdfs-client.xml, hawq-site.xml and, with --yarn, yarn-client.xml)
    getPropName = lambda node: node.getElementsByTagName('name')[0].childNodes[0].data
    getPropValue = lambda node: node.getElementsByTagName('value')[0].childNodes[0].data
    hawq_config_file_list = [hdfs_client_file, hawq_site_file]
    if options.yarn:
        hawq_config_file_list.append(yarn_client_file)
    for filename in hawq_config_file_list:
        try:
            with open(filename) as f:
                xmldoc = minidom.parse(f)
                for node in xmldoc.getElementsByTagName('property'):
                    try:
                        data.site_config[getPropName(node)] = getPropValue(node).strip()
                    except IndexError:
                        pass # the <value> tag may be empty, which causes IndexError in getPropValue
        except Exception, e:
            data.errormsg = "Failed to read HAWQ config file '%s': %s" % (filename, e)
    return data
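# returns populated hdfs object: JVM heap sizes from the Hadoop scripts plus hdfs-site/core-site
# (and, with --yarn, yarn-site) properties; returns None when --hadoop was not given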
def collectHDFS():
    if not options.hadoop:
        return None
    data = hdfs()
    hawq_config_dir = os.environ.get('GPHOME')
    if hawq_config_dir is None:
        print "GPHOME is not set. Please export GPHOME first. Exiting."
        sys.exit(1)
    hadoop_config_file = os.path.join(options.hadoop, "libexec/hadoop-config.sh")
    hadoop_env_file = os.path.join(options.hadoop, "etc/hadoop/hadoop-env.sh")
    hdfs_site_file = os.path.join(options.hadoop, "etc/hadoop/hdfs-site.xml")
    yarn_site_file = os.path.join(options.hadoop, "etc/hadoop/yarn-site.xml")
    core_site_file = os.path.join(options.hadoop, "etc/hadoop/core-site.xml")
    # collect java heap size config
    p = subprocess.Popen(". %s; echo $JAVA_HEAP_MAX" % hadoop_config_file, shell = True,
                         stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    if p.returncode:
        data.errormsg = result[1].strip()
        return data
    else:
        data.max_heap_size = int(re.sub(r'[^\d]+', '', result[0]))
    # collect possible namenode heap size config
    p = subprocess.Popen(". %s && echo $HADOOP_NAMENODE_OPTS | tr ' ' '\\n' | grep Xmx | tail -n 1" % hadoop_env_file, shell = True,
                         stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    if p.returncode or result[0] == '':
        data.namenode_heap_size = data.max_heap_size
    else:
        data.namenode_heap_size = int(re.sub(r'[^\d]+', '', result[0]))
    # collect possible datanode heap size config
    p = subprocess.Popen(". %s && echo $HADOOP_DATANODE_OPTS | tr ' ' '\\n' | grep Xmx | tail -n 1" % hadoop_env_file, shell = True,
                         stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    if p.returncode or result[0] == '':
        data.datanode_heap_size = data.max_heap_size
    else:
        data.datanode_heap_size = int(re.sub(r'[^\d]+', '', result[0]))
    # collect HDFS site config
    getPropName = lambda node: node.getElementsByTagName('name')[0].childNodes[0].data
    getPropValue = lambda node: node.getElementsByTagName('value')[0].childNodes[0].data
    hdfs_config_file_list = [hdfs_site_file, core_site_file]
    if options.yarn:
        hdfs_config_file_list.append(yarn_site_file)
    for filename in hdfs_config_file_list:
        try:
            with open(filename) as f:
                xmldoc = minidom.parse(f)
                for node in xmldoc.getElementsByTagName('property'):
                    try:
                        data.site_config[getPropName(node)] = getPropValue(node).strip()
                    except IndexError:
                        pass # the <value> tag may be empty, which causes IndexError in getPropValue
        except Exception, e:
            data.errormsg = "Failed to read HDFS config file '%s': %s" % (filename, e)
    return data
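# returns populated diskusage object parsed from `df -Ph` output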
def collectDiskUsage():
    data = diskusage()
    p = subprocess.Popen("df -Ph", shell = True,
                         stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    if p.returncode:
        data.errormsg = result[1].strip()
    else:
        for line in result[0].splitlines()[1:]:
            words = line.split() # [Filesystem, Size, Used, Avail, Use%, Mounted on]
            data.lines.append(diskusage_entry(*words))
    return data
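# returns populated sysctl object: current value of each kernel parameter named in --sysctl, read via `/sbin/sysctl -n`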
def collectSysctl():
    data = sysctl()
    for param in options.sysctl.split(","):
        p = subprocess.Popen("/sbin/sysctl -n " + param, shell = True,
                             stdout = subprocess.PIPE, stderr = subprocess.PIPE)
        result = p.communicate()
        if p.returncode:
            data.errormsg = result[1].strip()
        else:
            # replace TABs with spaces so that multi-valued params such as 'kernel.sem'
            # can be compared with the value in gpcheck.cnf as a whole
            data.variables[param] = result[0].strip().replace('\t', ' ')
    return data
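# returns populated limitsconf object: entries from /etc/security/limits.conf and /etc/security/limits.d/*.conf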
def collectLimits():
    data = limitsconf()
    try:
        lines = []
        results = [open("/etc/security/limits.conf")] + [open(f) for f in glob.glob("/etc/security/limits.d/*.conf")]
        for f in results:
            lines += map(removeComments, f.readlines())
        for line in lines:
            words = line.split()
            if len(words) == 4:
                domain = words[0].strip()
                type = words[1].strip()
                item = words[2].strip()
                value = words[3].strip()
                data.lines.append(limitsconf_entry(domain, type, item, value))
    except Exception, e:
        data.errormsg = str(e)
    return data
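# returns populated mounts object keyed by partition, parsed from `mount` output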
def collectMounts():
    data = mounts()
    p = subprocess.Popen("mount", shell = True,
                         stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    if p.returncode:
        data.errormsg = result[1].strip()
    else:
        for line in result[0].splitlines():
            mdata = GpMount()
            words = line.strip().split()
            mdata.partition = words[0]
            mdata.dir = words[2]
            mdata.type = words[4]
            mdata.options = set(words[5].strip("()").split(","))
            data.entries[mdata.partition] = mdata
    return data
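# returns populated ioschedulers object: the active I/O scheduler of each block device, read from /sys/block/*/queue/scheduler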
def collectIOschedulers():
    data = ioschedulers()
    devices = set()
    try:
        for f in glob.glob("/sys/block/*/queue/scheduler"):
            words = f.split("/")
            if len(words) == 6:
                devices.add(words[3].strip())
    except Exception, e:
        data.errormsg = str(e)
    for d in devices:
        try:
            fd = open("/sys/block/%s/queue/scheduler" % d, 'r')
            scheduler = fd.read()
            words = scheduler.split("[")
            if len(words) != 2:
                continue
            words = words[1].split("]")
            if len(words) != 2:
                continue
            data.devices[d] = words[0].strip()
        except Exception, e:
            data.errormsg += str(e)
    return data
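# returns populated blockdev object: read-ahead value of each /dev/sd* device from `blockdev --getra`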
def collectBlockdev():
    data = blockdev()
    try:
        devices = glob.glob("/dev/sd*")
    except Exception, e:
        data.errormsg = str(e)
        return data
    for d in devices:
        # blockdev --getra needs root privileges
        p = subprocess.Popen("/sbin/blockdev --getra %s" % d, shell = True,
                             stdout = subprocess.PIPE, stderr = subprocess.PIPE)
        result = p.communicate()
        if p.returncode:
            data.errormsg = result[1].strip()
        else:
            data.ra[d] = result[0].strip()
    return data
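# returns populated ntp object: peers reported by `ntpq -p`, whether an ntpd process is running, and the current host time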
def collectNtpd():
    data = ntp()
    p = subprocess.Popen("/usr/sbin/ntpq -p", shell = True,
                         stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    data.errormsg = result[1].strip()
    if not p.returncode:
        startHosts = False
        for line in result[0].splitlines():
            if startHosts:
                words = line.split()
                if len(words) < 2: # there are actually 10 fields but we only care about the first field
                    continue
                host = words[0].strip()
                if host.startswith("*"):
                    host = host.lstrip("*")
                data.hosts.add(host)
            else:
                if re.search("======", line):
                    startHosts = True
    p = subprocess.Popen("pgrep ntpd", shell = True,
                         stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    if data.errormsg:
        data.errormsg = "%s\n%s" % (data.errormsg, result[1].strip())
    else:
        data.errormsg = result[1].strip()
    if not p.returncode:
        for line in result[0].splitlines():
            try:
                pid = int(line.strip())
                data.running = True
            except:
                pass
    data.currenttime = time.time()
    return data
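# returns populated rclocal object: whether /etc/rc.d/rc.local is owner-executable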
def collectRclocal():
    data = rclocal()
    data.isexecutable = False
    try:
        bits = os.stat('/etc/rc.d/rc.local')[stat.ST_MODE]
        if bits & 0100:
            data.isexecutable = True
    except Exception, e:
        pass
    return data
# returns populated inittab object
def collectInittab():
    data = inittab()
    try:
        for line in open("/etc/inittab", "r"):
            if re.search("^id:", line):
                words = line.split(":")
                if len(words) > 2:
                    data.defaultRunLevel = words[1]
            if re.search("^S1", line):
                data.s1 = True
    except Exception, e:
        data.errormsg = e.__str__()
    return data
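# returns populated connectemc object holding the output of `/etc/init.d/connectemc status`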
def collectConnectemc():
    data = connectemc()
    p = subprocess.Popen("/etc/init.d/connectemc status", shell = True, stdout = subprocess.PIPE, stderr = subprocess.PIPE)
    result = p.communicate()
    data.errormsg = result[1].strip()
    if p.returncode:
        return data
    data.output = result[0].strip()
    return data
# returns populated securetty object
def collectSecuretty():
    data = securetty()
    try:
        for line in open("/etc/securetty", "r"):
            val = line.strip()
            data.data.add(val)
    except Exception, e:
        data.errormsg = e.__str__()
    return data
# returns populated object of type: solaris_etc_system
def collectSolarisEtcSystem():
    # create dictionary
    data = solaris_etc_system()
    # comments begin with either a * or #
    # lines we care about look like: set rlim_fd_max=262144
    try:
        for line in open("/etc/system", "r"):
            line = line.strip()
            if line.startswith("#"):
                continue
            if line.startswith("*"):
                continue
            words = line.split("set ")
            if len(words) != 2:
                continue
            words = words[1].split("=")
            if len(words) != 2:
                continue
            key = words[0].strip()
            value = words[1].strip()
            data.parameters[key] = value
    except Exception, e:
        data.errormsg = e.__str__()
    return data
# returns populated object of type: solaris_etc_project
def collectSolarisEtcProject():
    # store list of lines
    # ignore lines that start with # or do not contain :
    data = solaris_etc_project()
    try:
        for line in open("/etc/project", "r"):
            val = line.strip()
            if val.startswith("#"):
                continue
            if not re.search(":", val):
                continue
            data.lines.append(val)
    except Exception, e:
        data.errormsg = e.__str__()
    return data
# returns populated object of type: solaris_etc_user_attr
def collectSolarisEtcUserAttr():
    # store list of lines
    # ignore lines that start with # or do not contain :
    data = solaris_etc_user_attr()
    try:
        for line in open("/etc/user_attr", "r"):
            val = line.strip()
            if val.startswith("#"):
                continue
            if not re.search(":", val):
                continue
            data.lines.append(val)
    except Exception, e:
        data.errormsg = e.__str__()
    return data
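# collects the dump for a generic Solaris host into a GenericSolarisOutputData object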
def processGenericSolarisServer():
    global output
    output = GenericSolarisOutputData()
    output.etc_system = collectSolarisEtcSystem()
    output.etc_project = collectSolarisEtcProject()
    output.etc_user_attr = collectSolarisEtcUserAttr()
    output.uname = collectUname()
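# collects the dump for a generic Linux host into a GenericLinuxOutputData object (blockdev data only when running as root)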
def processGenericLinuxServer():
    global output
    output = GenericLinuxOutputData()
    output.hdfs = collectHDFS()
    output.hawq = collectHAWQ()
    output.uname = collectUname()
    output.machine = collectCPUandMemoryInfo()
    output.diskusage = collectDiskUsage()
    output.sysctl = collectSysctl()
    output.limitsconf = collectLimits()
    output.mounts = collectMounts()
    output.ioschedulers = collectIOschedulers()
    output.ntp = collectNtpd()
    if getpass.getuser() == "root":
        output.blockdev = collectBlockdev()
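# collects the dump for an appliance host into an ApplianceOutputData object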
def processApplianceServer():
    global output
    output = ApplianceOutputData()
    output.chkconfig = collectChkconfigData()
    output.omreport = collectOmReports()
    output.grubconf = collectGrubConf()
    output.mounts = collectMounts()
    output.inittab = collectInittab()
    output.uname = collectUname()
    output.securetty = collectSecuretty()
    output.ioschedulers = collectIOschedulers()
    output.blockdev = collectBlockdev()
    output.bcu = collectBcu()
    output.rclocal = collectRclocal()
    output.sysctl = collectSysctl()
    output.limitsconf = collectLimits()
    output.connectemc = collectConnectemc()
    output.ntp = collectNtpd()
def parseargs():
    global options
    parser = OptParser(option_class=OptChecker)
    parser.remove_option('-h')
    parser.add_option('-h', '-?', '--help', action='store_true')
    parser.add_option('--hadoop', type='string')
    parser.add_option('--hawq', action='store_true')
    parser.add_option('--yarn', action='store_true')
    parser.add_option('--sysctl', type='string')
    parser.add_option('--appliance', action='store_true')
    parser.add_option('--linux', action='store_true')
    parser.add_option('--solaris', action='store_true')
    (options, args) = parser.parse_args()
    if options.help:
        print __doc__
        sys.exit(1)
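# entry point: collect the dump for the selected host type, pickle it to a temp file under /tmp,
# and print the file name so gpcheck can retrieve it from stdout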
if __name__ == "__main__":
    parseargs()
    if options.solaris:
        processGenericSolarisServer()
    elif options.linux:
        processGenericLinuxServer()
    elif options.appliance:
        processApplianceServer()
    else:
        sys.stderr.write("host type not specified on command line\n")
        sys.exit(1)
    (fd, filename) = tempfile.mkstemp(dir='/tmp', prefix='gpcheck_dump', text=True)
    with open(filename, 'wb') as f:
        pickle.dump(output, f)
    print filename # `gpcheck` will get the pickled filename from stdout