#!/usr/bin/env python
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Line too long - pylint: disable=C0301
# Invalid name - pylint: disable=C0103
"""
gpstandbywatch.py
Copyright (c) EMC/Greenplum Inc 2011. All Rights Reserved.
Check the log contents and process state of the gpsyncmaster
so that accurate status information can be returned to
gpinitstandby via gpstart.
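
Typical invocation (as wired up in __main__ below):

    gpstandbywatch.py <standby-data-directory> [debug]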
"""
import os
import sys
import glob
import time
from gppylib.gplog import setup_tool_logging, get_default_logger
from gppylib.commands import gp, unix

def matching_files(pglogdir, ignore=None, setlimit=False):
"""
Generate a series of file names corresponding to files
in 'pglogdir' which are not in the specified 'ignore' map.
Note that startup.log is always returned if present.
    If 'setlimit' is true, files whose modification time is at
    or after the ctime of startup.log are also excluded.
"""
    mlimit = None  # modification-time cutoff, taken from startup.log's ctime when setlimit is true
pattern = os.path.join(pglogdir, 'startup.log')
for path in glob.glob(pattern):
if setlimit:
mlimit = os.stat(path).st_ctime
yield path
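    # a startup.log under ~/hawqAdminLogs also counts; when setlimit is
    # true its ctime overrides the cutoff taken above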
    home_directory = os.path.expanduser("~")
    pattern = os.path.join(home_directory, 'hawqAdminLogs', 'startup.log')
for path in glob.glob(pattern):
if setlimit:
mlimit = os.stat(path).st_ctime
yield path
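    # finally yield the csv logs, skipping files already in the ignore
    # map and, when a cutoff is set, files modified at or after it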
pattern = os.path.join(pglogdir, '*.csv')
for path in glob.glob(pattern):
if ignore is not None and path in ignore:
continue
if mlimit is not None and os.stat(path).st_mtime >= mlimit:
continue
yield path

def updated_files(pglogdir, ignore, prev):
"""
Generate a series of (time, path) tuples corresponding to files
in 'pglogdir' and not in 'ignore' which were also modified
after 'prev' (or all files if 'prev' is None).
"""
for path in matching_files(pglogdir, ignore):
ts = os.stat(path).st_mtime
if prev is None or prev < ts:
yield (ts, path)

def updated_handles(pglogdir, ignore, prev, handles):
"""
Generate a series of (time, handle) tuples corresponding to files
in 'pglogdir' and not in 'ignore' modified after 'prev'
(or all files if 'prev' is None).
"""
for ts, path in updated_files(pglogdir, ignore, prev):
h = handles.get(path, None)
if h is None:
h = open(path, 'r')
handles[path] = h
yield (ts, h)

class SyncmasterWatcher:
"""
    Watch changes to the files that gpsyncmaster records in the pg_log directory.
"""
def __init__(self, datadir):
"""
Build a map containing the existing contents of the pg_log
directory so that we can avoid getting confused by them
after we start the syncmaster.
"""
self.datadir = datadir
self.pglogdir = os.path.join(self.datadir, 'pg_log')
        # note: setlimit=True keeps any file created after startup.log out
        # of the ignore map, so new log output is still examined.
self.ignore = {}
        for path in matching_files(self.pglogdir, setlimit=True):
self.ignore[path] = True
        self.handles = {}     # open log file handles, keyed by path
        self.maxlines = 1000  # maximum number of log lines to scan
        self.timelimit = 3    # maximum number of seconds to spend scanning
        self.delay = 0.1      # seconds to sleep when no new output appears

    def tail_briefly(self):
"""
        Generate lines recently added to log files in the pg_log
        directory that have been updated since this watcher was constructed.
"""
start = time.time() # starting time
elapsed = 0 # time elapsed so far
count = 0 # number of lines we've seen
        tp = None            # latest modification time seen in the previous pass
# until we're out of time or have returned enough lines
while elapsed < self.timelimit and count < self.maxlines:
# for each file modified since we last checked
tn = None
for ts, h in updated_handles(self.pglogdir, self.ignore, tp, self.handles):
# track the last file modification time
if tn is None or tn < ts:
tn = ts
# yield the new lines to the caller
while count < self.maxlines:
line = h.readline()
if not line:
break
yield line
count += 1
# update the elapsed time
elapsed = time.time() - start
# if any new lines, update prev and keep checking for more
if tn is not None:
tp = tn
continue
# if we get here it means none of the files were updated in
# our last iteration. sleep a moment before checking for
# more updates
time.sleep(self.delay)

    def monitor_logs(self):
"""
Read the syncmaster log files for a few seconds, looking for
potential problems.
        Returns 0 if no problems were seen, or 1 if the startup log
        contained an error or the gpsyncmaster process exited before
        we were done watching.
"""
logger.info("Monitoring logs")
# now scan some of the syncmaster output for a moment
for line in self.tail_briefly():
if line.startswith('Traceback'): # gpsyncmaster traceback recorded
logger.warning(line)
return 1
# MPP-13212 - since the syncmaster reports rejected client connections
# as 'FATAL' errors, the presence of a 'FATAL' error need not indicate
# a problem in the syncmaster so we comment out the following logic:
#
# if line.find('FATAL') >= 0: # fatal error recorded
# logger.warning(line)
# return 1
#
# This is especially important for health monitoring clients which may
# rely on the difference between a rejected connection and a TCP failure.
if line.find('could not bind IPv4 socket') >= 0: # syncmaster used IPv6 by mistake
logger.warning(line)
return 1
if line.find('QDSYNC: scan forward') >= 0: # syncmaster appears to be working
logger.info(line)
break
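        # whatever the scan found, confirm the gpsyncmaster process itself
        # is alive before declaring success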
logger.info("checking if syncmaster is running")
        count = 0
        counter = 20  # poll up to 20 times, 3 seconds apart
        while True:
            pid = gp.getSyncmasterPID('localhost', self.datadir)
            if pid > 0:
                break
            if count >= counter:
                logger.error("Standby master start timeout")
                return 1
            logger.warning("syncmaster not running, waiting...")
            count += 1
            time.sleep(3)
# syncmaster is running and there are no obvious errors in the log
logger.info("syncmaster appears ok, pid %s" % pid)
return 0

    def close(self):
"""
Closes all handles to the logs we're watching.
"""
for h in self.handles.values():
h.close()
self.handles = {}

if __name__ == '__main__':
# setup gpAdminLogs logging
execname = os.path.split(sys.argv[0])[-1]
hostname = unix.getLocalHostname()
username = unix.getUserName()
setup_tool_logging(execname, hostname, username)
logger = get_default_logger()
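    # sys.argv[1] is the standby master data directory; an optional second
    # argument of 'debug' just adds the extra status message below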
# watch syncmaster logs
if len(sys.argv) > 2 and sys.argv[2] == 'debug':
logger.info("Checking standby master status")
    watcher = SyncmasterWatcher(sys.argv[1])
rc = watcher.monitor_logs()
watcher.close()
# report final status
# logger.info("exiting with %s" % rc)
    sys.exit(rc)