| # @@@ START COPYRIGHT @@@ |
| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| # @@@ END COPYRIGHT @@@ |
| # |
| # This script is useful on workstations when doing overnight |
| # development regression runs using the local hadoop. It |
| # periodically checks to see if the HMaster is up. If it |
| # isn't, it attempts to restart it. |
| # |
| import os |
| import sys |
| import subprocess |
| import re |
| import sets |
| import time |
| import argparse # requires Python 2.7 |
| |
| |
| |
| |
| # beginning of main |
| |
| # pseudocode |
| # |
| # giveUp = false |
| # while not giveUp |
| # check to see if HMaster is up |
| # if not |
| # try to start it |
| # end if |
| # if we've had too many failures |
| # giveUp = true |
| # else |
| # sleep for a while |
| # end if |
| # end while |
| # |
| |
| |
# Command line handling.  No options are defined, so argparse only provides
# -h/--help and rejects anything else.
parser = argparse.ArgumentParser(
    description=('This script watches to see if HMaster goes away '
                 'and tries to restart it if so.'))
args = parser.parse_args()  # prints usage and exits if the args are incorrect

# Loop state.  Every sleep time in this script is expressed in minutes.
normalSleepTime = 5               # wait between checks while HMaster is up
retryStartSleepTime = 1           # wait after a restart attempt; doubled on
                                  # consecutive failures
lastSleepTime = normalSleepTime   # sleep chosen on the previous pass
sleepTimeThisGoRound = 0          # zero, so the first check happens right away

giveUp = False                    # becomes True after too many failed restarts
exitCode = 0                      # status handed to exit() at the very end
| |
# Main watchdog loop.  Each pass sleeps for the interval chosen by the
# previous pass (zero on the very first pass), looks for an HMaster entry in
# the jps listing, and tries to restart HBase when none is found.  The loop
# only ends by giving up after too many back-to-back restart attempts.
while not giveUp:

    # time.sleep() takes seconds; the sleep variables here are in minutes
    time.sleep(60 * sleepTimeThisGoRound)

    # equivalent of the shell pipeline "jps | grep HMaster"
    p1 = subprocess.Popen(["jps"], stdout=subprocess.PIPE)
    p2 = subprocess.Popen(["grep", "HMaster"], stdin=p1.stdout,
                          stdout=subprocess.PIPE, close_fds=True)
    # Drop our copy of the pipe's read end: grep becomes its only reader
    # (so jps gets SIGPIPE if grep exits first) and the descriptor is not
    # left open in this long-running process.
    p1.stdout.close()

    # communicate() reads grep's output to EOF and reaps grep; any output
    # at all means at least one jps line mentioned HMaster.
    grepOutput = p2.communicate()[0]
    foundIt = bool(grepOutput)

    # reap jps as well, so no <defunct> children pile up
    p1.wait()

    sleepTimeThisGoRound = 0
    if not foundIt:
        print("At %s, HMaster was not running." % time.ctime())
        if lastSleepTime == normalSleepTime:
            # it was up the last time we checked (or this is the first
            # check); start over with the minimal retry sleep time
            retryStartSleepTime = 1
        else:
            # consecutive failure: double the wait (1, 2, 4, ... 64)
            retryStartSleepTime = 2 * retryStartSleepTime

        sleepTimeThisGoRound = retryStartSleepTime
        if retryStartSleepTime > 64:
            # The backoff has gone past 64 minutes: stop trying, and make
            # the process exit status report the failure.
            giveUp = True
            exitCode = 1
        else:
            retcode = subprocess.call(["swstarthbase"])
            print("retcode from swstarthbase call was " + str(retcode))
    else:
        print("At %s, HMaster was up." % time.ctime())
        sleepTimeThisGoRound = normalSleepTime

    # remembered so the next pass can tell a first failure from a repeat
    lastSleepTime = sleepTimeThisGoRound
| |
# Control reaches this point only once the watchdog loop has given up, so
# report that and terminate with exitCode.  sys.exit() is used instead of
# the bare exit() helper, which the site module installs for interactive
# sessions and which is missing when site is not loaded (e.g. python -S).
print("Too many consecutive failures; giving up.")
sys.exit(exitCode)
| |
| |