204 lines
6.6 KiB
Python
Executable file
204 lines
6.6 KiB
Python
Executable file
#!/usr/bin/python -u
|
|
import os, socket, sys, signal, time, subprocess, logging
|
|
from optparse import OptionParser
|
|
import common
|
|
from autotest_lib.scheduler import babysitter_logging_config
|
|
from autotest_lib.client.common_lib import error, global_config, utils
|
|
from autotest_lib.client.common_lib import logging_manager
|
|
from autotest_lib.scheduler import scheduler_logging_config
|
|
from autotest_lib.scheduler import status_server
|
|
from autotest_lib.scheduler import monitor_db
|
|
|
|
# Seconds slept between two liveness checks of monitor_db.
PAUSE_LENGTH = 60
# Seconds the scheduler log may stay the same size before monitor_db is
# treated as stalled (two hours).
STALL_TIMEOUT = 2 * 60 * 60

parser = OptionParser()
parser.add_option("-r", dest="recover", action="store_true",
                  help=("run recovery mode (implicit after any crash)"))
parser.add_option("--background", dest="background", action="store_true",
                  default=False,
                  help=("runs the scheduler monitor on "
                        "background"))
options, args = parser.parse_args()

# All paths are derived from the location of this script: its parent
# directory is used as the autotest root.
autodir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
results_dir = os.path.join(autodir, 'results')
monitor_db_path = os.path.join(autodir, 'scheduler/monitor_db.py')

# "-r" declares no default, so options.recover is None unless the flag was
# given; normalise it to a real boolean.
recover = bool(options.recover)

# Positional arguments are not accepted: show the usage text and bail out.
if args:
    parser.print_help()
    sys.exit(1)
|
|
|
|
|
|
def run_banner_output(cmd):
    """Run a command and return its output underneath a banner line.

    The banner is the command text centered in a 60-column field of dashes.
    It is followed by a newline, the captured output and a blank line.

    @param cmd: Command line handed to utils.run(). It is run with a
            30 second timeout and its exit status is ignored.
    @return: The banner followed by the command's stdout and stderr, or by
            the text 'Timed out' when utils.run() raises error.CmdError.
    """
    try:
        cmd_out = utils.run(cmd, ignore_status=True, timeout=30)
        command_output = cmd_out.stdout + cmd_out.stderr
    except error.CmdError:
        command_output = 'Timed out'

    # Format banner and output in one step. Substituting the command into a
    # template first and formatting that template a second time breaks as
    # soon as the command itself contains a '%' character.
    return '%s\n%s\n\n' % (cmd.center(60, '-'), command_output)
|
|
|
|
|
|
def kill_monitor():
    """Stop the monitor_db process identified by its PID file.

    A SIGINT is sent first. If the program is still reported alive right
    afterwards, wait 30 seconds and signal it again through
    utils.signal_process() using that helper's default arguments.
    """
    logging.info("Killing monitor_db")
    pid_file_prefix = monitor_db.PID_FILE_PREFIX

    # Ask for a clean shutdown first.
    utils.signal_program(pid_file_prefix, sig=signal.SIGINT)
    if not utils.program_is_alive(pid_file_prefix):
        return

    # Still alive: give it some time to shut down, then kill it.
    time.sleep(30)
    # NOTE(review): the first signal goes through utils.signal_program() but
    # this one uses utils.signal_process() -- confirm that helper exists and
    # accepts a PID file prefix.
    utils.signal_process(pid_file_prefix)
|
|
|
|
|
|
def handle_sigterm(signum, frame):
    """SIGTERM handler: stop monitor_db, remove our PID file and exit.

    @param signum: Signal number supplied by the signal module (unused).
    @param frame: Stack frame supplied by the signal module (unused).
    """
    logging.info('Caught SIGTERM')
    kill_monitor()
    utils.delete_pid_file_if_exists(monitor_db.BABYSITTER_PID_FILE_PREFIX)
    # Same effect as sys.exit(1): terminate with exit status 1.
    raise SystemExit(1)


# Install the handler at import time so a SIGTERM also takes monitor_db down.
signal.signal(signal.SIGTERM, handle_sigterm)
|
|
|
|
|
|
# Base class of MonitorProc, looked up by name in the optional
# site_monitor_db_babysitter module. `object` is passed as the last argument,
# presumably the fallback used when no site-specific class is available --
# TODO confirm against utils.import_site_class().
SiteMonitorProc = utils.import_site_class(
        __file__,
        'autotest_lib.scheduler.site_monitor_db_babysitter',
        'SiteMonitorProc',
        object)
|
|
|
|
|
|
class MonitorProc(SiteMonitorProc):
    """Launches monitor_db as a child process and watches it for stalls.

    The child counts as healthy while it has not exited and the scheduler
    log file keeps changing size at least once every STALL_TIMEOUT seconds.
    """

    def __init__(self, do_recovery=False):
        """Kill any running monitor_db and prepare the new command line.

        @param do_recovery: When true, "--recover-hosts" is passed to
                monitor_db.
        """
        args = [monitor_db_path]
        if do_recovery:
            args.append("--recover-hosts")
        args.append(results_dir)

        kill_monitor()

        scheduler_config = scheduler_logging_config.SchedulerLoggingConfig
        log_name = scheduler_config.get_log_name()
        # Published through the process environment, which the child started
        # by start() inherits; presumably monitor_db reads it to pick its log
        # file name -- TODO confirm.
        os.environ['AUTOTEST_SCHEDULER_LOG_NAME'] = log_name
        scheduler_log_dir = scheduler_config.get_server_log_dir()
        self.log_path = os.path.join(scheduler_log_dir, log_name)

        # Last observed size of the log and the time it last changed.
        self.log_size = 0
        self.last_log_change = time.time()

        logging.info("STARTING monitor_db with log file %s", self.log_path)
        self.args = args

        # Allow site specific code to run, set environment variables and
        # modify self.args if desired.
        super(MonitorProc, self).__init__()


    def start(self):
        """Spawn monitor_db with its stdout discarded."""
        devnull = open(os.devnull, 'w')
        try:
            self.proc = subprocess.Popen(self.args, stdout=devnull)
        finally:
            # The child has its own copy of the descriptor once Popen
            # returns; closing ours avoids leaking one file object per
            # monitor_db restart.
            devnull.close()


    def is_running(self):
        """Tell whether monitor_db is alive and still making progress.

        @return: False when the child has exited, or when the log file size
                has not changed for more than STALL_TIMEOUT seconds (stall
                diagnostics are collected first in that case); True
                otherwise.
        """
        if self.proc.poll() is not None:
            logging.info("monitor_db DIED")
            return False

        old_size = self.log_size
        try:
            new_size = os.path.getsize(self.log_path)
        except OSError:
            # A missing or unreadable log must not take the babysitter
            # down. Treat it as "no change" so the stall timer keeps
            # running and eventually triggers a restart.
            logging.warning("Could not read size of log file %s",
                            self.log_path)
            new_size = old_size

        if old_size != new_size:
            logging.info("Log was touched")
            self.log_size = new_size
            self.last_log_change = time.time()
        elif self.last_log_change + STALL_TIMEOUT < time.time():
            logging.info("monitor_db STALLED")
            self.collect_stalled_info()
            return False

        return True


    def collect_stalled_info(self):
        """Dump system diagnostics to '<log_path>.stall_info'.

        Runs uptime, ps and iostat, plus a mysqladmin process listing when
        the BACKUP user and password are present in the global config, and
        writes each command's banner and output to the file, overwriting
        any previous content.
        """
        commands = ['uptime',
                    'ps auxwww',
                    'iostat -k -x 2 4',
                    ]
        db_cmd = '/usr/bin/mysqladmin --verbose processlist -u%s -p%s'
        config = global_config.global_config
        try:
            user = config.get_config_value("BACKUP", "user")
            password = config.get_config_value("BACKUP", "password")
            # NOTE(review): the password becomes part of the command line,
            # and run_banner_output() echoes the command in its banner, so
            # it ends up in the stall_info file in clear text.
            commands.append(db_cmd % (user, password))
        except global_config.ConfigError:
            # Best effort: without credentials the database listing is
            # simply skipped.
            pass

        stall_log_path = self.log_path + '.stall_info'
        log = open(stall_log_path, "w")
        try:
            for cmd in commands:
                log.write(run_banner_output(cmd))
        finally:
            # Close the file even if one of the collectors raises.
            log.close()
|
|
|
|
|
|
if os.getuid() == 0:
|
|
logging.critical("Running as root, aborting!")
|
|
sys.exit(1)
|
|
|
|
if utils.program_is_alive(monitor_db.BABYSITTER_PID_FILE_PREFIX):
|
|
logging.critical("Monitor_db_babysitter already running, aborting!")
|
|
sys.exit(1)
|
|
|
|
utils.write_pid(monitor_db.BABYSITTER_PID_FILE_PREFIX)
|
|
|
|
if options.background:
|
|
logging_manager.configure_logging(
|
|
babysitter_logging_config.BabysitterLoggingConfig(use_console=False))
|
|
|
|
# Double fork - see http://code.activestate.com/recipes/66012/
|
|
try:
|
|
pid = os.fork()
|
|
if (pid > 0):
|
|
sys.exit(0) # exit from first parent
|
|
except OSError, e:
|
|
sys.stderr.write("fork #1 failed: (%d) %s\n" % (e.errno, e.strerror))
|
|
sys.exit(1)
|
|
|
|
# Decouple from parent environment
|
|
os.chdir("/")
|
|
os.umask(0)
|
|
os.setsid()
|
|
|
|
# Second fork
|
|
try:
|
|
pid = os.fork()
|
|
if (pid > 0):
|
|
sys.exit(0) # exit from second parent
|
|
except OSError, e:
|
|
sys.stderr.write("fork #2 failed: (%d) %s\n" % (e.errno, e.strerror))
|
|
sys.exit(1)
|
|
else:
|
|
logging_manager.configure_logging(
|
|
babysitter_logging_config.BabysitterLoggingConfig())
|
|
|
|
|
|
while True:
|
|
sock = socket.socket()
|
|
try:
|
|
# Try to bind to the same port as the status_server.
|
|
sock.bind(('localhost', status_server._PORT))
|
|
except socket.error, msg:
|
|
# If binding failed, open the port.
|
|
logging.error('Failed to open socket with error:%s. Closing socket.',
|
|
msg)
|
|
release_port_cmd_list = ['fuser', '-k', '-n', 'tcp',
|
|
'%d' % status_server._PORT]
|
|
process = subprocess.Popen(release_port_cmd_list)
|
|
process.wait()
|
|
sock.close()
|
|
proc = MonitorProc(do_recovery=recover)
|
|
proc.start()
|
|
time.sleep(PAUSE_LENGTH)
|
|
while proc.is_running():
|
|
logging.info("Tick")
|
|
time.sleep(PAUSE_LENGTH)
|
|
recover = False
|