# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# This file lets us test the repair supporting code.
# We could not easily unit test it if it were in the repair file, as that
# file makes a function call that is not protected by a
# __name__ == '__main__' guard.

import datetime, getpass, logging, operator, smtplib, urllib2, xmlrpclib

import common

from autotest_lib.client.common_lib import global_config, mail, logging_config
from autotest_lib.server import frontend
from autotest_lib.server.cros.dynamic_suite import reporting


# Receiver and sender information, if we need to send an email
_NOTIFY_ADDRESS = global_config.global_config.get_config_value(
        'SCHEDULER', 'notify_email_errors', default='')
_SENDER_ADDRESS = global_config.global_config.get_config_value(
        'SCHEDULER', 'notify_email_from', default=getpass.getuser())

# Ignore any jobs that were run more than this many minutes past the max job
# timeout.
_CUTOFF_AFTER_TIMEOUT_MINS = 60
_DEFAULT_TEST_TIMEOUT_MINS = global_config.global_config.get_config_value(
        'AUTOTEST_WEB', 'job_max_runtime_mins_default', type=int,
        default=0)
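
# Illustration (not part of the original logic): if a deployment sets
# job_max_runtime_mins_default to, say, 4320 minutes (a hypothetical example
# value), _find_problem_test() below searches back 4320 + 60 = 4380 minutes
# for the last job that ran on a machine; with no value configured, the
# search window shrinks to just the 60-minute cutoff.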


class MachineDeathLogger(logging_config.LoggingConfig):
    """
    Used to log information about a machine going into the Repair Failed state.

    We use this so that, if the default log location ever changes, it will
    also change for this logger, and so that this information is kept
    separate from the other logs.
    """
    file_formatter = logging.Formatter(fmt='%(asctime)s | %(message)s',
                                       datefmt='%m/%d %H:%M:%S')
    LOGFILE_NAME = 'machine_death.log'

    def __init__(self):
        super(MachineDeathLogger, self).__init__(False)
        self.logger = logging.getLogger('machine_death')

        super(MachineDeathLogger, self).configure_logging(use_console=False)
        log_dir = self.get_server_log_dir()
        self.add_file_handler(self.LOGFILE_NAME, logging.ERROR,
                              log_dir=log_dir)
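
# A minimal usage sketch (illustrative only; assumes the server log
# directory used by logging_config is writable):
#
#   logger = MachineDeathLogger()
#   logger.logger.error('%s | ERROR: example failure' % 'host1')
#
# The entry lands in machine_death.log via the ERROR-level file handler
# installed above.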


def _find_problem_test(machine, rpc):
    """
    Find the last job that ran on the machine.

    Go as far back as _DEFAULT_TEST_TIMEOUT_MINS + _CUTOFF_AFTER_TIMEOUT_MINS.
    If global_config doesn't have a job_max_runtime_mins_default, we will
    search only as far back as _CUTOFF_AFTER_TIMEOUT_MINS.

    @param machine: The hostname (e.g. IP address) of the machine whose last
                    job we want to find.

    @param rpc: The rpc object to contact the server with.

    @return the job status dictionary for the job that last ran on the
            machine, or None if there is no such job.
    """

    # Going through the RPC interface means we cannot use the latest() django
    # QuerySet function. So we will instead look at the past
    # job_max_runtime_mins_default plus _CUTOFF_AFTER_TIMEOUT_MINS minutes
    # and pick the most recent run from there.
    cutoff = (datetime.datetime.today() -
              datetime.timedelta(minutes=_DEFAULT_TEST_TIMEOUT_MINS) -
              datetime.timedelta(minutes=_CUTOFF_AFTER_TIMEOUT_MINS))

    results = rpc.run('get_host_queue_entries', host__hostname=machine,
                      started_on__gte=str(cutoff))

    if results:
        return max(results, key=operator.itemgetter('started_on'))
    else:
        return None
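
# For illustration only: each entry returned by the 'get_host_queue_entries'
# RPC is a dictionary, and the code above and in flag_problem_test() below
# relies on (at least) these fields; the literal values here are made up:
#
#   {'started_on': '2013-04-01 12:00:00',
#    'job': {'id': 12345, 'name': 'dummy_Fail'}}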


def flag_problem_test(machine):
    """
    Notify people about the last job that ran on a machine.

    This method is invoked every time a machine fails to repair, and it
    attempts to identify the last test that ran on the machine. If
    successful, it files a bug; if the bug cannot be filed, it sends out an
    email; and if the email fails too, it just logs the fact.

    @param machine: The hostname (e.g. IP address) of the machine whose last
                    job we want to find.

    """
    rpc = frontend.AFE()
    logger = MachineDeathLogger()

    try:
        problem_test = _find_problem_test(machine, rpc)
    except (urllib2.URLError, xmlrpclib.ProtocolError):
        logger.logger.error('%s | ERROR: Could not contact RPC server'
                            % machine)
        return

    if problem_test:
        job_id = problem_test['job']['id']
        job_name = problem_test['job']['name']
        bug = reporting.MachineKillerBug(job_id=job_id,
                                         job_name=job_name,
                                         machine=machine)
        reporter = reporting.Reporter()
        bug_id = reporter.report(bug)[0]

        if bug_id is None:
            try:
                email_prefix = ('The following test is killing a machine, '
                                'could not file a bug to report this:\n\n')
                mail.send(_SENDER_ADDRESS, _NOTIFY_ADDRESS, '',
                          bug.title(), email_prefix + bug.summary())
            except smtplib.SMTPDataError:
                logger.logger.error('%s | %d | %s'
                                    % (machine, job_id, job_name))
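
# A minimal usage sketch (illustrative only; assumes a reachable AFE
# frontend and a writable server log directory):
#
#   flag_problem_test('172.22.42.1')
#
# In the repair path this is called once per machine that enters the
# Repair Failed state.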