1168 lines
42 KiB
Python
Executable file
1168 lines
42 KiB
Python
Executable file
#!/usr/bin/env python
|
|
# Copyright 2015 The Chromium OS Authors. All rights reserved.
|
|
# Use of this source code is governed by a BSD-style license that can be
|
|
# found in the LICENSE file.
|
|
|
|
"""Create e-mail reports of the Lab's DUT inventory.
|
|
|
|
Gathers a list of all DUTs of interest in the Lab, segregated by
|
|
board and pool, and determines whether each DUT is working or
|
|
broken. Then, send one or more e-mail reports summarizing the
|
|
status to e-mail addresses provided on the command line.
|
|
|
|
usage: lab_inventory.py [ options ] [ board ... ]
|
|
|
|
Options:
|
|
--duration / -d <hours>
|
|
How far back in time to search job history to determine DUT
|
|
status.
|
|
|
|
--board-notify <address>[,<address>]
|
|
Send the "board status" e-mail to all the specified e-mail
|
|
addresses.
|
|
|
|
--pool-notify <address>[,<address>]
|
|
Send the "pool status" e-mail to all the specified e-mail
|
|
addresses.
|
|
|
|
--recommend <number>
|
|
When generating the "board status" e-mail, include a list of
|
|
<number> specific DUTs to be recommended for repair.
|
|
|
|
--logdir <directory>
|
|
Log progress and actions in a file under this directory. Text
|
|
of any e-mail sent will also be logged in a timestamped file in
|
|
this directory.
|
|
|
|
--debug
|
|
Suppress all logging and sending e-mail. Instead, write the
|
|
output that would be generated onto stdout.
|
|
|
|
<board> arguments:
|
|
With no arguments, gathers the status for all boards in the lab.
|
|
With one or more named boards on the command line, restricts
|
|
reporting to just those boards.
|
|
|
|
"""
|
|
|
|
|
|
import argparse
|
|
import logging
|
|
import logging.handlers
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
|
|
import common
|
|
from autotest_lib.client.bin import utils
|
|
from autotest_lib.client.common_lib import time_utils
|
|
from autotest_lib.server.cros.dynamic_suite import frontend_wrappers
|
|
from autotest_lib.server.hosts import servo_host
|
|
from autotest_lib.server.lib import status_history
|
|
from autotest_lib.site_utils import gmail_lib
|
|
from autotest_lib.site_utils.suite_scheduler import constants
|
|
|
|
|
|
CRITICAL_POOLS = constants.Pools.CRITICAL_POOLS
|
|
SPARE_POOL = constants.Pools.SPARE_POOL
|
|
MANAGED_POOLS = constants.Pools.MANAGED_POOLS
|
|
|
|
# _EXCLUDED_LABELS - A set of labels that disqualify a DUT from
|
|
# monitoring by this script. Currently, we're excluding any
|
|
# 'adb' host, because we're not ready to monitor Android or
|
|
# Brillo hosts.
|
|
|
|
_EXCLUDED_LABELS = set(['adb'])
|
|
|
|
# _DEFAULT_DURATION:
|
|
# Default value used for the --duration command line option.
|
|
# Specifies how far back in time to search in order to determine
|
|
# DUT status.
|
|
|
|
_DEFAULT_DURATION = 24
|
|
|
|
# _LOGDIR:
|
|
# Relative path used in the calculation of the default setting
|
|
# for the --logdir option. The full path is relative to
|
|
# the root of the autotest directory, as determined from
|
|
# sys.argv[0].
|
|
# _LOGFILE:
|
|
# Basename of a file to which general log information will be
|
|
# written.
|
|
# _LOG_FORMAT:
|
|
# Format string for log messages.
|
|
|
|
_LOGDIR = os.path.join('logs', 'dut-data')
|
|
_LOGFILE = 'lab-inventory.log'
|
|
_LOG_FORMAT = '%(asctime)s | %(levelname)-10s | %(message)s'
|
|
|
|
# Pattern describing location-based host names in the Chrome OS test
|
|
# labs. Each DUT hostname designates the DUT's location:
|
|
# * A lab (room) that's physically separated from other labs
|
|
# (i.e. there's a door).
|
|
# * A row (or aisle) of DUTs within the lab.
|
|
# * A vertical rack of shelves on the row.
|
|
# * A specific host on one shelf of the rack.
|
|
|
|
_HOSTNAME_PATTERN = re.compile(
|
|
r'(chromeos\d+)-row(\d+)-rack(\d+)-host(\d+)')
|
|
|
|
# Default entry for managed pools.
|
|
|
|
_MANAGED_POOL_DEFAULT = 'all_pools'
|
|
|
|
|
|
class _PoolCounts(object):
    """Collects the `HostJobHistory` objects that belong to one pool.

    Every recorded history is expected to come from a DUT in the same
    scheduling pool.  The collection can report the working, broken
    and idle DUTs (either as lists or as counts) as well as the total
    number of DUTs recorded.

    Performance note:  The following accessors are potentially
    expensive, because they call `last_diagnosis()` on every
    recorded history:
      * `get_working()` / `get_working_list()`
      * `get_broken()` / `get_broken_list()`
      * `get_idle()` / `get_idle_list()`
    The first such call triggers multiple RPC calls with relatively
    expensive database queries.  The query results are cached inside
    the individual `HostJobHistory` objects, so only the first call
    pays the full price.

    Independently of that, the three `*_list()` accessors cache the
    lists they build, so the filtering itself is done at most once
    between calls to `record_host()`.

    The expensive work is deliberately deferred to the accessors
    (instead of being done in `record_host()`), so that a complete
    `_LabInventory` can be built without issuing the queries.
    `_populate_board_counts()` assumes this behavior.

    """

    def __init__(self):
        self._histories = []
        # Lazily built result caches; `None` means "not computed yet".
        self._working_list = None
        self._broken_list = None
        self._idle_list = None


    def record_host(self, host_history):
        """Remember one more `HostJobHistory` object.

        @param host_history  The `HostJobHistory` object to be added
                             to the collection.

        """
        # The cached lists no longer describe the pool's contents.
        self._working_list = self._broken_list = self._idle_list = None
        self._histories.append(host_history)


    def _select_by_diagnosis(self, statuses):
        """Return the histories whose last diagnosis is in `statuses`.

        @param statuses  Sequence of diagnosis values to accept.
        @return A new list of matching `HostJobHistory` objects, in
                the order they were recorded.

        """
        return [history for history in self._histories
                if history.last_diagnosis()[0] in statuses]


    def get_working_list(self):
        """Return the DUTs whose last diagnosis is `WORKING`.

        The list is computed on first use and cached until the next
        call to `record_host()`.

        @return A list of HostJobHistory objects.

        """
        if self._working_list is None:
            self._working_list = self._select_by_diagnosis(
                    (status_history.WORKING,))
        return self._working_list


    def get_working(self):
        """Return how many DUTs in the pool are working."""
        return len(self.get_working_list())


    def get_broken_list(self):
        """Return the DUTs whose last diagnosis is `BROKEN`.

        The list is computed on first use and cached until the next
        call to `record_host()`.

        @return A list of HostJobHistory objects.

        """
        if self._broken_list is None:
            self._broken_list = self._select_by_diagnosis(
                    (status_history.BROKEN,))
        return self._broken_list


    def get_broken(self):
        """Return how many DUTs in the pool are broken."""
        return len(self.get_broken_list())


    def get_idle_list(self):
        """Return the DUTs whose last diagnosis is `UNUSED` or `UNKNOWN`.

        The list is computed on first use and cached until the next
        call to `record_host()`.

        @return A list of HostJobHistory objects.

        """
        if self._idle_list is None:
            self._idle_list = self._select_by_diagnosis(
                    (status_history.UNUSED, status_history.UNKNOWN))
        return self._idle_list


    def get_idle(self):
        """Return how many DUTs in the pool are idle."""
        return len(self.get_idle_list())


    def get_total(self):
        """Return how many DUTs have been recorded for the pool."""
        return len(self._histories)
|
|
|
|
|
|
class _BoardCounts(object):
    """Collects the `HostJobHistory` objects that belong to one board.

    The recorded histories are expected to all be for the same board.
    Internally they are kept in one `_PoolCounts` object per pool in
    `MANAGED_POOLS`, so counts and lists can be produced either for a
    single pool or summed over every pool.

    Every recorded DUT must be assigned to one of the pools in
    `MANAGED_POOLS`.

    The counting and listing methods delegate to the methods of
    `_PoolCounts`, so the performance note on that class applies
    here as well.

    """

    def __init__(self):
        self._pools = dict((pool, _PoolCounts()) for pool in MANAGED_POOLS)


    def record_host(self, host_history):
        """Remember one more `HostJobHistory` object.

        The history is filed under the pool named by its `host_pool`
        attribute.

        @param host_history  The `HostJobHistory` object to be added
                             to the collection.

        """
        self._pools[host_history.host_pool].record_host(host_history)


    def _count_pool(self, get_pool_count, pool=None):
        """Apply a counting function to one pool, or to all of them.

        @param get_pool_count  Function taking a `_PoolCounts` object
                               and returning the count of interest.
        @param pool            The pool to be counted.  If `None`,
                               return the sum across all pools.

        """
        if pool is not None:
            return get_pool_count(self._pools[pool])
        return sum(get_pool_count(pool_counts)
                   for pool_counts in self._pools.values())


    def get_working_list(self):
        """Return all working DUTs for the board, across all pools.

        @return A list of HostJobHistory objects whose last diagnosis
                is `WORKING`.

        """
        return [history
                for pool_counts in self._pools.values()
                for history in pool_counts.get_working_list()]


    def get_working(self, pool=None):
        """Return the number of working DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of working DUTs in the selected
                pool(s).
        """
        return self._count_pool(_PoolCounts.get_working, pool)


    def get_broken_list(self):
        """Return all broken DUTs for the board, across all pools.

        @return A list of HostJobHistory objects whose last diagnosis
                is `BROKEN`.

        """
        return [history
                for pool_counts in self._pools.values()
                for history in pool_counts.get_broken_list()]


    def get_broken(self, pool=None):
        """Return the number of broken DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of broken DUTs in the selected pool(s).
        """
        return self._count_pool(_PoolCounts.get_broken, pool)


    def get_idle_list(self, pool=None):
        """Return the idle DUTs for the board.

        Idle DUTs are those whose last diagnosis is `UNUSED` or
        `UNKNOWN`.

        @param pool: The pool to be listed.  If `None`, return the
                     combined list across all pools.

        @return A list of HostJobHistory objects.

        """
        if pool is not None:
            return self._pools[pool].get_idle_list()
        return [history
                for pool_counts in self._pools.values()
                for history in pool_counts.get_idle_list()]


    def get_idle(self, pool=None):
        """Return the number of idle DUTs in a pool.

        @param pool: The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of idle DUTs in the selected pool(s).
        """
        return self._count_pool(_PoolCounts.get_idle, pool)


    def get_spares_buffer(self):
        """Return the nominal number of working spares.

        This is how many working spares would remain in the spares
        pool if every broken DUT were moved into it.  A negative
        result indicates a shortfall in the critical pools.

        @return The total number of DUTs in the spares pool, less the
                total number of broken DUTs in all pools.
        """
        spares_total = self.get_total(SPARE_POOL)
        return spares_total - self.get_broken()


    def get_total(self, pool=None):
        """Return the total number of DUTs in a pool.

        @param pool  The pool to be counted.  If `None`, return the
                     total across all pools.

        @return The total number of DUTs in the selected pool(s).
        """
        return self._count_pool(_PoolCounts.get_total, pool)
|
|
|
|
|
|
class _LabInventory(dict):
    """Collection of `HostJobHistory` objects for the Lab's inventory.

    The collection is a dictionary indexed by board name.  Indexing
    returns the `_BoardCounts` object associated with the board.

    Because this is a `dict`, iterating over it yields all the boards
    in the inventory, in unspecified order.

    """

    @staticmethod
    def _eligible_host(afehost):
        """Return whether this host is eligible for monitoring.

        Hosts with any label that's in `_EXCLUDED_LABELS` aren't
        eligible.

        @param afehost  The host to be tested for eligibility.
        @return True if none of the host's labels is excluded.
        """
        return _EXCLUDED_LABELS.isdisjoint(afehost.labels)


    @classmethod
    def create_inventory(cls, afe, start_time, end_time, boardlist=None):
        """Return a Lab inventory with specified parameters.

        By default, gathers inventory from `HostJobHistory` objects
        for all eligible DUTs in the `MANAGED_POOLS` list.  If
        `boardlist` is supplied, the inventory will be restricted to
        only the given boards.

        @param afe         AFE object for constructing the
                           `HostJobHistory` objects.
        @param start_time  Start time for the `HostJobHistory`
                           objects.
        @param end_time    End time for the `HostJobHistory`
                           objects.
        @param boardlist   List of boards to include.  If empty or
                           `None`, include all available boards.
        @return A `_LabInventory` object for the specified boards.

        """
        label_list = [constants.Labels.POOL_PREFIX + pool
                      for pool in MANAGED_POOLS]
        afehosts = afe.get_hosts(labels__name__in=label_list)
        if boardlist:
            # We're deliberately not checking host eligibility in this
            # code path.  This is a debug path, not used in production;
            # it may be useful to include ineligible hosts here.
            #
            # Select with a single pass over the hosts, so that a
            # board named more than once in `boardlist` doesn't put
            # its DUTs into the inventory more than once.
            wanted_labels = set(constants.Labels.BOARD_PREFIX + board
                                for board in boardlist)
            afehosts = [h for h in afehosts
                        if not wanted_labels.isdisjoint(h.labels)]
        else:
            afehosts = [h for h in afehosts if cls._eligible_host(h)]
        return cls([status_history.HostJobHistory(afe, host,
                                                  start_time, end_time)
                    for host in afehosts])


    def __init__(self, histories):
        """Build the inventory from a list of `HostJobHistory` objects.

        @param histories  Iterable of `HostJobHistory` objects to be
                          sorted into per-board `_BoardCounts`.

        """
        # N.B. The query that finds our hosts is restricted to those
        # with a valid pool: label, but doesn't check for a valid
        # board: label.  In some (insufficiently) rare cases, the
        # AFE hosts table has been known to (incorrectly) have DUTs
        # with a pool: but no board: label.  We explicitly exclude
        # those here.
        histories = [h for h in histories
                     if h.host_board is not None]
        boards = set(h.host_board for h in histories)
        initval = dict((board, _BoardCounts()) for board in boards)
        super(_LabInventory, self).__init__(initval)
        self._dut_count = len(histories)
        # Cache for `get_managed_boards()`, keyed by pool name.
        self._managed_boards = {}
        for h in histories:
            self[h.host_board].record_host(h)


    def get_managed_boards(self, pool=_MANAGED_POOL_DEFAULT):
        """Return the set of "managed" boards.

        Operationally, saying a board is "managed" means that the
        board will be included in the "board" and "repair
        recommendations" reports.  That is, if there are failures in
        the board's inventory then lab techs will be asked to fix
        them without a separate ticket.

        For purposes of implementation, a board is "managed" if it
        has DUTs in both the spare and a non-spare (i.e. critical)
        pool.

        The result is computed once per `pool` value and cached.

        @param pool: The specified pool for managed boards.
        @return A set of all the boards that have both spare and
                non-spare pools, unless the pool is specified,
                then the set of boards in that pool.
        """
        if self._managed_boards.get(pool) is None:
            managed = set()
            for board, counts in self.items():
                if pool == _MANAGED_POOL_DEFAULT:
                    # Managed means: some, but not all, of the
                    # board's DUTs are spares.
                    spares = counts.get_total(SPARE_POOL)
                    total = counts.get_total()
                    is_managed = spares != 0 and spares != total
                else:
                    is_managed = counts.get_total(pool) != 0
                if is_managed:
                    managed.add(board)
            self._managed_boards[pool] = managed
        return self._managed_boards[pool]


    def get_num_duts(self):
        """Return the total number of DUTs in the inventory."""
        return self._dut_count


    def get_num_boards(self):
        """Return the total number of boards in the inventory."""
        return len(self)
|
|
|
|
|
|
def _sort_by_location(inventory_list):
    """Return a list of DUTs, organized by location.

    Take the given list of `HostJobHistory` objects, separate it
    into a list per lab, and sort each lab's list by location.  The
    order of sorting within a lab is
      * By row number within the lab,
      * then by rack number within the row,
      * then by host shelf number within the rack.

    Return a list of the sorted lists.

    DUTs whose hostname doesn't match `_HOSTNAME_PATTERN` have no
    known location, and are omitted from the result.

    Implementation note: locations are compared as
    `(row, rack, host)` tuples of integers, so the ordering is
    correct regardless of how large the individual numbers are.

    @param inventory_list  List of `HostJobHistory` objects to sort.
    @return A list of sorted lists of DUTs, one list per lab.

    """
    lab_lists = {}
    for history in inventory_list:
        location = _HOSTNAME_PATTERN.match(history.host.hostname)
        if location is None:
            continue
        lab = location.group(1)
        key = tuple(int(idx) for idx in location.group(2, 3, 4))
        lab_lists.setdefault(lab, []).append((key, history))
    # Sort on the location key only; the history objects themselves
    # are never compared.
    return [[history
             for _, history in sorted(dut_list, key=lambda t: t[0])]
            for dut_list in lab_lists.values()]
|
|
|
|
|
|
def _score_repair_set(buffer_counts, repair_list):
    """Return a numeric score rating a set of DUTs to be repaired.

    The function works out what each board's spares buffer would be
    if every DUT in `repair_list` were repaired, and rates that
    outcome using two numbers:
      * The worst (smallest) buffer count for any board; higher is
        better.  This is the more significant number.
      * The number of boards sharing that worst count; lower is
        better.  This is the less significant number.

    Implementation note:  The two numbers are folded into a single
    value using a factor of 1000, so the score could fail to reflect
    the intended criteria if there are more than 1000 boards in the
    inventory.

    @param buffer_counts  A dictionary mapping board names to the
                          size of the board's spares buffer.  Must
                          not be empty.
    @param repair_list    A list of `HostJobHistory` objects for the
                          DUTs to be repaired.
    @return A numeric score; higher is better.

    """
    _NBOARDS = 1000
    repair_inventory = _LabInventory(repair_list)
    # Buffer count per board after the proposed repairs, in ascending
    # order.  Board names are dropped, as they don't contribute to
    # the final score.
    new_counts = sorted(
            spares + (repair_inventory[board].get_total()
                      if board in repair_inventory else 0)
            for board, spares in buffer_counts.items())
    # With the counts in ascending order the worst case comes first.
    worst_count = new_counts[0]
    num_worst = new_counts.count(worst_count)
    return _NBOARDS * worst_count - num_worst
|
|
|
|
|
|
def _generate_repair_recommendation(inventory, num_recommend):
    """Return a summary of selected DUTs needing repair.

    Returns a message recommending a list of broken DUTs to be
    repaired.  The list of DUTs is selected based on these
    criteria:
      * No more than `num_recommend` DUTs will be listed.
      * All DUTs must be in the same lab.
      * DUTs should be selected for some degree of physical
        proximity.
      * DUTs for boards with a low spares buffer are more important
        than DUTs with larger buffers.

    Candidate sets are runs of up to `num_recommend` consecutive
    DUTs in each lab's location-sorted list of broken DUTs; the run
    with the best `_score_repair_set()` score is reported.

    If there are no broken DUTs that can be recommended, the message
    says so instead of listing hosts.

    @param inventory      Inventory for generating recommendations.
    @param num_recommend  Number of DUTs to recommend for repair.
    @return String with the recommendation message.

    """
    logging.debug('Creating DUT repair recommendations')
    board_buffer_counts = {}
    broken_list = []
    for board in inventory.get_managed_boards():
        logging.debug('Listing failed DUTs for %s', board)
        counts = inventory[board]
        if counts.get_broken() != 0:
            board_buffer_counts[board] = counts.get_spares_buffer()
            broken_list.extend(counts.get_broken_list())
    # N.B. The logic inside this loop may seem complicated, but
    # simplification is hard:
    #   * Calculating an initial recommendation outside of
    #     the loop likely would make things more complicated,
    #     not less.
    #   * It's necessary to calculate an initial lab slice once per
    #     lab _before_ the while loop, in case the number of broken
    #     DUTs in a lab is less than `num_recommend`.
    recommendation = None
    best_score = None
    for lab_duts in _sort_by_location(broken_list):
        start = 0
        end = num_recommend
        lab_slice = lab_duts[start : end]
        lab_score = _score_repair_set(board_buffer_counts,
                                      lab_slice)
        # Slide a window of `num_recommend` DUTs along the lab's
        # list, keeping the first window with the best score.
        while end < len(lab_duts):
            start += 1
            end += 1
            new_slice = lab_duts[start : end]
            new_score = _score_repair_set(board_buffer_counts,
                                          new_slice)
            if new_score > lab_score:
                lab_slice = new_slice
                lab_score = new_score
        if recommendation is None or lab_score > best_score:
            recommendation = lab_slice
            best_score = lab_score
    if not recommendation:
        # No broken DUT could be placed in any lab; there's nothing
        # to iterate over below.
        return '\n'.join(['Repair recommendations:\n',
                          '(No DUTs to repair)'])
    # N.B. The trailing space here is mandatory:  Without it, Gmail
    # will parse the URL wrong.  Don't ask.  If you simply _must_
    # know more, go try it yourself...
    line_fmt = '%-30s %-16s %-6s\n %s '
    message = ['Repair recommendations:\n',
               line_fmt % ( 'Hostname', 'Board', 'Servo?', 'Logs URL')]
    for h in recommendation:
        servo_name = servo_host.make_servo_hostname(h.host.hostname)
        servo_present = utils.host_is_in_lab_zone(servo_name)
        _, event = h.last_diagnosis()
        line = line_fmt % (
                h.host.hostname, h.host_board,
                'Yes' if servo_present else 'No', event.job_url)
        message.append(line)
    return '\n'.join(message)
|
|
|
|
|
|
def _generate_board_inventory_message(inventory):
    """Generate the "board inventory" e-mail message.

    The message starts with overall totals of broken, idle and
    working DUTs across all managed boards, followed by a per-board
    table with the spares buffer and the DUT counts for every
    managed board that has at least one broken DUT.

    Only boards returned by `inventory.get_managed_boards()` are
    considered.

    If the managed boards contain no DUTs at all, the percentages
    are reported as 0.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating board inventory')
    nworking = 0
    nbroken = 0
    nidle = 0
    nbroken_boards = 0
    ntotal_boards = 0
    summaries = []
    for board in inventory.get_managed_boards():
        counts = inventory[board]
        logging.debug('Counting %2d DUTS for board %s',
                      counts.get_total(), board)
        # Summary elements laid out in the same order as the text
        # headers:
        #     Board Avail   Bad  Idle  Good  Spare Total
        #      e[0]  e[1]  e[2]  e[3]  e[4]  e[5]  e[6]
        element = (board,
                   counts.get_spares_buffer(),
                   counts.get_broken(),
                   counts.get_idle(),
                   counts.get_working(),
                   counts.get_total(SPARE_POOL),
                   counts.get_total())
        # Only boards with failures are listed in the table, but
        # every managed board contributes to the totals.
        if element[2]:
            summaries.append(element)
            nbroken_boards += 1
        ntotal_boards += 1
        nbroken += element[2]
        nidle += element[3]
        nworking += element[4]
    ntotal = nworking + nbroken + nidle
    # Smallest spares buffer first; ties go to the board with the
    # most broken DUTs.
    summaries = sorted(summaries, key=lambda e: (e[1], -e[2]))
    if ntotal:
        broken_percent = int(round(100.0 * nbroken / ntotal))
        idle_percent = int(round(100.0 * nidle / ntotal))
        # Derived by subtraction so the three figures add up to 100.
        working_percent = 100 - broken_percent - idle_percent
    else:
        # No DUTs to report on; avoid dividing by zero.
        broken_percent = idle_percent = working_percent = 0
    message = ['Summary of DUTs in inventory:',
               '%10s %10s %10s %6s' % ('Bad', 'Idle', 'Good', 'Total'),
               '%5d %3d%% %5d %3d%% %5d %3d%% %6d' % (
                   nbroken, broken_percent,
                   nidle, idle_percent,
                   nworking, working_percent,
                   ntotal),
               '',
               'Boards with failures: %d' % nbroken_boards,
               'Boards in inventory: %d' % ntotal_boards,
               '', '',
               'Full board inventory:\n',
               '%-22s %5s %5s %5s %5s %5s %5s' % (
                   'Board', 'Avail', 'Bad', 'Idle', 'Good',
                   'Spare', 'Total')]
    message.extend(
            ['%-22s %5d %5d %5d %5d %5d %5d' % e for e in summaries])
    return '\n'.join(message)
|
|
|
|
|
|
_POOL_INVENTORY_HEADER = '''\
|
|
Notice to Infrastructure deputies: All boards shown below are at
|
|
less than full strength, please take action to resolve the issues.
|
|
Once you're satisified that failures won't recur, failed DUTs can
|
|
be replaced with spares by running `balance_pool`. Detailed
|
|
instructions can be found here:
|
|
http://go/cros-manage-duts
|
|
'''
|
|
|
|
|
|
def _generate_pool_inventory_message(inventory):
    """Generate the "pool inventory" e-mail message.

    For each pool in `CRITICAL_POOLS` the message contains a table,
    by board, of the number of broken, idle and working DUTs and the
    total number of DUTs in that pool.  Boards with neither broken
    nor idle DUTs in the pool are left out; rows are ordered with
    the most broken DUTs first.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating pool inventory')
    column_header = '%-20s %5s %5s %5s %5s' % (
            'Board', 'Bad', 'Idle', 'Good', 'Total')
    message = [_POOL_INVENTORY_HEADER]
    for index, pool in enumerate(CRITICAL_POOLS):
        # Every pool section after the first is set off by a blank
        # line.
        separator = '\n' if index else ''
        message.append(
                '%sStatus for pool:%s, by board:' % (separator, pool))
        message.append(column_header)
        rows = []
        for board, counts in inventory.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            broken = counts.get_broken(pool)
            idle = counts.get_idle(pool)
            # boards at full strength are not reported
            if broken != 0 or idle != 0:
                rows.append((board, broken, idle,
                             counts.get_working(pool),
                             counts.get_total(pool)))
        if not rows:
            message.append('(All boards at full strength)')
            continue
        rows.sort(key=lambda row: row[1], reverse=True)
        message.extend('%-20s %5d %5d %5d %5d' % row for row in rows)
    return '\n'.join(message)
|
|
|
|
|
|
_IDLE_INVENTORY_HEADER = '''\
|
|
Notice to Infrastructure deputies: The hosts shown below haven't
|
|
run any jobs for at least 24 hours. Please check each host; locked
|
|
hosts should normally be unlocked; stuck jobs should normally be
|
|
aborted.
|
|
'''
|
|
|
|
|
|
def _generate_idle_inventory_message(inventory):
    """Generate the "idle inventory" e-mail message.

    The idle inventory lists every idle host (last diagnosis
    `UNKNOWN` or `UNUSED`) together with its board and pool, going
    through the pools in `MANAGED_POOLS` one after another.

    N.B. For sample output text formatted as users can expect to
    see it in e-mail and log files, refer to the unit tests.

    @param inventory  _LabInventory object with the inventory to
                      be reported on.
    @return String with the inventory message to be sent.

    """
    logging.debug('Creating idle inventory')
    row_fmt = '%-30s %-20s %s'
    message = [_IDLE_INVENTORY_HEADER,
               'Idle Host List:',
               row_fmt % ('Hostname', 'Board', 'Pool')]
    idle_rows = []
    for pool in MANAGED_POOLS:
        for board, counts in inventory.items():
            logging.debug('Counting %2d DUTs for %s, %s',
                          counts.get_total(pool), board, pool)
            for dut in counts.get_idle_list(pool):
                idle_rows.append(
                        row_fmt % (dut.host.hostname, board, pool))
    if idle_rows:
        message.extend(idle_rows)
    else:
        message.append('(No idle DUTs)')
    return '\n'.join(message)
|
|
|
|
|
|
def _send_email(arguments, tag, subject, recipients, body):
    """Send an inventory e-mail message.

    The message, preceded by its `To:` and `Subject:` lines, is
    written to a file named `tag` in the selected log directory, and
    then sent via `gmail_lib`.  A failure to write the file or to
    send the message is logged as an error, and does not raise.

    If the --debug option was requested, the message is neither
    logged nor sent, but merely printed on stdout.

    @param arguments   Parsed command-line options.
    @param tag         Tag identifying the inventory for logging
                       purposes.
    @param subject     E-mail Subject: header line.
    @param recipients  E-mail addresses for the To: header line.
    @param body        E-mail message body.

    """
    logging.debug('Generating email: "%s"', subject)
    all_recipients = ', '.join(recipients)
    report_body = '\n'.join([
            'To: %s' % all_recipients,
            'Subject: %s' % subject,
            '', body, ''])
    if arguments.debug:
        # Parenthesized single argument: prints the same text
        # whether `print` is a statement or a function.
        print(report_body)
        return
    filename = os.path.join(arguments.logdir, tag)
    try:
        # `with` closes the file even if the write fails part way.
        with open(filename, 'w') as report_file:
            report_file.write(report_body)
    except EnvironmentError as e:
        logging.error('Failed to write %s: %s', filename, e)
    # Sending is best-effort:  a failure here must not prevent any
    # remaining reports from being generated.
    try:
        gmail_lib.send_email(all_recipients, subject, body)
    except Exception as e:
        logging.error('Failed to send e-mail to %s: %s',
                      all_recipients, e)
|
|
|
|
|
|
def _separate_email_addresses(address_list):
    """Flatten a list of comma-separated e-mail address strings.

    Each string is split at commas, and surrounding whitespace is
    stripped from every piece.  Empty pieces are kept as empty
    strings.

    @param address_list  A list of strings containing comma
                         separated e-mail addresses.
    @return A list of the individual e-mail addresses.

    """
    return [address.strip()
            for chunk in address_list
            for address in chunk.split(',')]
|
|
|
|
|
|
def _verify_arguments(arguments):
    """Validate command-line arguments.

    The values of `--board-notify` and `--pool-notify`, each a list
    of possibly comma separated address strings, are replaced in
    `arguments` by flat lists of individual addresses.

    For non-debug uses, require that notification be requested for
    at least one report.  For debug, if notification isn't specified,
    treat it as "run all the reports."

    The return value indicates success or failure; in the case of
    failure, we also write an error message to stderr.

    @param arguments  Command-line arguments as returned by
                      `ArgumentParser`
    @return True if the arguments are semantically good, or False
            if the arguments don't meet requirements.

    """
    arguments.board_notify = _separate_email_addresses(
            arguments.board_notify)
    arguments.pool_notify = _separate_email_addresses(
            arguments.pool_notify)
    if arguments.board_notify or arguments.pool_notify:
        return True
    if arguments.debug:
        # We want to run all the reports.  An empty notify list
        # will cause a report to be skipped, so make sure the
        # lists are non-empty.
        arguments.board_notify = ['']
        arguments.pool_notify = ['']
        return True
    sys.stderr.write('Must specify at least one of '
                     '--board-notify or --pool-notify\n')
    return False
|
|
|
|
|
|
def _get_logdir(script):
    """Get the default directory for the `--logdir` option.

    The result is the `_LOGDIR` entry under the parent of the
    directory that holds the script.

    @param script  Path to this script file.
    @return A path to a directory.

    """
    script_dir = os.path.dirname(os.path.abspath(script))
    parent_dir = os.path.dirname(script_dir)
    return os.path.join(parent_dir, _LOGDIR)
|
|
|
|
|
|
def _parse_command(argv):
    """Parse the command line arguments.

    Builds the argument parser for this command, parses `argv`, and
    then runs the semantic checks in `_verify_arguments()` on the
    result.

    @param argv  Standard command line argument vector; argv[0] is
                 assumed to be the command name.
    @return Result returned by ArgumentParser.parse_args(), or None
            if the parsed arguments fail verification.

    """
    parser = argparse.ArgumentParser(
            prog=argv[0],
            description='Gather and report lab inventory statistics')

    # How far back to look.
    parser.add_argument(
            '-d', '--duration',
            type=int,
            default=_DEFAULT_DURATION,
            metavar='HOURS',
            help=('number of hours back to search for status'
                  ' (default: %d)' % _DEFAULT_DURATION))

    # Which reports to produce, and who receives them.
    parser.add_argument(
            '--board-notify',
            action='append',
            default=[],
            metavar='ADDRESS',
            help=('Generate board inventory message, '
                  'and send it to the given e-mail address(es)'))
    parser.add_argument(
            '--pool-notify',
            action='append',
            default=[],
            metavar='ADDRESS',
            help=('Generate pool inventory message, '
                  'and send it to the given address(es)'))
    parser.add_argument(
            '-r', '--recommend',
            type=int,
            default=None,
            help=('Specify how many DUTs should be '
                  'recommended for repair (default: no '
                  'recommendation)'))

    # Output control.
    parser.add_argument(
            '--debug',
            action='store_true',
            help=('Print e-mail messages on stdout '
                  'without sending them.'))
    parser.add_argument(
            '--logdir',
            default=_get_logdir(argv[0]),
            help='Directory where logs will be written.')

    # Optional restriction to specific boards.
    parser.add_argument(
            'boardnames',
            nargs='*',
            metavar='BOARD',
            help=('names of boards to report on '
                  '(default: all boards)'))

    parsed = parser.parse_args(argv[1:])
    if _verify_arguments(parsed):
        return parsed
    return None
|
|
|
|
|
|
def _configure_logging(arguments):
|
|
"""Configure the `logging` module for our needs.
|
|
|
|
How we log depends on whether the `--print` option was
|
|
provided on the command line. Without the option, we log all
|
|
messages at DEBUG level or above, and write them to a file in
|
|
the directory specified by the `--logdir` option. With the
|
|
option, we write log messages to stdout; messages below INFO
|
|
level are discarded.
|
|
|
|
The log file is configured to rotate once a week on Friday
|
|
evening, preserving ~3 months worth of history.
|
|
|
|
@param arguments Command-line arguments as returned by
|
|
`ArgumentParser`
|
|
|
|
"""
|
|
root_logger = logging.getLogger()
|
|
if arguments.debug:
|
|
root_logger.setLevel(logging.INFO)
|
|
handler = logging.StreamHandler(sys.stdout)
|
|
handler.setFormatter(logging.Formatter())
|
|
else:
|
|
if not os.path.exists(arguments.logdir):
|
|
os.mkdir(arguments.logdir)
|
|
root_logger.setLevel(logging.DEBUG)
|
|
logfile = os.path.join(arguments.logdir, _LOGFILE)
|
|
handler = logging.handlers.TimedRotatingFileHandler(
|
|
logfile, when='W4', backupCount=13)
|
|
formatter = logging.Formatter(_LOG_FORMAT,
|
|
time_utils.TIME_FMT)
|
|
handler.setFormatter(formatter)
|
|
# TODO(jrbarnette) This is gross. Importing client.bin.utils
|
|
# implicitly imported logging_config, which calls
|
|
# logging.basicConfig() *at module level*. That gives us an
|
|
# extra logging handler that we don't want. So, clear out all
|
|
# the handlers here.
|
|
for h in root_logger.handlers:
|
|
root_logger.removeHandler(h)
|
|
root_logger.addHandler(handler)
|
|
|
|
|
|
def _populate_board_counts(inventory):
|
|
"""Gather board counts while providing interactive feedback.
|
|
|
|
Gathering the status of all individual DUTs in the lab can take
|
|
considerable time (~30 minutes at the time of this writing).
|
|
|
|
Normally, we pay that cost by querying as we go. However, with
|
|
the `--print` option, a human being may be watching the
|
|
progress. So, we force the first (expensive) queries to happen
|
|
up front, and provide a small ASCII progress bar to give an
|
|
indicator of how many boards have been processed.
|
|
|
|
@param inventory _LabInventory object with the inventory to
|
|
be gathered.
|
|
|
|
"""
|
|
n = 0
|
|
total_broken = 0
|
|
for counts in inventory.values():
|
|
n += 1
|
|
if n % 10 == 5:
|
|
c = '+'
|
|
elif n % 10 == 0:
|
|
c = '%d' % ((n / 10) % 10)
|
|
else:
|
|
c = '.'
|
|
sys.stdout.write(c)
|
|
sys.stdout.flush()
|
|
# This next call is where all the time goes - it forces all
|
|
# of a board's HostJobHistory objects to query the database
|
|
# and cache their results.
|
|
total_broken += counts.get_broken()
|
|
sys.stdout.write('\n')
|
|
sys.stdout.write('Found %d broken DUTs\n' % total_broken)
|
|
|
|
|
|
def main(argv):
    """Standard main routine.

    Parses the command line (exiting with status 1 if the arguments
    are rejected), configures logging, builds the lab inventory for
    the requested time window, and then produces whichever of the
    board and pool reports were requested.  A keyboard interrupt
    ends the run quietly; any other exception is logged with its
    traceback rather than propagated.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)

    try:
        # The reporting window ends now and reaches back
        # `--duration` hours.
        now = int(time.time())
        window_start = now - arguments.duration * 60 * 60
        stamp = time.strftime('%Y-%m-%d.%H', time.localtime(now))

        logging.debug('Starting lab inventory for %s', stamp)
        if arguments.board_notify:
            if arguments.recommend:
                logging.debug('Will include repair recommendations')
            logging.debug('Will include board inventory')
        if arguments.pool_notify:
            logging.debug('Will include pool inventory')

        afe = frontend_wrappers.RetryingAFE(server=None)
        inventory = _LabInventory.create_inventory(
                afe, window_start, now, arguments.boardnames)
        logging.info('Found %d hosts across %d boards',
                     inventory.get_num_duts(),
                     inventory.get_num_boards())

        if arguments.debug:
            _populate_board_counts(inventory)

        if arguments.board_notify:
            # The repair recommendation, when requested, leads the
            # board report.
            recommend_message = ''
            if arguments.recommend:
                recommendation = _generate_repair_recommendation(
                        inventory, arguments.recommend)
                recommend_message = recommendation + '\n\n\n'
            board_message = _generate_board_inventory_message(inventory)
            board_file = 'boards-%s.txt' % stamp
            board_subject = 'DUT board inventory %s' % stamp
            _send_email(arguments, board_file, board_subject,
                        arguments.board_notify,
                        recommend_message + board_message)

        if arguments.pool_notify:
            pool_message = _generate_pool_inventory_message(inventory)
            idle_message = _generate_idle_inventory_message(inventory)
            pool_file = 'pools-%s.txt' % stamp
            pool_subject = 'DUT pool inventory %s' % stamp
            _send_email(arguments, pool_file, pool_subject,
                        arguments.pool_notify,
                        pool_message + '\n\n\n' + idle_message)
    except KeyboardInterrupt:
        # Interrupted by the user; just stop.
        pass
    except EnvironmentError as e:
        logging.exception('Unexpected OS error: %s', e)
    except Exception as e:
        logging.exception('Unexpected exception: %s', e)
|
|
|
|
|
|
def get_inventory(afe):
    """Create a lab inventory covering the last 24 hours.

    @param afe  AFE object handed to
                `_LabInventory.create_inventory()`.
    @return The object returned by
            `_LabInventory.create_inventory()` for the interval
            that starts 24 hours ago and ends now.

    """
    now = int(time.time())
    one_day = 24 * 60 * 60
    return _LabInventory.create_inventory(afe, now - one_day, now)
|
|
|
|
|
|
def get_managed_boards(afe):
    """Report the managed boards found in a fresh inventory.

    @param afe  AFE object handed to `get_inventory()`.
    @return Whatever `get_managed_boards()` returns on the
            inventory produced by `get_inventory()`.

    """
    inventory = get_inventory(afe)
    return inventory.get_managed_boards()
|
|
|
|
|
|
# Script entry point:  main() expects the complete argument vector,
# program name included, so pass sys.argv unmodified.
if __name__ == '__main__':
    main(sys.argv)
|