# Copyright 2007-2010 Google Inc.  Released under the GPL v2
__author__ = "duanes (Duane Sand), pdahl (Peter Dahl)"

# A basic cpuset/cgroup container manager for limiting memory use during tests
#   for use on kernels not running some site-specific container manager

import os, sys, re, glob, fcntl, logging
from autotest_lib.client.bin import utils
from autotest_lib.client.common_lib import error

SUPER_ROOT = ''  # root of all containers or cgroups
NO_LIMIT = (1 << 63) - 1  # containername/memory.limit_in_bytes if no limit

# propio service classes:
PROPIO_PRIO = 1
PROPIO_NORMAL = 2
PROPIO_IDLE = 3

super_root_path = ''  # usually '/dev/cgroup'; '/dev/cpuset' on 2.6.18
cpuset_prefix = None  # usually 'cpuset.'; '' on 2.6.18
fake_numa_containers = False  # container mem via numa=fake mem nodes, else pages
mem_isolation_on = False
node_mbytes = 0  # mbytes in one typical mem node
root_container_bytes = 0  # squishy limit on effective size of root container


def discover_container_style():
    global super_root_path, cpuset_prefix
    global mem_isolation_on, fake_numa_containers
    global node_mbytes, root_container_bytes
    if super_root_path != '':
        return  # already looked up
    if os.path.exists('/dev/cgroup/tasks'):
        # running on 2.6.26 or later kernel with containers on:
        super_root_path = '/dev/cgroup'
        cpuset_prefix = 'cpuset.'
        if get_boot_numa():
            mem_isolation_on = fake_numa_containers = True
        else:  # memcg containers IFF compiled-in & mounted & non-fakenuma boot
            fake_numa_containers = False
            mem_isolation_on = os.path.exists(
                    '/dev/cgroup/memory.limit_in_bytes')
            # TODO: handle possibility of where memcg is mounted as its own
            #       cgroup hierarchy, separate from cpuset??
    elif os.path.exists('/dev/cpuset/tasks'):
        # running on 2.6.18 kernel with containers on:
        super_root_path = '/dev/cpuset'
        cpuset_prefix = ''
        mem_isolation_on = fake_numa_containers = get_boot_numa() != ''
    else:
        # neither cpuset nor cgroup filesystem active:
        super_root_path = None
        cpuset_prefix = 'no_cpusets_or_cgroups_exist'
        mem_isolation_on = fake_numa_containers = False

    logging.debug('mem_isolation: %s', mem_isolation_on)
    logging.debug('fake_numa_containers: %s', fake_numa_containers)
    if fake_numa_containers:
        node_mbytes = int(mbytes_per_mem_node())
    elif mem_isolation_on:  # memcg-style containers
        # For now, limit total of all containers to using just 98% of system's
        # visible total ram, to avoid oom events at system level, and avoid
        # page reclaim overhead from going above kswapd highwater mark.
        system_visible_pages = utils.memtotal() >> 2
        usable_pages = int(system_visible_pages * 0.98)
        root_container_bytes = usable_pages << 12
        logging.debug('root_container_bytes: %s',
                      utils.human_format(root_container_bytes))


def need_mem_containers():
    discover_container_style()
    if not mem_isolation_on:
        raise error.AutotestError('Mem-isolation containers not enabled '
                                  'by latest reboot')

def need_fake_numa():
    discover_container_style()
    if not fake_numa_containers:
        raise error.AutotestError('numa=fake not enabled by latest reboot')


def full_path(container_name):
    discover_container_style()
    return os.path.join(super_root_path, container_name)


def unpath(container_path):
    return container_path[len(super_root_path)+1:]


def cpuset_attr(container_name, attr):
    discover_container_style()
    return os.path.join(super_root_path, container_name, cpuset_prefix+attr)


def io_attr(container_name, attr):
    discover_container_style()
    # current version assumes shared cgroup hierarchy
    return os.path.join(super_root_path, container_name, 'io.'+attr)


def tasks_path(container_name):
    return os.path.join(full_path(container_name), 'tasks')


def mems_path(container_name):
    return cpuset_attr(container_name, 'mems')


def memory_path(container_name):
    return os.path.join(super_root_path, container_name, 'memory')


def cpus_path(container_name):
    return cpuset_attr(container_name, 'cpus')


def container_exists(name):
    return name is not None and os.path.exists(tasks_path(name))


def move_tasks_into_container(name, tasks):
    task_file = tasks_path(name)
    for task in tasks:
        try:
            logging.debug('moving task %s into container "%s"', task, name)
            utils.write_one_line(task_file, task)
        except Exception:
            if utils.pid_is_alive(task):
                raise  # task exists but couldn't move it
            # task is gone or zombie so ignore this exception


def move_self_into_container(name):
    me = str(os.getpid())
    move_tasks_into_container(name, [me])
    logging.debug('running self (pid %s) in container "%s"', me, name)
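
# Usage sketch (illustrative, not part of the original module; assumes a
# container 'mytest' already exists under the super root):
#     move_tasks_into_container('mytest', ['1234', '1235'])  # pids as strings
#     move_self_into_container('mytest')  # move the current process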


def _avail_mbytes_via_nodes(parent):
    # total mbytes of mem nodes available for new containers in parent
    free_nodes = available_exclusive_mem_nodes(parent)
    mbytes = nodes_avail_mbytes(free_nodes)
    # don't have exact model for how container mgr measures mem space
    # better here to underestimate than overestimate
    mbytes = max(mbytes - node_mbytes//2, 0)
    return mbytes


def _avail_bytes_via_pages(parent):
    # Get memory bytes available to parent container which could
    # be allocated exclusively to new child containers.
    # This excludes mem previously allocated to existing children.
    available = container_bytes(parent)
    mem_files_pattern = os.path.join(full_path(parent),
                                     '*', 'memory.limit_in_bytes')
    for mem_file in glob.glob(mem_files_pattern):
        child_container = unpath(os.path.dirname(mem_file))
        available -= container_bytes(child_container)
    return available


def avail_mbytes(parent=SUPER_ROOT):
    # total mbytes available in parent, for exclusive use in new containers
    if fake_numa_containers:
        return _avail_mbytes_via_nodes(parent)
    else:
        return _avail_bytes_via_pages(parent) >> 20
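
# Sizing sketch (illustrative; wanted_mbytes is a hypothetical variable):
# check headroom under the root before creating a new container:
#     free_mb = avail_mbytes(SUPER_ROOT)
#     mbytes = min(wanted_mbytes, free_mb)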


def delete_leftover_test_containers():
    # recover mems and cores tied up by containers of prior failed tests:
    for child in inner_containers_of(SUPER_ROOT):
        _release_container_nest(child)


def my_lock(lockname):
    # lockname is 'inner'
    lockdir = os.environ['AUTODIR']
    lockname = os.path.join(lockdir, '.cpuset.lock.'+lockname)
    lockfile = open(lockname, 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    return lockfile


def my_unlock(lockfile):
    fcntl.flock(lockfile, fcntl.LOCK_UN)
    lockfile.close()
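
# Locking sketch (illustrative): these helpers serialize container setup
# between parallel tests; pair them in try/finally, as
# _create_fake_numa_container_directly below does:
#     lockfile = my_lock('inner')
#     try:
#         pass  # create or resize containers here
#     finally:
#         my_unlock(lockfile)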


# Convert '1-3,7,9-12' to set(1,2,3,7,9,10,11,12)
def rangelist_to_set(rangelist):
    result = set()
    if not rangelist:
        return result
    for x in rangelist.split(','):
        if re.match(r'^(\d+)$', x):
            result.add(int(x))
            continue
        m = re.match(r'^(\d+)-(\d+)$', x)
        if m:
            start = int(m.group(1))
            end = int(m.group(2))
            result.update(set(range(start, end+1)))
            continue
        msg = 'Cannot understand data input: %s %s' % (x, rangelist)
        raise ValueError(msg)
    return result
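
# Illustrative examples of the parser above:
#     rangelist_to_set('1-3,7,9-12') == set([1, 2, 3, 7, 9, 10, 11, 12])
#     rangelist_to_set('4')          == set([4])
#     rangelist_to_set('')           == set()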


def my_container_name():
    # Get current process's inherited or self-built container name
    # within /dev/cpuset or /dev/cgroup. Is '' for root container.
    name = utils.read_one_line('/proc/%i/cpuset' % os.getpid())
    return name[1:]  # strip leading /


def get_mem_nodes(container_name):
    # all mem nodes now available to a container, both exclusive & shared
    file_name = mems_path(container_name)
    if os.path.exists(file_name):
        return rangelist_to_set(utils.read_one_line(file_name))
    else:
        return set()


def _busy_mem_nodes(parent_container):
    # Get set of numa memory nodes now used (exclusively or shared)
    # by existing children of parent container
    busy = set()
    mem_files_pattern = os.path.join(full_path(parent_container),
                                     '*', cpuset_prefix+'mems')
    for mem_file in glob.glob(mem_files_pattern):
        child_container = os.path.dirname(mem_file)
        busy |= get_mem_nodes(child_container)
    return busy


def available_exclusive_mem_nodes(parent_container):
    # Get subset of numa memory nodes of parent container which could
    # be allocated exclusively to new child containers.
    # This excludes nodes now allocated to existing children.
    need_fake_numa()
    available = get_mem_nodes(parent_container)
    available -= _busy_mem_nodes(parent_container)
    return available


def my_mem_nodes():
    # Get set of numa memory nodes owned by current process's container.
    discover_container_style()
    if not mem_isolation_on:
        return set()  # as expected by vmstress
    return get_mem_nodes(my_container_name())


def my_available_exclusive_mem_nodes():
    # Get subset of numa memory nodes owned by current process's
    # container, which could be allocated exclusively to new child
    # containers. This excludes any nodes now allocated
    # to existing children.
    return available_exclusive_mem_nodes(my_container_name())


def node_avail_kbytes(node):
    return node_mbytes << 10  # crude; fixed numa node size


def nodes_avail_mbytes(nodes):
    # nodes' combined user+avail size, in Mbytes
    return sum(node_avail_kbytes(n) for n in nodes) // 1024


def container_bytes(name):
    if fake_numa_containers:
        return nodes_avail_mbytes(get_mem_nodes(name)) << 20
    else:
        while True:
            file = memory_path(name) + '.limit_in_bytes'
            limit = int(utils.read_one_line(file))
            if limit < NO_LIMIT:
                return limit
            if name == SUPER_ROOT:
                return root_container_bytes
            name = os.path.dirname(name)


def container_mbytes(name):
    return container_bytes(name) >> 20
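
# Behavior note (illustrative): for memcg-style containers, container_bytes
# walks up the hierarchy until it finds an explicit limit, so a child whose
# memory.limit_in_bytes is NO_LIMIT reports its nearest limited ancestor's
# size, e.g. container_mbytes('big/sub') == container_mbytes('big').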


def mbytes_per_mem_node():
    # Get mbyte size of standard numa mem node, as float
    # (some nodes are bigger than this)
    # Replaces utils.node_size().
    numa = get_boot_numa()
    if numa.endswith('M'):
        return float(numa[:-1])  # mbyte size of fake nodes
    elif numa:
        nodecnt = int(numa)  # fake numa mem nodes for container isolation
    else:
        nodecnt = len(utils.numa_nodes())  # phys mem-controller nodes
    # Use guessed total physical mem size, not kernel's
    # lesser 'available memory' after various system tables.
    return utils.rounded_memtotal() / (nodecnt * 1024.0)


def get_cpus(container_name):
    file_name = cpus_path(container_name)
    if os.path.exists(file_name):
        return rangelist_to_set(utils.read_one_line(file_name))
    else:
        return set()


def get_tasks(container_name):
    file_name = tasks_path(container_name)
    try:
        tasks = [x.rstrip() for x in open(file_name).readlines()]
    except IOError:
        if os.path.exists(file_name):
            raise
        tasks = []  # container doesn't exist anymore
    return tasks


def inner_containers_of(parent):
    pattern = os.path.join(full_path(parent), '*/tasks')
    return [unpath(os.path.dirname(task_file))
            for task_file in glob.glob(pattern)]


def _release_container_nest(nest):
    # Destroy a container, and any nested sub-containers
    nest_path = full_path(nest)
    if os.path.exists(nest_path):

        # bottom-up walk of tree, releasing all nested sub-containers
        for child in inner_containers_of(nest):
            _release_container_nest(child)

        logging.debug("releasing container %s", nest)

        # Transfer any survivor tasks (e.g. self) to parent container
        parent = os.path.dirname(nest)
        move_tasks_into_container(parent, get_tasks(nest))

    # remove the now-empty outermost container of this nest
    if os.path.exists(nest_path):
        os.rmdir(nest_path)  # nested, or dead manager


def release_container(container_name=None):
    # Destroy a container
    my_container = my_container_name()
    if container_name is None:
        container_name = my_container
    _release_container_nest(container_name)
    displaced = my_container_name()
    if displaced != my_container:
        logging.debug('now running self (pid %d) in container "%s"',
                      os.getpid(), displaced)
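
# Teardown sketch (illustrative):
#     release_container()  # release this process's own container
#     delete_leftover_test_containers()  # reclaim nests of prior failed tests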


def remove_empty_prio_classes(prios):
    # remove prio classes whose set of allowed priorities is empty
    # e.g. 'no:3;rt:;be:3;id:' --> 'no:3;be:3'
    return ';'.join(p for p in prios.split(';') if p.split(':')[1])


def all_drive_names():
    # list of all disk drives sda,sdb,...
    paths = glob.glob('/sys/block/sd*')
    if not paths:
        paths = glob.glob('/sys/block/hd*')
    return [os.path.basename(path) for path in paths]


def set_io_controls(container_name, disks=[], ioprio_classes=[PROPIO_NORMAL],
                    io_shares=[95], io_limits=[0]):
    # set the propio controls for one container, for selected disks
    # writing directly to /dev/cgroup/container_name/io.io_service_level
    #     without using containerd or container.py
    # See wiki ProportionalIOScheduler for definitions
    # ioprio_classes: list of service classes, one per disk
    #     using numeric propio service classes as used by kernel API, namely
    #         1: RT, Real Time, aka PROPIO_PRIO
    #         2: BE, Best Effort, aka PROPIO_NORMAL
    #         3: PROPIO_IDLE
    # io_shares: list of disk-time-fractions, one per disk,
    #     as percentage integer 0..100
    # io_limits: list of limit on/off, one per disk
    #     0: no limit, shares use of other containers' unused disk time
    #     1: limited, container's use of disk time is capped to given DTF
    # ioprio_classes defaults to best-effort
    # io_limit defaults to no limit, use slack time
    if not disks:  # defaults to all drives
        disks = all_drive_names()
        io_shares = [io_shares[0]] * len(disks)
        ioprio_classes = [ioprio_classes[0]] * len(disks)
        io_limits = [io_limits[0]] * len(disks)
    if not (len(disks) == len(ioprio_classes) and len(disks) == len(io_shares)
            and len(disks) == len(io_limits)):
        raise error.AutotestError('Unequal number of values for io controls')
    service_level = io_attr(container_name, 'io_service_level')
    if not os.path.exists(service_level):
        return  # kernel predates propio features
                #   or io cgroup is mounted separately from cpusets
    disk_infos = []
    for disk, ioclass, limit, share in zip(disks, ioprio_classes,
                                           io_limits, io_shares):
        parts = (disk, str(ioclass), str(limit), str(share))
        disk_info = ' '.join(parts)
        utils.write_one_line(service_level, disk_info)
        disk_infos.append(disk_info)
    logging.debug('set_io_controls of %s to %s',
                  container_name, ', '.join(disk_infos))
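
# Example call (illustrative; drive names are hypothetical): give a container
# 50% best-effort disk time on sda and sdb, capped to that share:
#     set_io_controls('mytest', disks=['sda', 'sdb'],
#                     ioprio_classes=[PROPIO_NORMAL, PROPIO_NORMAL],
#                     io_shares=[50, 50], io_limits=[1, 1])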


def abbrev_list(vals):
    """Condense unsigned (0,4,5,6,7,10) to '0,4-7,10'."""
    ranges = []
    lower = 0
    upper = -2
    for val in sorted(vals)+[-1]:
        if val != upper+1:
            if lower == upper:
                ranges.append(str(lower))
            elif lower <= upper:
                ranges.append('%d-%d' % (lower, upper))
            lower = val
        upper = val
    return ','.join(ranges)
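
# Illustrative round trip: abbrev_list inverts rangelist_to_set for sorted,
# non-negative values:
#     abbrev_list([0, 4, 5, 6, 7, 10])            == '0,4-7,10'
#     rangelist_to_set(abbrev_list([1, 2, 3, 9])) == set([1, 2, 3, 9])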


def create_container_with_specific_mems_cpus(name, mems, cpus):
    need_fake_numa()
    os.mkdir(full_path(name))
    utils.write_one_line(cpuset_attr(name, 'mem_hardwall'), '1')
    utils.write_one_line(mems_path(name), ','.join(map(str, mems)))
    utils.write_one_line(cpus_path(name), ','.join(map(str, cpus)))
    logging.debug('container %s has %d cpus and %d nodes totalling %s bytes',
                  name, len(cpus), len(get_mem_nodes(name)),
                  utils.human_format(container_bytes(name)))


def create_container_via_memcg(name, parent, bytes, cpus):
    # create container via direct memcg cgroup writes
    os.mkdir(full_path(name))
    nodes = utils.read_one_line(mems_path(parent))
    utils.write_one_line(mems_path(name), nodes)  # inherit parent's nodes
    utils.write_one_line(memory_path(name)+'.limit_in_bytes', str(bytes))
    utils.write_one_line(cpus_path(name), ','.join(map(str, cpus)))
    logging.debug('Created container %s directly via memcg,'
                  ' has %d cpus and %s bytes',
                  name, len(cpus), utils.human_format(container_bytes(name)))


def _create_fake_numa_container_directly(name, parent, mbytes, cpus):
    need_fake_numa()
    lockfile = my_lock('inner')  # serialize race between parallel tests
    try:
        # Pick specific mem nodes for new cpuset's exclusive use
        # For now, arbitrarily pick highest available node numbers
        needed_kbytes = mbytes * 1024
        nodes = sorted(list(available_exclusive_mem_nodes(parent)))
        kbytes = 0
        nodecnt = 0
        while kbytes < needed_kbytes and nodecnt < len(nodes):
            nodecnt += 1
            kbytes += node_avail_kbytes(nodes[-nodecnt])
        if kbytes < needed_kbytes:
            parent_mbytes = container_mbytes(parent)
            if mbytes > parent_mbytes:
                raise error.AutotestError(
                    "New container's %d Mbytes exceeds "
                    "parent container's %d Mbyte size"
                    % (mbytes, parent_mbytes))
            else:
                raise error.AutotestError(
                    "Existing sibling containers hold "
                    "%d Mbytes needed by new container"
                    % ((needed_kbytes - kbytes)//1024))
        mems = nodes[-nodecnt:]

        create_container_with_specific_mems_cpus(name, mems, cpus)
    finally:
        my_unlock(lockfile)


def create_container_directly(name, mbytes, cpus):
    parent = os.path.dirname(name)
    if fake_numa_containers:
        _create_fake_numa_container_directly(name, parent, mbytes, cpus)
    else:
        create_container_via_memcg(name, parent, mbytes<<20, cpus)


def create_container_with_mbytes_and_specific_cpus(name, mbytes,
        cpus=None, root=SUPER_ROOT, io={}, move_in=True, timeout=0):
    """\
    Create a cpuset container and move job's current pid into it
    Allocate the list "cpus" of cpus to that container

    name = arbitrary string tag
    mbytes = requested memory for job in megabytes
    cpus = list of cpu indices to associate with the cpuset
           defaults to all cpus avail with given root
    root = the parent cpuset to nest this new set within
           '': unnested top-level container
    io = arguments for proportional IO containers
    move_in = True: Move current process into the new container now.
    timeout = must be 0: persist until explicitly deleted.
    """
    need_mem_containers()
    if not container_exists(root):
        raise error.AutotestError('Parent container "%s" does not exist'
                                  % root)
    if cpus is None:
        # default to biggest container we can make under root
        cpus = get_cpus(root)
    else:
        cpus = set(cpus)  # interface uses list
    if not cpus:
        raise error.AutotestError('Creating container with no cpus')
    name = os.path.join(root, name)  # path relative to super_root
    if os.path.exists(full_path(name)):
        raise error.AutotestError('Container %s already exists' % name)
    create_container_directly(name, mbytes, cpus)
    set_io_controls(name, **io)
    if move_in:
        move_self_into_container(name)
    return name
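
# End-to-end sketch (illustrative; name and sizes are hypothetical):
#     name = create_container_with_mbytes_and_specific_cpus(
#             'test_job', mbytes=512, cpus=[0, 1])
#     ...                      # run the memory-limited workload here
#     release_container(name)  # move self out and delete the container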


def get_boot_numa():
    # get boot-time numa=fake=xyz option for current boot
    #   eg numa=fake=nnn, numa=fake=nnnM, or nothing
    label = 'numa=fake='
    for arg in utils.read_one_line('/proc/cmdline').split():
        if arg.startswith(label):
            return arg[len(label):]
    return ''