335 lines
10 KiB
C++
335 lines
10 KiB
C++
// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
|
|
|
|
#include "sandbox/linux/services/credentials.h"
|
|
|
|
#include <errno.h>
|
|
#include <limits.h>
|
|
#include <signal.h>
|
|
#include <stddef.h>
|
|
#include <stdint.h>
|
|
#include <stdio.h>
|
|
#include <sys/syscall.h>
|
|
#include <sys/types.h>
|
|
#include <sys/wait.h>
|
|
#include <unistd.h>
|
|
|
|
#include "base/bind.h"
|
|
#include "base/compiler_specific.h"
|
|
#include "base/files/file_path.h"
|
|
#include "base/files/file_util.h"
|
|
#include "base/logging.h"
|
|
#include "base/macros.h"
|
|
#include "base/posix/eintr_wrapper.h"
|
|
#include "base/process/launch.h"
|
|
#include "base/third_party/valgrind/valgrind.h"
|
|
#include "build/build_config.h"
|
|
#include "sandbox/linux/services/namespace_utils.h"
|
|
#include "sandbox/linux/services/proc_util.h"
|
|
#include "sandbox/linux/services/syscall_wrappers.h"
|
|
#include "sandbox/linux/services/thread_helpers.h"
|
|
#include "sandbox/linux/system_headers/capability.h"
|
|
#include "sandbox/linux/system_headers/linux_signal.h"
|
|
|
|
namespace sandbox {
|
|
|
|
namespace {
|
|
|
|
// Returns true when the RUNNING_ON_VALGRIND macro evaluates to a non-zero
// value.
bool IsRunningOnValgrind() {
  const bool under_valgrind = RUNNING_ON_VALGRIND;
  return under_valgrind;
}
|
|
|
|
// Checks that the set of RES-uids and the set of RES-gids have
|
|
// one element each and return that element in |resuid| and |resgid|
|
|
// respectively. It's ok to pass NULL as one or both of the ids.
|
|
bool GetRESIds(uid_t* resuid, gid_t* resgid) {
|
|
uid_t ruid, euid, suid;
|
|
gid_t rgid, egid, sgid;
|
|
PCHECK(sys_getresuid(&ruid, &euid, &suid) == 0);
|
|
PCHECK(sys_getresgid(&rgid, &egid, &sgid) == 0);
|
|
const bool uids_are_equal = (ruid == euid) && (ruid == suid);
|
|
const bool gids_are_equal = (rgid == egid) && (rgid == sgid);
|
|
if (!uids_are_equal || !gids_are_equal) return false;
|
|
if (resuid) *resuid = euid;
|
|
if (resgid) *resgid = egid;
|
|
return true;
|
|
}
|
|
|
|
// Exit status that the short-lived helper children created in this file pass
// to _exit(), and that their parents compare against after waitpid().
constexpr int kExitSuccess = 0;
|
|
|
|
#if defined(__clang__)
|
|
// Disable sanitizers that rely on TLS and may write to non-stack memory.
|
|
__attribute__((no_sanitize_address))
|
|
__attribute__((no_sanitize_thread))
|
|
__attribute__((no_sanitize_memory))
|
|
#endif
|
|
int ChrootToSelfFdinfo(void*) {
|
|
// This function can be run from a vforked child, so it should not write to
|
|
// any memory other than the stack or errno. Reads from TLS may be different
|
|
// from in the parent process.
|
|
RAW_CHECK(sys_chroot("/proc/self/fdinfo/") == 0);
|
|
|
|
// CWD is essentially an implicit file descriptor, so be careful to not
|
|
// leave it behind.
|
|
RAW_CHECK(chdir("/") == 0);
|
|
_exit(kExitSuccess);
|
|
}
|
|
|
|
// chroot() to an empty dir that is "safe". To be safe, it must not contain
|
|
// any subdirectory (chroot-ing there would allow a chroot escape) and it must
|
|
// be impossible to create an empty directory there.
|
|
// We achieve this by doing the following:
|
|
// 1. We create a new process sharing file system information.
|
|
// 2. In the child, we chroot to /proc/self/fdinfo/
|
|
// This is already "safe", since fdinfo/ does not contain another directory and
|
|
// one cannot create another directory there.
|
|
// 3. The process dies
|
|
// After (3) happens, the directory is not available anymore in /proc.
|
|
bool ChrootToSafeEmptyDir() {
|
|
// We need to chroot to a fdinfo that is unique to a process and have that
|
|
// process die.
|
|
// 1. We don't want to simply fork() because duplicating the page tables is
|
|
// slow with a big address space.
|
|
// 2. We do not use a regular thread (that would unshare CLONE_FILES) because
|
|
// when we are in a PID namespace, we cannot easily get a handle to the
|
|
// /proc/tid directory for the thread (since /proc may not be aware of the
|
|
// PID namespace). With a process, we can just use /proc/self.
|
|
pid_t pid = -1;
|
|
char stack_buf[PTHREAD_STACK_MIN] ALIGNAS(16);
|
|
#if defined(ARCH_CPU_X86_FAMILY) || defined(ARCH_CPU_ARM_FAMILY) || \
|
|
defined(ARCH_CPU_MIPS_FAMILY)
|
|
// The stack grows downward.
|
|
void* stack = stack_buf + sizeof(stack_buf);
|
|
#else
|
|
#error "Unsupported architecture"
|
|
#endif
|
|
|
|
int clone_flags = CLONE_FS | LINUX_SIGCHLD;
|
|
void* tls = nullptr;
|
|
#if defined(ARCH_CPU_X86_64) || defined(ARCH_CPU_ARM_FAMILY)
|
|
// Use CLONE_VM | CLONE_VFORK as an optimization to avoid copying page tables.
|
|
// Since clone writes to the new child's TLS before returning, we must set a
|
|
// new TLS to avoid corrupting the current process's TLS. On ARCH_CPU_X86,
|
|
// glibc performs syscalls by calling a function pointer in TLS, so we do not
|
|
// attempt this optimization.
|
|
clone_flags |= CLONE_VM | CLONE_VFORK | CLONE_SETTLS;
|
|
|
|
char tls_buf[PTHREAD_STACK_MIN] = {0};
|
|
tls = tls_buf;
|
|
#endif
|
|
|
|
pid = clone(ChrootToSelfFdinfo, stack, clone_flags, nullptr, nullptr, tls,
|
|
nullptr);
|
|
PCHECK(pid != -1);
|
|
|
|
int status = -1;
|
|
PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid);
|
|
|
|
return WIFEXITED(status) && WEXITSTATUS(status) == kExitSuccess;
|
|
}
|
|
|
|
// CHECK() that an attempt to move to a new user namespace raised an expected
|
|
// errno.
|
|
void CheckCloneNewUserErrno(int error) {
|
|
// EPERM can happen if already in a chroot. EUSERS if too many nested
|
|
// namespaces are used. EINVAL for kernels that don't support the feature.
|
|
// Valgrind will ENOSYS unshare().
|
|
PCHECK(error == EPERM || error == EUSERS || error == EINVAL ||
|
|
error == ENOSYS);
|
|
}
|
|
|
|
// Converts a Capability to the corresponding Linux CAP_XXX value.
|
|
int CapabilityToKernelValue(Credentials::Capability cap) {
|
|
switch (cap) {
|
|
case Credentials::Capability::SYS_CHROOT:
|
|
return CAP_SYS_CHROOT;
|
|
case Credentials::Capability::SYS_ADMIN:
|
|
return CAP_SYS_ADMIN;
|
|
}
|
|
|
|
LOG(FATAL) << "Invalid Capability: " << static_cast<int>(cap);
|
|
return 0;
|
|
}
|
|
|
|
} // namespace.
|
|
|
|
// static
|
|
bool Credentials::DropAllCapabilities(int proc_fd) {
|
|
if (!SetCapabilities(proc_fd, std::vector<Capability>())) {
|
|
return false;
|
|
}
|
|
|
|
CHECK(!HasAnyCapability());
|
|
return true;
|
|
}
|
|
|
|
// static
|
|
bool Credentials::DropAllCapabilities() {
|
|
base::ScopedFD proc_fd(ProcUtil::OpenProc());
|
|
return Credentials::DropAllCapabilities(proc_fd.get());
|
|
}
|
|
|
|
// static
|
|
bool Credentials::DropAllCapabilitiesOnCurrentThread() {
|
|
return SetCapabilitiesOnCurrentThread(std::vector<Capability>());
|
|
}
|
|
|
|
// static
|
|
bool Credentials::SetCapabilitiesOnCurrentThread(
|
|
const std::vector<Capability>& caps) {
|
|
struct cap_hdr hdr = {};
|
|
hdr.version = _LINUX_CAPABILITY_VERSION_3;
|
|
struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};
|
|
|
|
// Initially, cap has no capability flags set. Enable the effective and
|
|
// permitted flags only for the requested capabilities.
|
|
for (const Capability cap : caps) {
|
|
const int cap_num = CapabilityToKernelValue(cap);
|
|
const size_t index = CAP_TO_INDEX(cap_num);
|
|
const uint32_t mask = CAP_TO_MASK(cap_num);
|
|
data[index].effective |= mask;
|
|
data[index].permitted |= mask;
|
|
}
|
|
|
|
return sys_capset(&hdr, data) == 0;
|
|
}
|
|
|
|
// static
|
|
bool Credentials::SetCapabilities(int proc_fd,
|
|
const std::vector<Capability>& caps) {
|
|
DCHECK_LE(0, proc_fd);
|
|
|
|
#if !defined(THREAD_SANITIZER)
|
|
// With TSAN, accept to break the security model as it is a testing
|
|
// configuration.
|
|
CHECK(ThreadHelpers::IsSingleThreaded(proc_fd));
|
|
#endif
|
|
|
|
return SetCapabilitiesOnCurrentThread(caps);
|
|
}
|
|
|
|
bool Credentials::HasAnyCapability() {
|
|
struct cap_hdr hdr = {};
|
|
hdr.version = _LINUX_CAPABILITY_VERSION_3;
|
|
struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};
|
|
|
|
PCHECK(sys_capget(&hdr, data) == 0);
|
|
|
|
for (size_t i = 0; i < arraysize(data); ++i) {
|
|
if (data[i].effective || data[i].permitted || data[i].inheritable) {
|
|
return true;
|
|
}
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// Fetches the capability sets with sys_capget() (crashing via PCHECK if that
// fails) and returns true if the bit for |cap| is set in the effective,
// permitted or inheritable set.
bool Credentials::HasCapability(Capability cap) {
  struct cap_hdr hdr = {};
  hdr.version = _LINUX_CAPABILITY_VERSION_3;
  struct cap_data data[_LINUX_CAPABILITY_U32S_3] = {{}};

  PCHECK(sys_capget(&hdr, data) == 0);

  const int kernel_cap = CapabilityToKernelValue(cap);
  const struct cap_data& slot = data[CAP_TO_INDEX(kernel_cap)];
  const uint32_t bit = CAP_TO_MASK(kernel_cap);

  // Merge the three sets first, then test the single bit of interest.
  const uint32_t in_any_set =
      slot.effective | slot.permitted | slot.inheritable;
  return (in_any_set & bit) != 0;
}
|
|
|
|
// static
|
|
bool Credentials::CanCreateProcessInNewUserNS() {
|
|
// Valgrind will let clone(2) pass-through, but doesn't support unshare(),
|
|
// so always consider UserNS unsupported there.
|
|
if (IsRunningOnValgrind()) {
|
|
return false;
|
|
}
|
|
|
|
#if defined(THREAD_SANITIZER)
|
|
// With TSAN, processes will always have threads running and can never
|
|
// enter a new user namespace with MoveToNewUserNS().
|
|
return false;
|
|
#endif
|
|
|
|
// This is roughly a fork().
|
|
const pid_t pid = sys_clone(CLONE_NEWUSER | SIGCHLD, 0, 0, 0, 0);
|
|
|
|
if (pid == -1) {
|
|
CheckCloneNewUserErrno(errno);
|
|
return false;
|
|
}
|
|
|
|
// The parent process could have had threads. In the child, these threads
|
|
// have disappeared. Make sure to not do anything in the child, as this is a
|
|
// fragile execution environment.
|
|
if (pid == 0) {
|
|
_exit(kExitSuccess);
|
|
}
|
|
|
|
// Always reap the child.
|
|
int status = -1;
|
|
PCHECK(HANDLE_EINTR(waitpid(pid, &status, 0)) == pid);
|
|
CHECK(WIFEXITED(status));
|
|
CHECK_EQ(kExitSuccess, WEXITSTATUS(status));
|
|
|
|
// clone(2) succeeded, we can use CLONE_NEWUSER.
|
|
return true;
|
|
}
|
|
|
|
bool Credentials::MoveToNewUserNS() {
|
|
uid_t uid;
|
|
gid_t gid;
|
|
if (!GetRESIds(&uid, &gid)) {
|
|
// If all the uids (or gids) are not equal to each other, the security
|
|
// model will most likely confuse the caller, abort.
|
|
DVLOG(1) << "uids or gids differ!";
|
|
return false;
|
|
}
|
|
int ret = sys_unshare(CLONE_NEWUSER);
|
|
if (ret) {
|
|
const int unshare_errno = errno;
|
|
VLOG(1) << "Looks like unprivileged CLONE_NEWUSER may not be available "
|
|
<< "on this kernel.";
|
|
CheckCloneNewUserErrno(unshare_errno);
|
|
return false;
|
|
}
|
|
|
|
if (NamespaceUtils::KernelSupportsDenySetgroups()) {
|
|
PCHECK(NamespaceUtils::DenySetgroups());
|
|
}
|
|
|
|
// The current {r,e,s}{u,g}id is now an overflow id (c.f.
|
|
// /proc/sys/kernel/overflowuid). Setup the uid and gid maps.
|
|
DCHECK(GetRESIds(NULL, NULL));
|
|
const char kGidMapFile[] = "/proc/self/gid_map";
|
|
const char kUidMapFile[] = "/proc/self/uid_map";
|
|
PCHECK(NamespaceUtils::WriteToIdMapFile(kGidMapFile, gid));
|
|
PCHECK(NamespaceUtils::WriteToIdMapFile(kUidMapFile, uid));
|
|
DCHECK(GetRESIds(NULL, NULL));
|
|
return true;
|
|
}
|
|
|
|
// Chroots the process via ChrootToSafeEmptyDir() and then CHECKs that /proc
// is no longer reachable as a directory and that ProcUtil reports no open
// directory when queried with |proc_fd|. |proc_fd| must be non-negative.
// Every failure is fatal, so the only value ever returned is true.
bool Credentials::DropFileSystemAccess(int proc_fd) {
  CHECK_LE(0, proc_fd);

  CHECK(ChrootToSafeEmptyDir());
  CHECK(!base::DirectoryExists(base::FilePath("/proc")));
  CHECK(!ProcUtil::HasOpenDirectory(proc_fd));

  // This function is never allowed to fail softly.
  return true;
}
|
|
|
|
// Calls fork() and, in the child only, drops all capabilities of the current
// thread (crashing via PCHECK if that fails). Returns the value fork()
// returned: 0 in the child, and in the parent whatever fork() reported.
pid_t Credentials::ForkAndDropCapabilitiesInChild() {
  const pid_t pid = fork();
  if (pid == 0) {
    // A freshly forked child is single threaded, so acting on the current
    // thread is sufficient.
    PCHECK(DropAllCapabilitiesOnCurrentThread());
  }
  return pid;
}
|
|
|
|
} // namespace sandbox.
|