Chromium Code Reviews| Index: sandbox/linux/seccomp_bpf/sandbox_bpf.cc |
| =================================================================== |
| --- sandbox/linux/seccomp_bpf/sandbox_bpf.cc (revision 0) |
| +++ sandbox/linux/seccomp_bpf/sandbox_bpf.cc (revision 0) |
| @@ -0,0 +1,311 @@ |
| +// Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "sandbox/linux/seccomp_bpf/sandbox_bpf.h" |
| + |
| +// The kernel gives us a sandbox, we turn it into a playground :-) |
| +// This is version 2 of the playground; version 1 was built on top of |
| +// pre-BPF seccomp mode. |
| +namespace playground2 { |
| + |
| +// Minimal policy used only by the kernel-support probe: it lets the probing |
| +// child verify that a filter was really installed and then exit cleanly. |
| +// |
| +// sysnum is the system call number being evaluated (installFilter() invokes |
| +// the evaluator once per system call number). |
| +// |
| +// Returns EPERM for getpid(), SB_ALLOWED for exit_group(), and EINVAL for |
| +// every other system call. |
| +Sandbox::ErrorCode Sandbox::probeEvaluator(int sysnum) { |
| +  switch (sysnum) { |
| +    case __NR_getpid: |
| +      // Return EPERM so that we can check that the filter actually ran. |
| +      return static_cast<ErrorCode>(EPERM); |
| +    case __NR_exit_group: |
| +      // Allow exit() with a non-default return code. |
| +      return SB_ALLOWED; |
| +    default: |
| +      // Make everything else fail in an easily recognizable way. |
| +      return static_cast<ErrorCode>(EINVAL); |
| +  } |
| +} |
| + |
| +// Probes for seccomp-BPF support by starting a trivial sandbox in a forked |
| +// child. The child exits with a magic status only if the probe policy was |
| +// really enforced; the parent returns true exactly when it sees that status. |
| +// proc_fd is forwarded to the child via setProcFd(). |
| +bool Sandbox::kernelSupportSeccompBPF(int proc_fd) { |
| +  // Exit status the child reports when the filter demonstrably ran. |
| +  const int kProbeExitCode = 100; |
| + |
| +  // Keep all signals blocked while we fork, so that an attacker cannot |
| +  // disturb the test by sending us an unexpected signal. |
| +  sigset_t allSignals, savedMask; |
| +  if (sigfillset(&allSignals) || |
| +      sigprocmask(SIG_BLOCK, &allSignals, &savedMask)) { |
| +    die("sigprocmask() failed"); |
| +  } |
| + |
| +  const pid_t child = fork(); |
| +  if (child < 0) { |
| +    // A failing fork() is fatal. We must not report "false" here: an |
| +    // attacker who can make fork() fail at will could otherwise trick us |
| +    // into running without a sandbox. (We would most likely fail soon |
| +    // anyway, as the machine is probably about to run out of memory.) |
| +    sigprocmask(SIG_SETMASK, &savedMask, NULL);  // OK, if it fails |
| +    die("fork() failed unexpectedly"); |
| +  } |
| + |
| +  if (child == 0) { |
| +    // Child: enable a very simple policy and check that it takes effect. |
| +    suppressLogging_ = true; |
| +    evaluators_.clear(); |
| +    setSandboxPolicy(probeEvaluator, NULL); |
| +    setProcFd(proc_fd); |
| +    startSandbox(); |
| +    if (syscall(__NR_getpid) < 0 && errno == EPERM) { |
| +      syscall(__NR_exit_group, static_cast<intptr_t>(kProbeExitCode)); |
| +    } |
| +    die(NULL); |
| +  } |
| + |
| +  // Parent: restore the signal mask, then collect the child's verdict. |
| +  if (sigprocmask(SIG_SETMASK, &savedMask, NULL)) { |
| +    die("sigprocmask() failed"); |
| +  } |
| +  int status; |
| +  if (HANDLE_EINTR(waitpid(child, &status, 0)) != child) { |
| +    die("waitpid() failed unexpectedly"); |
| +  } |
| +  const bool filterRan = |
| +      WIFEXITED(status) && WEXITSTATUS(status) == kProbeExitCode; |
| +  return filterRan; |
| +} |
| + |
| +// Reports whether the seccomp-BPF sandbox can be (or already is) enabled for |
| +// this process, caching the answer in status_. |
| +// |
| +// proc_fd is a descriptor for /proc that is handed to isSingleThreaded() and |
| +// to the kernel probe; with a negative value isSingleThreaded() optimistically |
| +// reports "single-threaded". |
| +// |
| +// Returns the updated status_: STATUS_ENABLED once the sandbox is running, |
| +// STATUS_AVAILABLE if it could be started now, STATUS_UNAVAILABLE if the |
| +// kernel supports it but the process is currently multi-threaded, and |
| +// STATUS_UNSUPPORTED if the kernel probe failed. |
| +Sandbox::SandboxStatus Sandbox::supportsSeccompSandbox(int proc_fd) { |
| +  // If the sandbox is currently active, we clearly must have support for |
| +  // sandboxing. |
| +  if (status_ == STATUS_ENABLED) { |
| +    return status_; |
| +  } |
| + |
| +  // Even if the sandbox was previously available, something might have |
| +  // changed in our run-time environment. Check one more time. |
| +  if (status_ == STATUS_AVAILABLE) { |
| +    if (!isSingleThreaded(proc_fd)) { |
| +      status_ = STATUS_UNAVAILABLE; |
| +    } |
| +    return status_; |
| +  } |
| + |
| +  if (status_ == STATUS_UNAVAILABLE && isSingleThreaded(proc_fd)) { |
| +    // All state transitions resulting in STATUS_UNAVAILABLE are immediately |
| +    // preceded by STATUS_AVAILABLE. Furthermore, these transitions all |
| +    // happen, if and only if they are triggered by the process being multi- |
| +    // threaded. |
| +    // In other words, if a single-threaded process is currently in the |
| +    // STATUS_UNAVAILABLE state, it is safe to assume that sandboxing is |
| +    // actually available. |
| +    // Note: this must be an assignment. A comparison here would leave the |
| +    // cached state stuck at STATUS_UNAVAILABLE. |
| +    status_ = STATUS_AVAILABLE; |
| +    return status_; |
| +  } |
| + |
| +  // If we have not previously checked for availability of the sandbox or if |
| +  // we otherwise don't believe to have a good cached value, we have to |
| +  // perform a thorough check now. |
| +  if (status_ == STATUS_UNKNOWN) { |
| +    status_ = kernelSupportSeccompBPF(proc_fd) |
| +        ? STATUS_AVAILABLE : STATUS_UNSUPPORTED; |
| + |
| +    // As we are performing our tests from a child process, the run-time |
| +    // environment that is visible to the sandbox is always guaranteed to be |
| +    // single-threaded. Let's check here whether the caller is single- |
| +    // threaded. Otherwise, we mark the sandbox as temporarily unavailable. |
| +    if (status_ == STATUS_AVAILABLE && !isSingleThreaded(proc_fd)) { |
| +      status_ = STATUS_UNAVAILABLE; |
| +    } |
| +  } |
| +  return status_; |
| +} |
| + |
| +// Remembers the descriptor for /proc that startSandbox() should use. A |
| +// negative value makes startSandbox() try to open /proc on its own. |
| +void Sandbox::setProcFd(int proc_fd) { |
| +  Sandbox::proc_fd_ = proc_fd; |
| +} |
| + |
| +// Enters the sandbox: checks the preconditions (sandbox not known to be |
| +// unavailable, not already enabled, process single-threaded), releases the |
| +// /proc descriptor, installs the filter and records STATUS_ENABLED. Every |
| +// violated precondition ends in die(). |
| +void Sandbox::startSandbox() { |
| +  switch (status_) { |
| +    case STATUS_UNSUPPORTED: |
| +    case STATUS_UNAVAILABLE: |
| +      die("Trying to start sandbox, even though it is known to be unavailable"); |
| +      break; |
| +    case STATUS_ENABLED: |
| +      die("Cannot start sandbox recursively. Use multiple calls to " |
| +          "setSandboxPolicy() to stack policies instead"); |
| +      break; |
| +    default: |
| +      break; |
| +  } |
| + |
| +  if (proc_fd_ < 0) { |
| +    // Nobody handed us a descriptor for /proc; try to obtain one ourselves. |
| +    // If that fails too we continue in degraded mode for now. In the |
| +    // future, we might want to tighten this requirement. |
| +    proc_fd_ = open("/proc", O_RDONLY|O_DIRECTORY); |
| +  } |
| + |
| +  if (!isSingleThreaded(proc_fd_)) { |
| +    die("Cannot start sandbox, if process is already multi-threaded"); |
| +  } |
| + |
| +  // /proc is not needed any longer. Release it before the filters go in, |
| +  // just in case our policy denies close(). |
| +  if (proc_fd_ >= 0) { |
| +    if (HANDLE_EINTR(close(proc_fd_))) { |
| +      die("Failed to close file descriptor for /proc"); |
| +    } |
| +    proc_fd_ = -1; |
| +  } |
| + |
| +  installFilter(); |
| + |
| +  // From here on we are running inside the sandbox. |
| +  status_ = STATUS_ENABLED; |
| +} |
| + |
| +// Reports whether the calling process currently has exactly one thread, |
| +// judged by the link count of the "self/task" directory below proc_fd. |
| +// |
| +// proc_fd is an open descriptor for /proc. If it is negative the answer |
| +// cannot be determined and the function optimistically returns true. |
| +// |
| +// Returns false if the directory cannot be opened, examined or closed, or if |
| +// its link count differs from 3. |
| +bool Sandbox::isSingleThreaded(int proc_fd) { |
| +  if (proc_fd < 0) { |
| +    // Cannot determine whether program is single-threaded. Hope for |
| +    // the best... |
| +    return true; |
| +  } |
| + |
| +  const int task = openat(proc_fd, "self/task", O_RDONLY|O_DIRECTORY); |
| +  if (task < 0) { |
| +    return false; |
| +  } |
| + |
| +  struct stat sb; |
| +  const bool singleThreaded = fstat(task, &sb) == 0 && sb.st_nlink == 3; |
| + |
| +  // Close the descriptor exactly once on every path. Closing it a second |
| +  // time after a failed close() could hit an unrelated descriptor that has |
| +  // reused the same number in the meantime. |
| +  if (HANDLE_EINTR(close(task))) { |
| +    return false; |
| +  } |
| +  return singleThreaded; |
| +} |
| + |
| +// Appends a policy, i.e. a pair of system call evaluator and argument |
| +// evaluator, to the list that installFilter() later turns into a filter. |
| +void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator, |
| +                               EvaluateArguments argumentEvaluator) { |
| +  const std::pair<EvaluateSyscall, EvaluateArguments> policy( |
| +      syscallEvaluator, argumentEvaluator); |
| +  evaluators_.push_back(policy); |
| +} |
| + |
| +// Translates the single registered policy into a BPF program and installs it |
| +// as this process's seccomp filter. Before doing so it installs sigSys() as |
| +// the SIGSYS handler and unblocks SIGSYS. Every failure ends in die(). |
| +void Sandbox::installFilter() { |
| +  static const char kFilterFailed[] = |
| +      "Failed to configure system call filters"; |
| + |
| +  // Verify that the user pushed a policy. |
| +  if (evaluators_.empty()) { |
| +    die(kFilterFailed); |
| +  } |
| + |
| +  // Set new SIGSYS handler |
| +  struct sigaction sa; |
| +  memset(&sa, 0, sizeof(sa)); |
| +  sa.sa_sigaction = &sigSys; |
| +  sa.sa_flags = SA_SIGINFO; |
| +  if (sigaction(SIGSYS, &sa, NULL) < 0) { |
| +    die(kFilterFailed); |
| +  } |
| + |
| +  // Unmask SIGSYS. The signal-set helpers are checked as well, just like |
| +  // sigfillset() is in kernelSupportSeccompBPF(). |
| +  sigset_t mask; |
| +  if (sigemptyset(&mask) || sigaddset(&mask, SIGSYS)) { |
| +    die(kFilterFailed); |
| +  } |

Chris Evans
2012/06/04 22:21:55
Nit: we took the trouble to check the sigfillset()
|
| +  if (sigprocmask(SIG_UNBLOCK, &mask, NULL)) { |
| +    die(kFilterFailed); |
| +  } |
| + |
| +  // We can't handle stacked evaluators, yet. We'll get there eventually |
| +  // though. Hang tight. |
| +  if (evaluators_.size() != 1) { |
| +    die("Not implemented"); |
| +  } |
| + |
| +  // If the architecture doesn't match SECCOMP_ARCH, disallow the |
| +  // system call. The jump skips the deny instruction on a match. |
| +  std::vector<struct sock_filter> program; |
| +  program.push_back((struct sock_filter) |
| +    BPF_STMT(BPF_LD+BPF_W+BPF_ABS, |
| +             offsetof(struct arch_seccomp_data, arch))); |
| +  program.push_back((struct sock_filter) |
| +    BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 1, 0)); |
| +  program.push_back((struct sock_filter) |
| +    BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO + SECCOMP_DENY_ERRNO)); |
| + |
| +  // Grab the system call number, so that we can implement jump tables. |
| +  program.push_back((struct sock_filter) |
| +    BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, nr))); |
| + |
| +  // Evaluate all possible system calls and depending on their |
| +  // exit codes generate a BPF filter. |
| +  // This is very inefficient right now. We need to be much smarter |
| +  // eventually. |

Jorge Lucangeli Obes
2012/06/04 22:16:14
We'll probably want to fix this (to at least avoid
Chris Evans
2012/06/04 22:21:55
Nit: note the actual run time in a comment. Julien
|
| +  EvaluateSyscall evaluateSyscall = evaluators_.begin()->first; |
| +  for (int sysnum = MIN_SYSCALL; sysnum <= MAX_SYSCALL; ++sysnum) { |
| +    ErrorCode err = evaluateSyscall(sysnum); |
| +    int ret; |
| +    switch (err) { |
| +      case SB_INSPECT_ARG_1...SB_INSPECT_ARG_6: |
| +        die("Not implemented"); |
| +      case SB_TRAP: |
| +        ret = SECCOMP_RET_TRAP; |
| +        break; |
| +      case SB_ALLOWED: |
| +        ret = SECCOMP_RET_ALLOW; |
| +        break; |
| +      default: |
| +        if (err >= static_cast<ErrorCode>(1) && |
| +            err <= static_cast<ErrorCode>(4095)) { |
| +          // We limit errno values to a reasonable range. In fact, the Linux |
| +          // ABI doesn't support errno values outside of this range: only |
| +          // return values from -4095 to -1 are treated as errors. |
| +          ret = SECCOMP_RET_ERRNO + err; |
| +        } else { |
| +          die("Invalid ErrorCode reported by sandbox system call evaluator"); |
| +        } |
| +        break; |
| +    } |
| +    // One compare-and-return pair per system call: on a match fall through |
| +    // to the return, otherwise skip it. |
| +    program.push_back((struct sock_filter) |
| +      BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, sysnum, 0, 1)); |
| +    program.push_back((struct sock_filter) |
| +      BPF_STMT(BPF_RET+BPF_K, ret)); |
| +  } |
| + |
| +  // Everything that isn't allowed is forbidden. Eventually, we would |
| +  // like to have a way to log forbidden calls, when in debug mode. |
| +  program.push_back((struct sock_filter) |
| +    BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO + SECCOMP_DENY_ERRNO)); |
| + |
| +  // The instruction count in struct sock_fprog is only 16 bits wide. Refuse |
| +  // to install a program whose length would be silently truncated. |
| +  if (program.size() > 0xFFFFu) { |
| +    die(kFilterFailed); |
| +  } |
| + |
| +  // Install BPF filter program |
| +  const struct sock_fprog prog = { |
| +    static_cast<unsigned short>(program.size()), &program[0] |
| +  }; |
| +  if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || |
| +      prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { |
| +    die(kFilterFailed); |
| +  } |
| +} |
| + |
| +// SIGSYS handler installed by installFilter(). It makes the trapped system |
| +// call appear to fail with the default deny errno by storing the negated |
| +// errno value in the REG_RESULT register of the interrupted context. The |
| +// handler leaves errno as it found it. |
| +void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) { |
| +  const bool isSeccompTrap = |
| +      nr == SIGSYS && info->si_code == SYS_SECCOMP && void_context; |
| +  if (!isSeccompTrap) { |

Chris Evans
2012/06/04 22:21:55
Nit: add a comment that die() might call LOG(FATAL
|
| +    die("Unexpected SIGSYS received"); |
| +  } |
| +  ucontext_t *context = static_cast<ucontext_t *>(void_context); |
| +  const int saved_errno = errno; |
| + |
| +  // By default, report failure with the default errno value (i.e. EPERM) in |
| +  // the REG_RESULT CPU register. |
| +  // The chain of casts below is deliberate. Some target platforms have |
| +  // pointers and CPU registers that are wider than int, the kernel ABI wants |
| +  // a negative value although errno values are usually positive, and errno |
| +  // constants could even have been defined as unsigned. Negate as int |
| +  // first, then widen. |
| +  void *rc = (void *)(intptr_t)-(int)SECCOMP_DENY_ERRNO; |
| + |
| +  // This is where we can add extra code to handle complex system calls. |
| +  // ... |
| + |
| +  context->uc_mcontext.gregs[REG_RESULT] = reinterpret_cast<greg_t>(rc); |
| +  errno = saved_errno; |
| +} |
| + |
| + |
| +// Definitions of the static data members of Sandbox. |
| + |
| +// Switched on by the probe child in kernelSupportSeccompBPF(). |
| +bool Sandbox::suppressLogging_ = false; |
| + |
| +// Cached sandbox state; starts out unknown until the first probe. |
| +Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN; |
| + |
| +// Descriptor for /proc; negative while none has been provided. |
| +int Sandbox::proc_fd_ = -1; |
| + |
| +// Policies registered through setSandboxPolicy(). |
| +std::vector< |
| +    std::pair<Sandbox::EvaluateSyscall, Sandbox::EvaluateArguments> > |
| +    Sandbox::evaluators_; |
| + |
| +} // namespace playground2 |
| Property changes on: sandbox/linux/seccomp_bpf/sandbox_bpf.cc |
| ___________________________________________________________________ |
| Added: svn:eol-style |
| + LF |