sandbox/linux/seccomp_bpf/sandbox_bpf.cc - Issue 10458040: Initial snapshot of the new BPF-enabled seccomp sandbox. This code is

Unified Diff: sandbox/linux/seccomp_bpf/sandbox_bpf.cc

Issue 10458040: Initial snapshot of the new BPF-enabled seccomp sandbox. This code is (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: Created 8 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: sandbox/linux/seccomp_bpf/sandbox_bpf.cc

===================================================================

--- sandbox/linux/seccomp_bpf/sandbox_bpf.cc (revision 0)

+++ sandbox/linux/seccomp_bpf/sandbox_bpf.cc (revision 0)

@@ -0,0 +1,396 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "sandbox_bpf.h"

+namespace playground2 {

+int Sandbox::supportsSeccompSandbox(int proc_fd) { // Probes (once; result cached in status_) whether a seccomp BPF filter can be installed. Returns nonzero if available.

jln (very slow on Chromium) 2012/05/30 20:31:21 It looks like content/common/sandbox_init_linux.cc

+ if (status_ == STATUS_UNKNOWN) {

+ if (!isSingleThreaded(proc_fd)) { // A process that is not single-threaded is reported as unsupported.

+ status_ = STATUS_UNSUPPORTED;

+ } else {

+ pid_t pid = fork(); // The probe filter is installed in a child, never in this process.

+ if (pid < 0) {

+ die("Failed to check for sandbox support");

+ }

+ if (!pid) {

+ static const struct sock_filter filter[] = {

+ // If the architecture doesn't match SECCOMP_ARCH, disallow the

+ // system call.

+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS,

+ offsetof(struct arch_seccomp_data, arch)),

+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 0, 3), // NOTE(review): counted from the next instruction, jf=3 lands on the SECCOMP_RET_ALLOW entry, which contradicts the comment above - confirm offsets.

+ // Check the system call number. The only calls treated specially are

+ // getpid(), which is made to fail with EPERM, and exit_group().

+ BPF_STMT(BPF_LD+BPF_W+BPF_ABS,

+ offsetof(struct arch_seccomp_data, nr)),

+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_getpid, 2, 1), // jt=2 selects the EPERM entry. NOTE(review): jf=1 skips the exit_group test and lands on SECCOMP_RET_ALLOW - confirm offsets.

+ BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit_group, 0, 2), // jt=0 -> ALLOW, jf=2 -> KILL

+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),

+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO | EPERM),

+ BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),

+ };

+ // Try to install the filter. If that works and getpid() is then denied

+ // with EPERM, leave via exit_group(0) so the parent sees exit status 0.

+ const struct sock_fprog prog = {

+ ARRAYSIZE(filter),

+ (struct sock_filter *)filter

+ };

+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0 &&

+ prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == 0 &&

+ syscall(__NR_getpid) == -1 && errno == EPERM) {

+ syscall(__NR_exit_group, (intptr_t)0);

+ }

+ _exit(1); // Any failed step above ends the child with a nonzero status.

+ }

+ int status;

+ TEMP_FAILURE_RETRY(waitpid(pid, &status, 0)); // NOTE(review): the result of waitpid() is not checked; 'status' stays uninitialized if it fails.

+ status_ = WIFEXITED(status) && !WEXITSTATUS(status) // A normal exit with status 0 means the probe succeeded.

+ ? STATUS_AVAILABLE : STATUS_UNSUPPORTED;

+ }

+ return status_ == STATUS_AVAILABLE;

+void Sandbox::setProcFd(int proc_fd) { // Records the descriptor that startSandbox() will use for /proc; while proc_fd_ is negative, startSandbox() opens "/proc" itself.

+ proc_fd_ = proc_fd;

+void Sandbox::startSandbox() { // Enables the sandbox for the current process; calls die() if any precondition below is not met.

+ if (status_ == STATUS_UNSUPPORTED) {

+ die("Trying to start sandbox, even though it is known to be unavailable");

+ }

+ if (proc_fd_ < 0) { // No descriptor was supplied through setProcFd(); try to open /proc directly.

+ proc_fd_ = open("/proc", O_RDONLY|O_DIRECTORY);

+ }

+ if (proc_fd_ < 0) {

+ die("Cannot access /proc");

+ }

+ if (!isSingleThreaded(proc_fd_)) {

+ die("Cannot start sandbox, if process is already multi-threaded");

+ }

+ disableFilesystem(); // NOTE(review): the bool result is ignored, so a "false" return goes unnoticed - confirm this is intended.

+ installFilter();

+ // We no longer need access to any files in /proc

+ if (proc_fd_ >= 0) {

+ if (TEMP_FAILURE_RETRY(close(proc_fd_))) {

+ die("Failed to close file descriptor for /proc");

+ }

+ proc_fd_ = -1; // Mark the descriptor as gone so it is not reused.

+ }

+bool Sandbox::isSingleThreaded(int proc_fd) { // Returns true only if "self/task" under proc_fd can be opened, stat'ed and closed, and its link count is exactly 3.

+ struct stat sb;

+ int task = -1;

+ if (proc_fd < 0 ||

+ (task = openat(proc_fd, "self/task", O_RDONLY|O_DIRECTORY)) < 0 ||

+ fstat(task, &sb) != 0 ||

+ sb.st_nlink != 3 || // presumably ".", ".." and one entry for the only thread - TODO confirm

+ TEMP_FAILURE_RETRY(close(task))) { // NOTE(review): if this close() fails, 'task' is still >= 0 and is closed a second time below.

+ if (task >= 0) {

+ TEMP_FAILURE_RETRY(close(task));

+ }

+ return false;

+ }

+ return true;

+bool Sandbox::disableFilesystem() { // Tries to chroot() into a directory obtained from a short-lived child. Returns true if fchdir()+chroot() succeeded, false if either they or PR_SET_NO_NEW_PRIVS failed.

jln (very slow on Chromium) 2012/05/30 20:31:21 Looks good, but this should be kept independent of

+ // Some versions of PR_SET_NO_NEW_PRIVS allow unprivileged processes

+ // to call chroot(). If this feature is available in the kernel, move

+ // us into a non-existent directory.

+ // This is slightly more difficult than it sounds. We don't want

+ // to actually create a directory anywhere, as that is difficult

+ // to do securely. Instead, we rely on the /proc filesystem to

+ // give us a directory for our child process. We can then remove

+ // this directory by terminating the process.

+ // Also, pass the file descriptor from the child process to the

+ // parent rather than opening the directory by "/proc/${PID}". The

+ // latter doesn't necessarily work, if somebody already pushed us

+ // into a new pid namespace. Access by "/proc/self" is more reliable.

+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {

+ return false;

+ }

+ int fds[2];

+ pid_t pid;

+ if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, fds) < 0 ||

+ (pid = fork()) < 0) {

+ chroot_failed: // Also reached by goto when the descriptor cannot be received from the child.

+ die("Failed to isolate file system accesses");

+ }

+ if (!pid) { // Child: open its own "self/fdinfo" directory and hand the descriptor to the parent.

+ TEMP_FAILURE_RETRY(close(fds[1]));

+ prctl(PR_SET_DUMPABLE, 1);

+ fds[1] = openat(proc_fd_, "self/fdinfo", O_RDONLY|O_DIRECTORY);

+ if (fds[1] >= 0) {

+ Util::sendFds(fds[0], NULL, 0, fds[1], -1); // presumably sends fds[1] over the socket fds[0] - see Util::sendFds

+ }

+ _exit(0);

+ }

+ TEMP_FAILURE_RETRY(close(fds[0]));

+ if (!Util::getFds(fds[1], NULL, 0, &fds[0], NULL)) { // presumably stores the received descriptor in fds[0] - see Util::getFds

+ goto chroot_failed;

+ }

+ bool rc = false;

+ if (fchdir(fds[0]) == 0 && chroot(".") == 0) {

+ rc = true;

+ }

+ TEMP_FAILURE_RETRY(close(fds[0]));

+ TEMP_FAILURE_RETRY(close(fds[1]));

+ TEMP_FAILURE_RETRY(waitpid(pid, NULL, 0)); // Reap the child.

+ return rc;

+int Sandbox::jumpTableSize(int numSyscalls, bool recursing) { // Returns an instruction count for a table of numSyscalls entries, using the same case split (chunks of 160, runs of <= 3, halving) as jumpTable().

+ int ret = 0;

+ if (numSyscalls <= 0) {

+ // Nothing to do

+ } else if (numSyscalls > 160) {

+ for (int i = 0; i < numSyscalls; i += 160) {

+ ret += jumpTableSize(std::min(160, numSyscalls-i)); // NOTE(review): one-argument call - assumes 'recursing' has a default in the header, confirm.

+ }

+ } else {

+ if (numSyscalls <= 3) {

+ ret += numSyscalls; // one comparison per entry

+ } else {

+ int m = numSyscalls/2;

+ ret += 1 + jumpTableSize(m, true) + jumpTableSize(numSyscalls-m, true); // one pivot instruction plus both halves

+ }

+ if (!recursing) {

+ ++ret; // counted only at the outermost level; jumpTable() appends its return instruction at the same point

+ }

+ return ret;

+void Sandbox::verifyJumpTable(struct sock_filter *filter, int numInsn, // Interprets the numInsn instructions at 'filter' for each candidate number and calls die() if the outcome disagrees with syscallList.

+ const int *syscallList, int numSyscalls) {

+ if (numSyscalls <= 0) {

+ if (numInsn != 0) { // An empty list must have produced no instructions.

+ failed:

+ die("Failed to assemble jump table");

+ }

+ return;

+ }

+ int j = 0;

+ for (int i = syscallList[0]-1; i <= syscallList[numSyscalls-1]+1; ++i) { // Covers one below the first through one above the last entry; assumes syscallList is sorted ascending.

+ for (; j < numSyscalls && syscallList[j] < i; ++j) { }

+ bool present = j < numSyscalls && syscallList[j] == i;

+ for (int ip = 0; ip < numInsn; ++ip) {

+ if (filter[ip].code == BPF_JMP+BPF_JEQ+BPF_K) {

+ ip += i == (int)filter[ip].k ? filter[ip].jt : filter[ip].jf; // Offsets are relative to the following instruction; the loop's ++ip adds the remaining one.

+ } else if (filter[ip].code == BPF_JMP+BPF_JGE+BPF_K) {

+ ip += i >= (int)filter[ip].k ? filter[ip].jt : filter[ip].jf;

+ } else if (filter[ip].code == BPF_RET+BPF_K) { // Reaching a return is only correct for numbers that are in the list.

+ if (!present) {

+ goto failed;

+ } else {

+ goto ok;

+ }

+ } else { // Any other opcode is unexpected in a jump table.

+ goto failed;

+ }

+ if (ip >= numInsn) {

+ goto failed;

+ }

+ if (present) { // Fell off the end without reaching a return, which is only correct for numbers not in the list.

+ goto failed;

+ }

+ ok:;

+ }

+ return;

+static int cmp(const void *a, const void *b) { // qsort() comparator for int elements, ascending order.

+ return *(const int *)a - *(const int *)b; // NOTE(review): the subtraction overflows when the two values differ by more than INT_MAX.

+int Sandbox::jumpTable(struct sock_filter *filter, int *idx, // Appends to 'filter', starting at *idx, comparisons that lead to "return ret" for the listed system calls; advances *idx and returns the number of instructions added.

+ const int *syscallList, int numSyscalls,

+ int ret, bool sorted, bool recursing) {

+ const int origIdx = *idx;

+ // If the list of system calls is not yet sorted, we have to do that now.

+ const int *list;

+ int l[sorted ? 0 : numSyscalls]; // NOTE(review): variable-length (and possibly zero-length) array, a compiler extension rather than standard C++.

+ if (sorted) {

+ list = syscallList;

+ } else {

+ memcpy(l, syscallList, sizeof(int)*numSyscalls);

+ qsort(l, numSyscalls, sizeof(int), cmp);

+ list = l;

+ }

+ // If the list of system calls is too big, we have to split it. That allows

+ // us to avoid jumps that are longer than 256 instructions.

+ if (numSyscalls <= 0) {

+ // Nothing to do

+ } else if (numSyscalls > 160) {

+ for (int i = 0; i < numSyscalls; i += 160) {

+ jumpTable(filter, idx, list+i, std::min(160, numSyscalls-i), ret, true); // NOTE(review): 'recursing' is omitted here - assumes a default in the header, confirm.

+ }

+ verifyJumpTable(filter + origIdx, *idx - origIdx, list, numSyscalls);

+ } else {

+ if (numSyscalls <= 3) {

+ for (int i = 0; i < numSyscalls; ++i) {

+ // If outputting more than one comparison, only mark the very last one

+ // with BPF_JMP. When fixing up jump targets, we use this information

+ // to generate the correct if..else.. sequence of jumps. And at that

+ // point, we add the missing BPF_JMP into the filter.

+ filter[(*idx)++] = (struct sock_filter)

+ BPF_JUMP((i == numSyscalls-1 ? BPF_JMP : 0)+BPF_JEQ+BPF_K,

+ list[i], 0, 0);

+ }

+ } else {

+ int m = numSyscalls/2;

+ int x = (*idx)++; // Slot for the pivot comparison; its jt is patched once the lower half has been emitted.

+ filter[x] = (struct sock_filter)

+ BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, list[m], 0, 0);

+ jumpTable(filter, idx, list, m, ret, true, true);

+ if (*idx - x - 1 > 255) {

+ die("Failed to assemble jump table");

+ }

+ filter[x].jt = *idx - x - 1; // Values >= list[m] skip over the lower half.

+ jumpTable(filter, idx, list+m, numSyscalls-m, ret, true, true);

+ }

+ // If we are done recursing, fix up jump targets and insert the

+ // return statement.

+ if (!recursing) {

+ for (int i = origIdx; i < *idx; ++i) {

+ if (BPF_OP(filter[i].code) == BPF_JEQ) {

+ if (*idx - i > 255) {

+ die("Failed to assemble jump table");

+ }

+ filter[i].jt = *idx - i - 1; // On a match, jump to the return instruction that is written at *idx below.

+ if (BPF_CLASS(filter[i].code) == BPF_JMP) {

+ filter[i].jf = *idx - i; // Last comparison of a run: on mismatch, skip past that return instruction.

+ } else {

+ filter[i].code += BPF_JMP; // Add the class bits that were deliberately left out when the comparison was emitted.

+ }

+ filter[(*idx)++] = (struct sock_filter)BPF_STMT(BPF_RET+BPF_K, ret);

+ verifyJumpTable(filter + origIdx, *idx - origIdx, list, numSyscalls);

+ }

+ return *idx - origIdx;

+void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator, // Appends the (syscallEvaluator, argumentEvaluator) pair to evaluators_, which installFilter() iterates over.

+ EvaluateArguments argumentEvaluator) {

+ evaluators_.push_back(std::make_pair<EvaluateSyscall, EvaluateArguments>( // NOTE(review): explicit template arguments on std::make_pair stop compiling under C++11 when the arguments are lvalues - confirm the targeted standard.

+ syscallEvaluator, argumentEvaluator));

+void Sandbox::installFilter() { // Installs and unblocks the SIGSYS handler, walks every registered policy, then loads the seccomp filter; calls die() on failure.

+ // Set new SIGSYS handler

+ struct sigaction sa;

+ memset(&sa, 0, sizeof(sa));

+ sa.sa_sigaction = &sigSys;

+ sa.sa_flags = SA_SIGINFO;

+ if (sigaction(SIGSYS, &sa, NULL) < 0) {

+ filter_failed: // Shared failure exit, also reached by goto from the steps below.

+ die("Failed to configure system call filters");

+ }

+ // Unmask SIGSYS

+ sigset_t mask;

+ sigemptyset(&mask);

+ sigaddset(&mask, SIGSYS);

+ if (sigprocmask(SIG_UNBLOCK, &mask, NULL)) {

+ goto filter_failed;

+ }

+ // Static preamble at the beginning of the filter program

+ // static const struct sock_filter filterPreamble[] = {

+ // // If the architecture doesn't match SECCOMP_ARCH, disallow the

+ // // system call.

+ // BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, arch)),

+ // BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 1, 0),

+ // BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_DENY),

+ //

+ // // Grab the system call number, so that we can implement jump tables.

+ // BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, nr)),

+ // };

+ for (std::vector<std::pair<EvaluateSyscall, EvaluateArguments> >::

+ const_iterator iter = evaluators_.begin();

+ iter != evaluators_.end();

+ ++iter) {

+ EvaluateSyscall evaluateSyscall = iter->first;

+ EvaluateArguments evaluateArgs = iter->second;

+ int oldSysnum = INT32_MIN; // Start of the current run of system call numbers that share one result.

+ ErrorCode oldErr = evaluateSyscall(oldSysnum);

+ if (oldErr != evaluateSyscall(-1) || // The policy must answer INT32_MIN and -1 identically, and not with an inspect-argument code.

+ (oldErr >= SB_INSPECT_ARG_1 && oldErr <= SB_INSPECT_ARG_6)) {

+ policyErr:

+ die("Invalid sandbox policy");

+ }

+ for (int sysnum = 0; sysnum <= MAX_SYSCALL; ++sysnum) {

+ ErrorCode err = evaluateSyscall(sysnum);

+ if (err != oldErr) { // Result changed: close the previous run and start a new one at sysnum.

+ addRange(oldSysnum, sysnum-1, oldErr);

+ oldSysnum = sysnum;

+ oldErr = err;

+ }

+ if (oldErr != evaluateSyscall(INT32_MAX) || // The final run must agree with the answer for INT32_MAX.

+ (oldErr >= SB_INSPECT_ARG_1 && oldErr <= SB_INSPECT_ARG_6)) {

+ goto policyErr;

+ }

+ addRange(oldSysnum, INT32_MAX, oldErr);

+ /***/

+ // Install BPF filter program

+ const struct sock_fprog prog = { 0 /***/, 0 /***/ }; // NOTE(review): looks like a placeholder - both fields are 0, so no generated program is passed yet.

+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) ||

+ prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {

+ goto filter_failed;

+ }

+ return;

+void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) { // SIGSYS handler: writes a message naming the denied system call to fd 2 and stores an error value in the saved context's result register.

jln (very slow on Chromium) 2012/05/30 20:31:21 For the purpose of merging with Chris code, would

+ if (info->si_code != SYS_SECCOMP || !void_context) {

+ die("Unexpected SIGSYS received");

+ }

+ ucontext_t *ctx = (ucontext_t *)void_context;

+ int old_errno = errno; // Saved so the handler leaves errno as it found it.

+ void *rc =

+ (void *)(intptr_t)-(int)(SECCOMP_RET_DENY & SECCOMP_RET_DATA);

+ if (rc == (void *)(intptr_t)-(int)(SECCOMP_RET_DENY & SECCOMP_RET_DATA)) { // NOTE(review): rc was just assigned this exact value, so the condition is always true.

+ // sprintf() is not technically async-signal safe. But in glibc it

+ // tends to be much safer than calling fprintf() or any other higher-

+ // level I/O function.

+ /***/

+ char buf[80];

+ sprintf(buf, "Seccomp policy denies system call %ld\n",

+ (long int)ctx->uc_mcontext.gregs[REG_SYSCALL]);

+ if (TEMP_FAILURE_RETRY(write(2, buf, strlen(buf)))) {} // The empty if-body only consumes write()'s return value.

+ }

+ ctx->uc_mcontext.gregs[REG_RESULT] = (greg_t)rc; // presumably becomes the system call's return value once the handler returns - TODO confirm REG_RESULT mapping

+ errno = old_errno;

+ return;

+Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN; // Cached probe result; set by supportsSeccompSandbox().

+int Sandbox::proc_fd_ = -1; // Descriptor for /proc; negative until setProcFd() or startSandbox() provides one.

+std::vector<std::pair<Sandbox::EvaluateSyscall, // Policies registered through setSandboxPolicy().

+ Sandbox::EvaluateArguments> > Sandbox::evaluators_;

+} // namespace

Property changes on: sandbox/linux/seccomp_bpf/sandbox_bpf.cc

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « sandbox/linux/seccomp_bpf/sandbox_bpf.h ('k') | sandbox/linux/seccomp_bpf/util.h » ('j') | no next file with comments »