Index: sandbox/linux/seccomp_bpf/sandbox_bpf.cc |
=================================================================== |
--- sandbox/linux/seccomp_bpf/sandbox_bpf.cc (revision 0) |
+++ sandbox/linux/seccomp_bpf/sandbox_bpf.cc (revision 0) |
@@ -0,0 +1,396 @@ |
+// Copyright (c) 2012 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "sandbox_bpf.h" |
+ |
+ |
+namespace playground2 { |
+ |
+// Reports whether seccomp-BPF filtering can be used by this process. The |
+// result is computed on first use and cached in status_. A process that is |
+// not single-threaded (see isSingleThreaded()) is reported as unsupported. |
+// Otherwise a forked child installs a small probe filter; the child exits |
+// with status 0 only if the filter was installed and getpid() was denied |
+// with EPERM. Returns non-zero iff the sandbox is available. |
+int Sandbox::supportsSeccompSandbox(int proc_fd) { |
jln (very slow on Chromium)
2012/05/30 20:31:21
It looks like content/common/sandbox_init_linux.cc
|
+  if (status_ == STATUS_UNKNOWN) { |
+    if (!isSingleThreaded(proc_fd)) { |
+      status_ = STATUS_UNSUPPORTED; |
+    } else { |
+      pid_t pid = fork(); |
+      if (pid < 0) { |
+        die("Failed to check for sandbox support"); |
+      } |
+      if (!pid) { |
+        // Jump offsets count from the instruction after the jump. The |
+        // bracketed numbers are absolute indices into this array. |
+        static const struct sock_filter filter[] = { |
+          // [0-1] If the architecture doesn't match SECCOMP_ARCH, disallow |
+          // the system call by jumping to the kill action at [7]. |
+          BPF_STMT(BPF_LD+BPF_W+BPF_ABS, |
+                   offsetof(struct arch_seccomp_data, arch)), |
+          BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 0, 5), |
+ |
+          // [2-4] Check the system call number. The only allowed call is |
+          // exit_group(). getpid() fails with EPERM so that the probe |
+          // below can observe the filter; everything else is killed. |
+          BPF_STMT(BPF_LD+BPF_W+BPF_ABS, |
+                   offsetof(struct arch_seccomp_data, nr)), |
+          BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_getpid, 2, 0), |
+          BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit_group, 0, 2), |
+          BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),          // [5] |
+          BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO | EPERM),  // [6] |
+          BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),           // [7] |
+        }; |
+ |
+        // Try to install filter. If we succeed, return success. |
+        const struct sock_fprog prog = { |
+          ARRAYSIZE(filter), |
+          (struct sock_filter *)filter |
+        }; |
+        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0 && |
+            prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == 0 && |
+            syscall(__NR_getpid) == -1 && errno == EPERM) { |
+          syscall(__NR_exit_group, (intptr_t)0); |
+        } |
+        _exit(1); |
+      } |
+      // A failed waitpid() counts as a failed probe; never inspect a status |
+      // word that was not filled in. |
+      int status = 0; |
+      const bool reaped = TEMP_FAILURE_RETRY(waitpid(pid, &status, 0)) == pid; |
+      status_ = reaped && WIFEXITED(status) && !WEXITSTATUS(status) |
+                ? STATUS_AVAILABLE : STATUS_UNSUPPORTED; |
+    } |
+  } |
+  return status_ == STATUS_AVAILABLE; |
+} |
+ |
+// Remembers a caller-supplied descriptor for /proc. startSandbox() opens |
+// /proc on its own only when the stored descriptor is negative. |
+void Sandbox::setProcFd(int proc_fd) { |
+  Sandbox::proc_fd_ = proc_fd; |
+} |
+ |
+// Activates the sandbox for the calling process: makes sure a descriptor for |
+// /proc is available, insists on a single-threaded process, restricts file |
+// system access, installs the system call filter and finally closes the |
+// /proc descriptor. Every precondition failure is reported through die(). |
+void Sandbox::startSandbox() { |
+  if (status_ == STATUS_UNSUPPORTED) { |
+    die("Trying to start sandbox, even though it is known to be unavailable"); |
+  } |
+ |
+  // Open /proc ourselves when the caller did not hand us a descriptor. |
+  if (proc_fd_ < 0) { |
+    proc_fd_ = open("/proc", O_RDONLY|O_DIRECTORY); |
+    if (proc_fd_ < 0) { |
+      die("Cannot access /proc"); |
+    } |
+  } |
+ |
+  const bool singleThreaded = isSingleThreaded(proc_fd_); |
+  if (!singleThreaded) { |
+    die("Cannot start sandbox, if process is already multi-threaded"); |
+  } |
+ |
+  disableFilesystem(); |
+  installFilter(); |
+ |
+  // We no longer need access to any files in /proc |
+  if (proc_fd_ >= 0) { |
+    const int closeResult = TEMP_FAILURE_RETRY(close(proc_fd_)); |
+    if (closeResult != 0) { |
+      die("Failed to close file descriptor for /proc"); |
+    } |
+    proc_fd_ = -1; |
+  } |
+} |
+ |
+// Returns true iff "self/task" below proc_fd can be opened, its link count |
+// is exactly 3 and the directory could be closed again without error. |
+// NOTE(review): a link count of 3 presumably stands for ".", ".." and a |
+// single task entry, i.e. one thread -- confirm against procfs behaviour. |
+// The task descriptor is closed exactly once on every path. |
+bool Sandbox::isSingleThreaded(int proc_fd) { |
+  if (proc_fd < 0) { |
+    return false; |
+  } |
+  const int task = openat(proc_fd, "self/task", O_RDONLY|O_DIRECTORY); |
+  if (task < 0) { |
+    return false; |
+  } |
+  struct stat sb; |
+  const bool oneTask = fstat(task, &sb) == 0 && sb.st_nlink == 3; |
+  // A failing close() still makes the answer "false", but it must not be |
+  // followed by a second close() of the same descriptor number. |
+  const bool closed = TEMP_FAILURE_RETRY(close(task)) == 0; |
+  return oneTask && closed; |
+} |
+ |
+// Tries to lock the process into a directory that ceases to exist, using |
+// chroot(). Returns false when PR_SET_NO_NEW_PRIVS cannot be set or when |
+// fchdir()/chroot() fails, and true once chroot() succeeded. Failure to |
+// create the socket pair, to fork, or to receive the directory descriptor |
+// from the child is fatal (die()). Reads proc_fd_ as the base directory for |
+// the openat() call in the child. |
+bool Sandbox::disableFilesystem() { |
jln (very slow on Chromium)
2012/05/30 20:31:21
Looks good, but this should be kept independent of
|
+ // Some versions of PR_SET_NO_NEW_PRIVS allow unprivileged processes |
+ // to call chroot(). If this feature is available in the kernel, move |
+ // us into a non-existent directory. |
+ // This is slightly more difficult than it sounds. We don't want |
+ // to actually create a directory anywhere, as that is difficult |
+ // to do securely. Instead, we rely on the /proc filesystem to |
+ // give us a directory for our child process. We can then remove |
+ // this directory by terminating the process. |
+ // Also, pass the file descriptor from the child process to the |
+ // parent rather than opening the directory by "/proc/${PID}". The |
+ // latter doesn't necessarily work, if somebody already pushed us |
+ // into a new pid namespace. Access by "/proc/self" is more reliable. |
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { |
+ return false; |
+ } |
+ int fds[2]; |
+ pid_t pid; |
+ if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, fds) < 0 || |
+ (pid = fork()) < 0) { |
+ // Common fatal exit; the "goto chroot_failed" further down lands here too. |
+ chroot_failed: |
+ die("Failed to isolate file system accesses"); |
+ } |
+ if (!pid) { |
+ // Child: open "self/fdinfo" below proc_fd_ and hand the descriptor to the |
+ // parent over fds[0], then exit. |
+ // NOTE(review): Util::sendFds() is defined elsewhere; presumably it |
+ // transfers fds[1] across the socket -- confirm. |
+ TEMP_FAILURE_RETRY(close(fds[1])); |
+ // NOTE(review): the reason for forcing the dumpable flag is not stated |
+ // here; presumably it affects access to the /proc/self entries -- confirm. |
+ prctl(PR_SET_DUMPABLE, 1); |
+ fds[1] = openat(proc_fd_, "self/fdinfo", O_RDONLY|O_DIRECTORY); |
+ if (fds[1] >= 0) { |
+ Util::sendFds(fds[0], NULL, 0, fds[1], -1); |
+ } |
+ _exit(0); |
+ } |
+ // Parent: close the socket end used by the child, then wait for the |
+ // directory descriptor, which Util::getFds() presumably stores in fds[0]. |
+ TEMP_FAILURE_RETRY(close(fds[0])); |
+ if (!Util::getFds(fds[1], NULL, 0, &fds[0], NULL)) { |
+ goto chroot_failed; |
+ } |
+ // Enter the received directory and make it the new root; rc stays false if |
+ // either step fails. |
+ bool rc = false; |
+ if (fchdir(fds[0]) == 0 && chroot(".") == 0) { |
+ rc = true; |
+ } |
+ TEMP_FAILURE_RETRY(close(fds[0])); |
+ TEMP_FAILURE_RETRY(close(fds[1])); |
+ // Reap the child. Per the comment at the top, the directory is meant to |
+ // disappear once the child process has terminated. |
+ TEMP_FAILURE_RETRY(waitpid(pid, NULL, 0)); |
+ return rc; |
+} |
+ |
+// Computes the number of BPF instructions needed for a jump table over |
+// numSyscalls entries. The structure mirrors jumpTable(): lists longer than |
+// 160 entries are handled in chunks of 160, up to three entries cost one |
+// comparison each, larger lists cost one range comparison plus both halves, |
+// and a non-recursive call adds one instruction for the return statement. |
+int Sandbox::jumpTableSize(int numSyscalls, bool recursing) { |
+  if (numSyscalls <= 0) { |
+    // Nothing to do |
+    return 0; |
+  } |
+ |
+  // Oversized lists are sized chunk by chunk. |
+  if (numSyscalls > 160) { |
+    int total = 0; |
+    for (int start = 0; start < numSyscalls; start += 160) { |
+      total += jumpTableSize(std::min(160, numSyscalls - start)); |
+    } |
+    return total; |
+  } |
+ |
+  int count; |
+  if (numSyscalls > 3) { |
+    // One range comparison, followed by the two halves of the list. |
+    const int half = numSyscalls/2; |
+    count = 1 + jumpTableSize(half, true) + |
+            jumpTableSize(numSyscalls - half, true); |
+  } else { |
+    // One equality comparison per entry. |
+    count = numSyscalls; |
+  } |
+ |
+  // Only the outermost call accounts for the trailing return statement. |
+  return recursing ? count : count + 1; |
+} |
+ |
+// Self-check for a freshly assembled jump table. The numInsn instructions at |
+// filter are interpreted for every value from one below the first list entry |
+// to one above the last entry. A value that is in syscallList must reach a |
+// BPF_RET instruction; a value that is not in the list must run off the end |
+// of the fragment without reaching one. Any violation, any unexpected opcode |
+// and any jump beyond the end of the fragment ends in die(). |
+// syscallList is read as an ascending list: its first and last entries bound |
+// the tested range and j only ever moves forward. |
+void Sandbox::verifyJumpTable(struct sock_filter *filter, int numInsn, |
+ const int *syscallList, int numSyscalls) { |
+ if (numSyscalls <= 0) { |
+ // An empty list must not have produced any instructions. |
+ if (numInsn != 0) { |
+ // Common failure exit; also the target of every "goto failed" below. |
+ failed: |
+ die("Failed to assemble jump table"); |
+ } |
+ return; |
+ } |
+ int j = 0; |
+ for (int i = syscallList[0]-1; i <= syscallList[numSyscalls-1]+1; ++i) { |
+ // Advance j to the first list entry that is not smaller than i. |
+ for (; j < numSyscalls && syscallList[j] < i; ++j) { } |
+ bool present = j < numSyscalls && syscallList[j] == i; |
+ // Interpret the fragment with i as the compared value. The jump offset is |
+ // added here and the loop's ++ip then steps to the jump target. |
+ for (int ip = 0; ip < numInsn; ++ip) { |
+ if (filter[ip].code == BPF_JMP+BPF_JEQ+BPF_K) { |
+ ip += i == (int)filter[ip].k ? filter[ip].jt : filter[ip].jf; |
+ } else if (filter[ip].code == BPF_JMP+BPF_JGE+BPF_K) { |
+ ip += i >= (int)filter[ip].k ? filter[ip].jt : filter[ip].jf; |
+ } else if (filter[ip].code == BPF_RET+BPF_K) { |
+ // Reaching the return statement is correct only for listed values. |
+ if (!present) { |
+ goto failed; |
+ } else { |
+ goto ok; |
+ } |
+ } else { |
+ goto failed; |
+ } |
+ // The jump would land beyond the position just past the fragment. |
+ if (ip >= numInsn) { |
+ goto failed; |
+ } |
+ } |
+ // Ran off the end without returning: wrong if the value is in the list. |
+ if (present) { |
+ goto failed; |
+ } |
+ ok:; |
+ } |
+ return; |
+} |
+ |
+// qsort() comparator that orders two ints ascending. The operands are |
+// compared rather than subtracted, because the difference of two ints of |
+// opposite sign can overflow. |
+static int cmp(const void *a, const void *b) { |
+  const int lhs = *static_cast<const int *>(a); |
+  const int rhs = *static_cast<const int *>(b); |
+  return (lhs > rhs) - (lhs < rhs); |
+} |
+ |
+// Emits into filter[], starting at *idx, a BPF fragment that compares the |
+// accumulator against the numSyscalls entries of syscallList. On a match the |
+// fragment executes "return ret"; otherwise it runs off its end. *idx is |
+// advanced past the emitted instructions and the number of instructions |
+// written is returned. Pass sorted=true if syscallList is already in |
+// ascending order. recursing is set by the internal binary-search recursion |
+// only; it suppresses the jump fix-up and the return statement. |
+// Jumps that would exceed 255 instructions end in die(). |
+int Sandbox::jumpTable(struct sock_filter *filter, int *idx, |
+                       const int *syscallList, int numSyscalls, |
+                       int ret, bool sorted, bool recursing) { |
+  const int origIdx = *idx; |
+ |
+  // If the list of system calls is not yet sorted, we have to do that now. |
+  // The sorted copy lives in a std::vector: a variable-length array is a |
+  // compiler extension in C++ and is undefined for non-positive sizes. |
+  const int *list = syscallList; |
+  std::vector<int> sortedCopy; |
+  if (!sorted && numSyscalls > 0) { |
+    sortedCopy.assign(syscallList, syscallList + numSyscalls); |
+    qsort(&sortedCopy[0], sortedCopy.size(), sizeof(int), cmp); |
+    list = &sortedCopy[0]; |
+  } |
+ |
+  // If the list of system calls is too big, we have to split it. That allows |
+  // us to avoid jumps that are longer than 256 instructions. |
+  if (numSyscalls <= 0) { |
+    // Nothing to do |
+  } else if (numSyscalls > 160) { |
+    for (int i = 0; i < numSyscalls; i += 160) { |
+      jumpTable(filter, idx, list+i, std::min(160, numSyscalls-i), ret, true); |
+    } |
+    verifyJumpTable(filter + origIdx, *idx - origIdx, list, numSyscalls); |
+  } else { |
+    if (numSyscalls <= 3) { |
+      for (int i = 0; i < numSyscalls; ++i) { |
+        // If outputting more than one comparison, only mark the very last one |
+        // with BPF_JMP. When fixing up jump targets, we use this information |
+        // to generate the correct if..else.. sequence of jumps. And at that |
+        // point, we add the missing BPF_JMP into the filter. |
+        filter[(*idx)++] = (struct sock_filter) |
+          BPF_JUMP((i == numSyscalls-1 ? BPF_JMP : 0)+BPF_JEQ+BPF_K, |
+                   list[i], 0, 0); |
+      } |
+    } else { |
+      // Binary search step: values >= list[m] jump over the code for the |
+      // lower half, everything else falls through into it. |
+      int m = numSyscalls/2; |
+      int x = (*idx)++; |
+      filter[x] = (struct sock_filter) |
+        BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, list[m], 0, 0); |
+      jumpTable(filter, idx, list, m, ret, true, true); |
+      if (*idx - x - 1 > 255) { |
+        die("Failed to assemble jump table"); |
+      } |
+      filter[x].jt = *idx - x - 1; |
+      jumpTable(filter, idx, list+m, numSyscalls-m, ret, true, true); |
+    } |
+ |
+    // If we are done recursing, fix up jump targets and insert the |
+    // return statement. |
+    if (!recursing) { |
+      for (int i = origIdx; i < *idx; ++i) { |
+        if (BPF_OP(filter[i].code) == BPF_JEQ) { |
+          if (*idx - i > 255) { |
+            die("Failed to assemble jump table"); |
+          } |
+          // A match jumps to the return statement appended at *idx. |
+          filter[i].jt = *idx - i - 1; |
+          if (BPF_CLASS(filter[i].code) == BPF_JMP) { |
+            // Last comparison of its group: a mismatch skips the return. |
+            filter[i].jf = *idx - i; |
+          } else { |
+            // Earlier comparison: a mismatch falls through to the next one. |
+            filter[i].code += BPF_JMP; |
+          } |
+        } |
+      } |
+      filter[(*idx)++] = (struct sock_filter)BPF_STMT(BPF_RET+BPF_K, ret); |
+      verifyJumpTable(filter + origIdx, *idx - origIdx, list, numSyscalls); |
+    } |
+  } |
+ |
+  return *idx - origIdx; |
+} |
+ |
+// Registers a policy: the pair of callbacks is appended to evaluators_, |
+// which installFilter() iterates over. Earlier registrations are kept. |
+void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator, |
+                               EvaluateArguments argumentEvaluator) { |
+  // Let make_pair deduce its argument types. Spelling out the template |
+  // arguments stops compiling under C++11, where the parameters become |
+  // rvalue references that these lvalues cannot bind to. |
+  evaluators_.push_back( |
+      std::make_pair(syscallEvaluator, argumentEvaluator)); |
+} |
+ |
+// Installs the SIGSYS handler, unblocks SIGSYS and then, for every policy in |
+// evaluators_, checks the policy, reports its runs of equal verdicts through |
+// addRange() and loads a filter program with prctl(). All failures end in |
+// die(). |
+// NOTE(review): the sock_fprog passed to prctl() is still an empty |
+// placeholder and the /***/ markers indicate unfinished code -- confirm |
+// before relying on this function. |
+void Sandbox::installFilter() { |
+ // Set new SIGSYS handler |
+ struct sigaction sa; |
+ memset(&sa, 0, sizeof(sa)); |
+ sa.sa_sigaction = &sigSys; |
+ sa.sa_flags = SA_SIGINFO; |
+ if (sigaction(SIGSYS, &sa, NULL) < 0) { |
+ // Common failure exit; also the target of every "goto filter_failed". |
+ filter_failed: |
+ die("Failed to configure system call filters"); |
+ } |
+ |
+ // Unmask SIGSYS |
+ sigset_t mask; |
+ sigemptyset(&mask); |
+ sigaddset(&mask, SIGSYS); |
+ if (sigprocmask(SIG_UNBLOCK, &mask, NULL)) { |
+ goto filter_failed; |
+ } |
+ |
+ // Static preamble at the beginning of the filter program |
+ // static const struct sock_filter filterPreamble[] = { |
+ // // If the architecture doesn't match SECCOMP_ARCH, disallow the |
+ // // system call. |
+ // BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, arch)), |
+ // BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 1, 0), |
+ // BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_DENY), |
+ // |
+ // // Grab the system call number, so that we can implement jump tables. |
+ // BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, nr)), |
+ // }; |
+ |
+ for (std::vector<std::pair<EvaluateSyscall, EvaluateArguments> >:: |
+ const_iterator iter = evaluators_.begin(); |
+ iter != evaluators_.end(); |
+ ++iter) { |
+ EvaluateSyscall evaluateSyscall = iter->first; |
+ // NOTE(review): evaluateArgs is fetched but not used in the code shown. |
+ EvaluateArguments evaluateArgs = iter->second; |
+ // Negative numbers: the policy is sampled at INT32_MIN and at -1. Both |
+ // verdicts must agree and must not be one of the SB_INSPECT_ARG_* codes. |
+ int oldSysnum = INT32_MIN; |
+ ErrorCode oldErr = evaluateSyscall(oldSysnum); |
+ if (oldErr != evaluateSyscall(-1) || |
+ (oldErr >= SB_INSPECT_ARG_1 && oldErr <= SB_INSPECT_ARG_6)) { |
+ // Common policy failure exit; also the target of "goto policyErr". |
+ policyErr: |
+ die("Invalid sandbox policy"); |
+ } |
+ // Walk 0..MAX_SYSCALL and report each run of identical verdicts as one |
+ // range as soon as the verdict changes. |
+ for (int sysnum = 0; sysnum <= MAX_SYSCALL; ++sysnum) { |
+ ErrorCode err = evaluateSyscall(sysnum); |
+ if (err != oldErr) { |
+ addRange(oldSysnum, sysnum-1, oldErr); |
+ oldSysnum = sysnum; |
+ oldErr = err; |
+ } |
+ } |
+ // Numbers above MAX_SYSCALL: the verdict for INT32_MAX must equal that of |
+ // the final run and must not be one of the SB_INSPECT_ARG_* codes. |
+ if (oldErr != evaluateSyscall(INT32_MAX) || |
+ (oldErr >= SB_INSPECT_ARG_1 && oldErr <= SB_INSPECT_ARG_6)) { |
+ goto policyErr; |
+ } |
+ addRange(oldSysnum, INT32_MAX, oldErr); |
+ |
+ /***/ |
+ |
+ // Install BPF filter program |
+ const struct sock_fprog prog = { 0 /***/, 0 /***/ }; |
+ if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || |
+ prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { |
+ goto filter_failed; |
+ } |
+ } |
+ |
+ return; |
+} |
+ |
+// SIGSYS handler registered by installFilter(). Dies unless the signal |
+// carries si_code == SYS_SECCOMP and a context pointer. Otherwise it writes |
+// a message naming the system call number to file descriptor 2 and stores |
+// the negated data bits of SECCOMP_RET_DENY in the REG_RESULT register of |
+// the interrupted context. errno is saved on entry and restored on exit. |
+// The nr parameter is not used. |
+void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) { |
jln (very slow on Chromium)
2012/05/30 20:31:21
For the purpose of merging with Chris code, would
|
+ if (info->si_code != SYS_SECCOMP || !void_context) { |
+ die("Unexpected SIGSYS received"); |
+ } |
+ ucontext_t *ctx = (ucontext_t *)void_context; |
+ int old_errno = errno; |
+ void *rc = |
+ (void *)(intptr_t)-(int)(SECCOMP_RET_DENY & SECCOMP_RET_DATA); |
+ |
+ // NOTE(review): rc was assigned this very value just above, so the branch |
+ // is always taken at present; the /***/ marker suggests more logic is |
+ // planned here -- confirm. |
+ if (rc == (void *)(intptr_t)-(int)(SECCOMP_RET_DENY & SECCOMP_RET_DATA)) { |
+ // sprintf() is not technically async-signal safe. But in glibc it |
+ // tends to be much safer than calling fprintf() or any other higher- |
+ // level I/O function. |
+ /***/ |
+ char buf[80]; |
+ sprintf(buf, "Seccomp policy denies system call %ld\n", |
+ (long int)ctx->uc_mcontext.gregs[REG_SYSCALL]); |
+ // The empty if-body only consumes the result of write(). |
+ if (TEMP_FAILURE_RETRY(write(2, buf, strlen(buf)))) {} |
+ } |
+ |
+ ctx->uc_mcontext.gregs[REG_RESULT] = (greg_t)rc; |
+ errno = old_errno; |
+ return; |
+} |
+ |
+ |
+// Definitions of the static data members of Sandbox. |
+// Result of the seccomp support probe; STATUS_UNKNOWN until it has run. |
+Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN; |
+// Descriptor for /proc; negative while none has been set or opened. |
+int Sandbox::proc_fd_ = -1; |
+// Policies registered through setSandboxPolicy(). |
+std::vector<std::pair<Sandbox::EvaluateSyscall, |
+ Sandbox::EvaluateArguments> > Sandbox::evaluators_; |
+ |
+} // namespace playground2 |
Property changes on: sandbox/linux/seccomp_bpf/sandbox_bpf.cc |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |