Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(219)

Unified Diff: sandbox/linux/seccomp-bpf/sandbox_bpf.cc

Issue 11419121: SECCOMP-BPF: Added support for greylisting of system calls. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Fixed system call forwarding on 32bit architectures Created 8 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: sandbox/linux/seccomp-bpf/sandbox_bpf.cc
diff --git a/sandbox/linux/seccomp-bpf/sandbox_bpf.cc b/sandbox/linux/seccomp-bpf/sandbox_bpf.cc
index eb03995904f362cf589289bc7139577a472958e0..ff855b8a9cb131119963fd0829544b5f00e41b2b 100644
--- a/sandbox/linux/seccomp-bpf/sandbox_bpf.cc
+++ b/sandbox/linux/seccomp-bpf/sandbox_bpf.cc
@@ -2,8 +2,27 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+#include <endian.h>
+#if __BYTE_ORDER == __BIG_ENDIAN
+// The BPF "struct seccomp_data" layout has to deal with storing 64bit
+// values that need to be inspected by a virtual machine that only ever
+// operates on 32bit values. The kernel developers decided how values
+// should be split into two 32bit words to achieve this goal. But at this
+// time, there is no existing BPF implementation in the kernel that uses
+// 64bit big endian values. So, all we have to go by is the consensus
+// from a discussion on LKML. Actual implementations, if and when they
+// happen, might very well differ.
+// If this code is ever going to be used with such a kernel, you should
+// disable the "#error" and carefully test the code (e.g. run the unit
+// tests). If things don't work, search for all occurrences of __BYTE_ORDER
+// and verify that the proposed implementation agrees with what the kernel
+// actually does.
+#error Big endian operation is untested and expected to be broken
+#endif
+
#include "sandbox/linux/seccomp-bpf/codegen.h"
#include "sandbox/linux/seccomp-bpf/sandbox_bpf.h"
+#include "sandbox/linux/seccomp-bpf/syscall.h"
#include "sandbox/linux/seccomp-bpf/syscall_iterator.h"
#include "sandbox/linux/seccomp-bpf/verifier.h"
@@ -18,6 +37,30 @@ void WriteFailedStderrSetupMessage(int out_fd) {
}
}
+// We need to tell whether we are performing a "normal" callback, or
+// whether we were called recursively from within a UnsafeTrap() callback.
+// This is a little tricky to do, because we need to somehow get access to
+// per-thread data from within a signal context. Normal TLS storage is not
+// safely accessible at this time. We could roll our own, but that involves
+// a lot of complexity. Instead, we co-opt one bit in the signal mask.
+// If SIGBUS is blocked, we assume that we have been called recursively.
+// There is a possibility for collision with other code that needs to do
+// this, but in practice the risks are low.
+// If SIGBUS turns out to be a problem, we could instead co-opt one of the
+// realtime signals. There are plenty of them. Unfortunately, there is no
+// way to mark a signal as allocated. So, the potential for collision is
+// possibly even worse.
+bool GetIsInSigHandler(const ucontext_t *ctx) {
+ return sigismember(&ctx->uc_sigmask, SIGBUS);
+}
+
+void SetIsInSigHandler() {
+ sigset_t mask;
+ sigemptyset(&mask);
+ sigaddset(&mask, SIGBUS);
+ sigprocmask(SIG_BLOCK, &mask, NULL);
+}
+
} // namespace
// The kernel gives us a sandbox, we turn it into a playground :-)
@@ -319,6 +362,48 @@ void Sandbox::policySanityChecks(EvaluateSyscall syscallEvaluator,
return;
}
+void Sandbox::CheckForUnsafeErrorCodes(Instruction *insn, void *aux) {
+ if (BPF_CLASS(insn->code) == BPF_RET &&
+ insn->k > SECCOMP_RET_TRAP &&
+ insn->k - SECCOMP_RET_TRAP <= trapArraySize_) {
+ const ErrorCode& err = trapArray_[insn->k - SECCOMP_RET_TRAP - 1];
+ if (!err.safe_) {
+ bool *is_unsafe = static_cast<bool *>(aux);
+ *is_unsafe = true;
+ }
+ }
+}
+
+void Sandbox::RedirectToUserspace(Instruction *insn, void *aux) {
+ // When inside an UnsafeTrap() callback, we want to allow all system calls.
+ // This means, we must conditionally disable the sandbox -- and that's not
+ // something that kernel-side BPF filters can do, as they cannot inspect
+ // any state other than the syscall arguments.
+ // But if we redirect all error handlers to user-space, then we can easily
+ // make this decision.
+ // The performance penalty for this extra round-trip to user-space is not
+ // actually that bad, as we only ever pay it for denied system calls; and a
+ // typical program has very few of these.
+ if (BPF_CLASS(insn->code) == BPF_RET &&
+ (insn->k & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO) {
+ insn->k = Trap(ReturnErrno,
+ reinterpret_cast<void *>(insn->k & SECCOMP_RET_DATA)).err();
+ }
+}
+
+ErrorCode Sandbox::RedirectToUserspaceEvalWrapper(int sysnum, void *aux) {
+ // We need to replicate the behavior of RedirectToUserspace(), so that our
+ // Verifier can still work correctly.
+ Evaluators *evaluators = reinterpret_cast<Evaluators *>(aux);
+ const std::pair<EvaluateSyscall, void *>& evaluator = *evaluators->begin();
+ ErrorCode err = evaluator.first(sysnum, evaluator.second);
+ if ((err.err() & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO) {
+ return Trap(ReturnErrno,
+ reinterpret_cast<void *>(err.err() & SECCOMP_RET_DATA));
+ }
+ return err;
+}
+
void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator, void *aux) {
if (status_ == STATUS_ENABLED) {
SANDBOX_DIE("Cannot change policy after sandbox has started");
@@ -337,8 +422,8 @@ void Sandbox::installFilter(bool quiet) {
// Set new SIGSYS handler
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
- sa.sa_sigaction = &sigSys;
- sa.sa_flags = SA_SIGINFO;
+ sa.sa_sigaction = sigSys;
+ sa.sa_flags = SA_SIGINFO | SA_NODEFER;
if (sigaction(SIGSYS, &sa, NULL) < 0) {
goto filter_failed;
}
@@ -369,33 +454,13 @@ void Sandbox::installFilter(bool quiet) {
Instruction *head =
gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
offsetof(struct arch_seccomp_data, arch),
- gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH,
tail =
- // Grab the system call number, so that we can implement jump tables.
- gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
- offsetof(struct arch_seccomp_data, nr)),
+ gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH,
+ NULL,
gen->MakeInstruction(BPF_RET+BPF_K,
Kill(
"Invalid audit architecture in BPF filter").err_)));
- // On Intel architectures, verify that system call numbers are in the
- // expected number range. The older i386 and x86-64 APIs clear bit 30
- // on all system calls. The newer x32 API always sets bit 30.
-#if defined(__i386__) || defined(__x86_64__)
- Instruction *invalidX32 =
- gen->MakeInstruction(BPF_RET+BPF_K,
- Kill("Illegal mixing of system call ABIs").err_);
- Instruction *checkX32 =
-#if defined(__x86_64__) && defined(__ILP32__)
- gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, invalidX32);
-#else
- gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, invalidX32, 0);
-#endif
- gen->JoinInstructions(tail, checkX32);
- tail = checkX32;
-#endif
-
-
{
// Evaluate all possible system calls and group their ErrorCodes into
// ranges of identical codes.
@@ -406,6 +471,109 @@ void Sandbox::installFilter(bool quiet) {
Instruction *jumptable =
assembleJumpTable(gen, ranges.begin(), ranges.end());
+ // If there is at least one UnsafeTrap() in our program, the entire sandbox
+ // is unsafe. We need to modify the program so that all non-
+ // SECCOMP_RET_ALLOW ErrorCodes are handled in user-space. This will then
+ // allow us to temporarily disable sandboxing rules inside of callbacks to
+ // UnsafeTrap().
+ has_unsafe_traps_ = false;
+ gen->Traverse(jumptable, CheckForUnsafeErrorCodes, &has_unsafe_traps_);
+
+ // Grab the system call number, so that we can implement jump tables.
+ Instruction *load_nr =
+ gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
+ offsetof(struct arch_seccomp_data, nr));
+
+ // If our BPF program has unsafe jumps, enable support for them. This
+ // test happens very early in the BPF filter program. Even before we
+ // consider looking at system call numbers.
+ // As support for unsafe jumps essentially defeats all the security
+ // measures that the sandbox provides, we print a big warning message --
+ // and of course, we make sure to only ever enable this feature if it
+ // is actually requested by the sandbox policy.
+ if (has_unsafe_traps_) {
+ if (SandboxSyscall(-1) == -1 && errno == ENOSYS) {
+ SANDBOX_DIE("Support for UnsafeTrap() has not yet been ported to this "
+ "architecture");
+ }
+
+ EvaluateSyscall evaluateSyscall = evaluators_.begin()->first;
+ void *aux = evaluators_.begin()->second;
+ if (!evaluateSyscall(__NR_rt_sigprocmask, aux).
+ Equals(ErrorCode(ErrorCode::ERR_ALLOWED)) ||
+ !evaluateSyscall(__NR_rt_sigreturn, aux).
+ Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
+#if defined(__NR_sigprocmask)
+ || !evaluateSyscall(__NR_sigprocmask, aux).
+ Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
+#endif
+#if defined(__NR_sigreturn)
+ || !evaluateSyscall(__NR_sigreturn, aux).
+ Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
+#endif
+ ) {
+ SANDBOX_DIE("Invalid seccomp policy; if using UnsafeTrap(), you must "
+ "unconditionally allow sigreturn() and sigprocmask()");
+ }
+
+ SANDBOX_INFO("WARNING! Disabling sandbox for debugging purposes");
+ gen->Traverse(jumptable, RedirectToUserspace, NULL);
+
+ // Allow system calls, if they originate from our magic return address
+ // (which we can query by calling SandboxSyscall(-1)).
+ uintptr_t syscall_entry_point =
+ static_cast<uintptr_t>(SandboxSyscall(-1));
+ uint32_t low = static_cast<uint32_t>(syscall_entry_point);
+#if __SIZEOF_POINTER__ > 4
+ uint32_t hi = static_cast<uint32_t>(syscall_entry_point >> 32);
+#endif
+
+ // BPF cannot do native 64bit comparisons. On 64bit architectures, we
+ // have to compare both 32bit halves of the instruction pointer. If they
+ // match what we expect, we return ERR_ALLOWED. If either or both don't
+ // match, we continue evaluating the rest of the sandbox policy.
+ Instruction *escape_hatch =
+ gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
+ offsetof(struct arch_seccomp_data,
+ instruction_pointer) +
+ (__SIZEOF_POINTER__ > 4 &&
+ __BYTE_ORDER == __BIG_ENDIAN ? 4 : 0),
+ gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, low,
+#if __SIZEOF_POINTER__ > 4
+ gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
+ offsetof(struct arch_seccomp_data,
+ instruction_pointer) +
+ (__BYTE_ORDER == __BIG_ENDIAN ? 0 : 4),
+ gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, hi,
+#endif
+ gen->MakeInstruction(BPF_RET+BPF_K, ErrorCode(ErrorCode::ERR_ALLOWED)),
+#if __SIZEOF_POINTER__ > 4
+ load_nr)),
+#endif
+ load_nr));
+ gen->JoinInstructions(tail, escape_hatch);
+ } else {
+ gen->JoinInstructions(tail, load_nr);
+ }
+ tail = load_nr;
+
+ // On Intel architectures, verify that system call numbers are in the
+ // expected number range. The older i386 and x86-64 APIs clear bit 30
+ // on all system calls. The newer x32 API always sets bit 30.
+#if defined(__i386__) || defined(__x86_64__)
+ Instruction *invalidX32 =
+ gen->MakeInstruction(BPF_RET+BPF_K,
+ Kill("Illegal mixing of system call ABIs").err_);
+ Instruction *checkX32 =
+#if defined(__x86_64__) && defined(__ILP32__)
+ gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, invalidX32);
+#else
+ gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, invalidX32, 0);
+#endif
+ gen->JoinInstructions(tail, checkX32);
+ tail = checkX32;
+#endif
+
// Append jump table to our pre-amble
gen->JoinInstructions(tail, jumptable);
}
@@ -419,9 +587,22 @@ void Sandbox::installFilter(bool quiet) {
// correctly. Otherwise, there is an internal error in our BPF compiler.
// There is really nothing the caller can do until the bug is fixed.
#ifndef NDEBUG
- const char *err = NULL;
- if (!Verifier::VerifyBPF(*program, evaluators_, &err)) {
- SANDBOX_DIE(err);
+ {
+ // If we previously rewrote the BPF program so that it calls user-space
+ // whenever we return an "errno" value from the filter, then we have to
+ // wrap our system call evaluator to perform the same operation. Otherwise,
+ // the verifier would also report a mismatch in return codes.
+ Evaluators redirected_evaluators;
+ redirected_evaluators.push_back(
+ std::make_pair(RedirectToUserspaceEvalWrapper, &evaluators_));
+
+ const char *err = NULL;
+ if (!Verifier::VerifyBPF(
+ *program,
+ has_unsafe_traps_ ? redirected_evaluators : evaluators_,
+ &err)) {
+ SANDBOX_DIE(err);
+ }
}
#endif
@@ -444,7 +625,6 @@ void Sandbox::installFilter(bool quiet) {
// Release memory that is no longer needed
evaluators_.clear();
- errMap_.clear();
#if defined(SECCOMP_BPF_VALGRIND_HACKS)
// Valgrind is really not happy about our sandbox. Disable it when running
@@ -561,27 +741,43 @@ void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) {
goto sigsys_err;
}
- // Copy the seccomp-specific data into a arch_seccomp_data structure. This
- // is what we are showing to TrapFnc callbacks that the system call evaluator
- // registered with the sandbox.
- struct arch_seccomp_data data = {
- sigsys.nr,
- SECCOMP_ARCH,
- reinterpret_cast<uint64_t>(sigsys.ip),
- {
- static_cast<uint64_t>(SECCOMP_PARM1(ctx)),
- static_cast<uint64_t>(SECCOMP_PARM2(ctx)),
- static_cast<uint64_t>(SECCOMP_PARM3(ctx)),
- static_cast<uint64_t>(SECCOMP_PARM4(ctx)),
- static_cast<uint64_t>(SECCOMP_PARM5(ctx)),
- static_cast<uint64_t>(SECCOMP_PARM6(ctx))
+ intptr_t rc;
+ if (has_unsafe_traps_ && GetIsInSigHandler(ctx)) {
+ errno = old_errno;
+ if (sigsys.nr == __NR_clone) {
+ SANDBOX_DIE("Cannot call clone() from an UnsafeTrap() handler");
+ }
+ rc = SandboxSyscall(sigsys.nr,
+ SECCOMP_PARM1(ctx), SECCOMP_PARM2(ctx),
+ SECCOMP_PARM3(ctx), SECCOMP_PARM4(ctx),
+ SECCOMP_PARM5(ctx), SECCOMP_PARM6(ctx));
+ } else {
+ const ErrorCode& err = trapArray_[info->si_errno - 1];
+ if (!err.safe_) {
+ SetIsInSigHandler();
}
- };
- // Now call the TrapFnc callback associated with this particular instance
- // of SECCOMP_RET_TRAP.
- const ErrorCode& err = trapArray_[info->si_errno - 1];
- intptr_t rc = err.fnc_(data, err.aux_);
+ // Copy the seccomp-specific data into an arch_seccomp_data structure. This
+ // is what we are showing to TrapFnc callbacks that the system call
+ // evaluator registered with the sandbox.
+ struct arch_seccomp_data data = {
+ sigsys.nr,
+ SECCOMP_ARCH,
+ reinterpret_cast<uint64_t>(sigsys.ip),
+ {
+ static_cast<uint64_t>(SECCOMP_PARM1(ctx)),
+ static_cast<uint64_t>(SECCOMP_PARM2(ctx)),
+ static_cast<uint64_t>(SECCOMP_PARM3(ctx)),
+ static_cast<uint64_t>(SECCOMP_PARM4(ctx)),
+ static_cast<uint64_t>(SECCOMP_PARM5(ctx)),
+ static_cast<uint64_t>(SECCOMP_PARM6(ctx))
+ }
+ };
+
+ // Now call the TrapFnc callback associated with this particular instance
+ // of SECCOMP_RET_TRAP.
+ rc = err.fnc_(data, err.aux_);
+ }
// Update the CPU register that stores the return code of the system call
// that we just handled, and restore "errno" to the value that it had
@@ -592,10 +788,21 @@ void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) {
return;
}
-ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) {
+bool Sandbox::TrapKey::operator<(const Sandbox::TrapKey& o) const {
+ if (fnc != o.fnc) {
+ return fnc < o.fnc;
+ } else if (aux != o.aux) {
+ return aux < o.aux;
+ } else {
+ return safe < o.safe;
+ }
+}
+
+ErrorCode Sandbox::MakeTrap(ErrorCode::TrapFnc fnc, const void *aux,
+ bool safe) {
// Each unique pair of TrapFnc and auxiliary data make up a distinct instance
// of a SECCOMP_RET_TRAP.
- std::pair<ErrorCode::TrapFnc, const void *> key(fnc, aux);
+ TrapKey key(fnc, aux, safe);
TrapIds::const_iterator iter = trapIds_.find(key);
uint16_t id;
if (iter != trapIds_.end()) {
@@ -618,7 +825,7 @@ ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) {
}
id = traps_->size() + 1;
- traps_->push_back(ErrorCode(fnc, aux, id));
+ traps_->push_back(ErrorCode(fnc, aux, safe, id));
trapIds_[key] = id;
// We want to access the traps_ vector from our signal handler. But
@@ -629,10 +836,37 @@ ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) {
// signal handler, where we can safely do so.
trapArray_ = &(*traps_)[0];
trapArraySize_ = id;
+ return traps_->back();
}
- ErrorCode err = ErrorCode(fnc, aux, id);
- return errMap_[err.err()] = err;
+ return ErrorCode(fnc, aux, safe, id);
+}
+
+ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) {
+ return MakeTrap(fnc, aux, true /* Safe Trap */);
+}
+
+ErrorCode Sandbox::UnsafeTrap(ErrorCode::TrapFnc fnc, const void *aux) {
+ return MakeTrap(fnc, aux, false /* Unsafe Trap */);
+}
+
+intptr_t Sandbox::ForwardSyscall(const struct arch_seccomp_data& args) {
+ return SandboxSyscall(args.nr,
+ static_cast<intptr_t>(args.args[0]),
jln (very slow on Chromium) 2012/11/21 23:04:46 Shouldn't you cast to the same type that is used i
Markus (顧孟勤) 2012/11/22 00:29:23 intptr_t should be the same width as ptr_t or as v
jln (very slow on Chromium) 2012/11/22 01:12:39 In practice yeah, in theory int could be larger th
+ static_cast<intptr_t>(args.args[1]),
+ static_cast<intptr_t>(args.args[2]),
+ static_cast<intptr_t>(args.args[3]),
+ static_cast<intptr_t>(args.args[4]),
+ static_cast<intptr_t>(args.args[5]));
+}
+
+intptr_t Sandbox::ReturnErrno(const struct arch_seccomp_data&, void *aux) {
+ // TrapFnc functions report error by following the native kernel convention
+ // of returning an exit code in the range of -1..-4096. They do not try to
+ // set errno themselves. The glibc wrapper that triggered the SIGSYS will
+ // ultimately do so for us.
+ int err = reinterpret_cast<intptr_t>(aux) & SECCOMP_RET_DATA;
+ return -err;
}
intptr_t Sandbox::bpfFailure(const struct arch_seccomp_data&, void *aux) {
@@ -646,10 +880,10 @@ ErrorCode Sandbox::Kill(const char *msg) {
Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN;
int Sandbox::proc_fd_ = -1;
Sandbox::Evaluators Sandbox::evaluators_;
-Sandbox::ErrMap Sandbox::errMap_;
Sandbox::Traps *Sandbox::traps_ = NULL;
Sandbox::TrapIds Sandbox::trapIds_;
ErrorCode *Sandbox::trapArray_ = NULL;
size_t Sandbox::trapArraySize_ = 0;
+ bool Sandbox::has_unsafe_traps_ = false;
} // namespace

Powered by Google App Engine
This is Rietveld 408576698