Chromium Code Reviews| Index: sandbox/linux/seccomp-bpf/sandbox_bpf.cc |
| diff --git a/sandbox/linux/seccomp-bpf/sandbox_bpf.cc b/sandbox/linux/seccomp-bpf/sandbox_bpf.cc |
| index eb03995904f362cf589289bc7139577a472958e0..ff855b8a9cb131119963fd0829544b5f00e41b2b 100644 |
| --- a/sandbox/linux/seccomp-bpf/sandbox_bpf.cc |
| +++ b/sandbox/linux/seccomp-bpf/sandbox_bpf.cc |
| @@ -2,8 +2,27 @@ |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| +#include <endian.h> |
| +#if __BYTE_ORDER == __BIG_ENDIAN |
| +// The BPF "struct seccomp_data" layout has to deal with storing 64bit |
| +// values that need to be inspected by a virtual machine that only ever |
| +// operates on 32bit values. The kernel developers decided how values |
| +// should be split into two 32bit words to achieve this goal. But at this |
| +// time, there is no existing BPF implementation in the kernel that uses |
| +// 64bit big endian values. So, all we have to go by is the consensus |
| +// from a discussion on LKML. Actual implementations, if and when they |
| +// happen, might very well differ. |
| +// If this code is ever going to be used with such a kernel, you should |
| +// disable the "#error" and carefully test the code (e.g. run the unit |
| +// tests). If things don't work, search for all occurrences of __BYTE_ORDER |
| +// and verify that the proposed implementation agrees with what the kernel |
| +// actually does. |
| +#error Big endian operation is untested and expected to be broken |
| +#endif |
| + |
| #include "sandbox/linux/seccomp-bpf/codegen.h" |
| #include "sandbox/linux/seccomp-bpf/sandbox_bpf.h" |
| +#include "sandbox/linux/seccomp-bpf/syscall.h" |
| #include "sandbox/linux/seccomp-bpf/syscall_iterator.h" |
| #include "sandbox/linux/seccomp-bpf/verifier.h" |
| @@ -18,6 +37,30 @@ void WriteFailedStderrSetupMessage(int out_fd) { |
| } |
| } |
| +// We need to tell whether we are performing a "normal" callback, or |
| +// whether we were called recursively from within an UnsafeTrap() callback. |
| +// This is a little tricky to do, because we need to somehow get access to |
| +// per-thread data from within a signal context. Normal TLS storage is not |
| +// safely accessible at this time. We could roll our own, but that involves |
| +// a lot of complexity. Instead, we co-opt one bit in the signal mask. |
| +// If SIGBUS is blocked, we assume that we have been called recursively. |
| +// There is a possibility for collision with other code that needs to do |
| +// this, but in practice the risks are low. |
| +// If SIGBUS turns out to be a problem, we could instead co-opt one of the |
| +// realtime signals. There are plenty of them. Unfortunately, there is no |
| +// way to mark a signal as allocated. So, the potential for collision is |
| +// possibly even worse. |
| +bool GetIsInSigHandler(const ucontext_t *ctx) { |
| + return sigismember(&ctx->uc_sigmask, SIGBUS); |
| +} |
| + |
| +void SetIsInSigHandler() { |
| + sigset_t mask; |
| + sigemptyset(&mask); |
| + sigaddset(&mask, SIGBUS); |
| + sigprocmask(SIG_BLOCK, &mask, NULL); |
| +} |
| + |
| } // namespace |
| // The kernel gives us a sandbox, we turn it into a playground :-) |
| @@ -319,6 +362,48 @@ void Sandbox::policySanityChecks(EvaluateSyscall syscallEvaluator, |
| return; |
| } |
| +void Sandbox::CheckForUnsafeErrorCodes(Instruction *insn, void *aux) { |
| + if (BPF_CLASS(insn->code) == BPF_RET && |
| + insn->k > SECCOMP_RET_TRAP && |
| + insn->k - SECCOMP_RET_TRAP <= trapArraySize_) { |
| + const ErrorCode& err = trapArray_[insn->k - SECCOMP_RET_TRAP - 1]; |
| + if (!err.safe_) { |
| + bool *is_unsafe = static_cast<bool *>(aux); |
| + *is_unsafe = true; |
| + } |
| + } |
| +} |
| + |
| +void Sandbox::RedirectToUserspace(Instruction *insn, void *aux) { |
| + // When inside an UnsafeTrap() callback, we want to allow all system calls. |
| + // This means, we must conditionally disable the sandbox -- and that's not |
| + // something that kernel-side BPF filters can do, as they cannot inspect |
| + // any state other than the syscall arguments. |
| + // But if we redirect all error handlers to user-space, then we can easily |
| + // make this decision. |
| + // The performance penalty for this extra round-trip to user-space is not |
| + // actually that bad, as we only ever pay it for denied system calls; and a |
| + // typical program has very few of these. |
| + if (BPF_CLASS(insn->code) == BPF_RET && |
| + (insn->k & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO) { |
| + insn->k = Trap(ReturnErrno, |
| + reinterpret_cast<void *>(insn->k & SECCOMP_RET_DATA)).err(); |
| + } |
| +} |
| + |
| +ErrorCode Sandbox::RedirectToUserspaceEvalWrapper(int sysnum, void *aux) { |
| + // We need to replicate the behavior of RedirectToUserspace(), so that our |
| + // Verifier can still work correctly. |
| + Evaluators *evaluators = reinterpret_cast<Evaluators *>(aux); |
| + const std::pair<EvaluateSyscall, void *>& evaluator = *evaluators->begin(); |
| + ErrorCode err = evaluator.first(sysnum, evaluator.second); |
| + if ((err.err() & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO) { |
| + return Trap(ReturnErrno, |
| + reinterpret_cast<void *>(err.err() & SECCOMP_RET_DATA)); |
| + } |
| + return err; |
| +} |
| + |
| void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator, void *aux) { |
| if (status_ == STATUS_ENABLED) { |
| SANDBOX_DIE("Cannot change policy after sandbox has started"); |
| @@ -337,8 +422,8 @@ void Sandbox::installFilter(bool quiet) { |
| // Set new SIGSYS handler |
| struct sigaction sa; |
| memset(&sa, 0, sizeof(sa)); |
| - sa.sa_sigaction = &sigSys; |
| - sa.sa_flags = SA_SIGINFO; |
| + sa.sa_sigaction = sigSys; |
| + sa.sa_flags = SA_SIGINFO | SA_NODEFER; |
| if (sigaction(SIGSYS, &sa, NULL) < 0) { |
| goto filter_failed; |
| } |
| @@ -369,33 +454,13 @@ void Sandbox::installFilter(bool quiet) { |
| Instruction *head = |
| gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, |
| offsetof(struct arch_seccomp_data, arch), |
| - gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, |
| tail = |
| - // Grab the system call number, so that we can implement jump tables. |
| - gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, |
| - offsetof(struct arch_seccomp_data, nr)), |
| + gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, |
| + NULL, |
| gen->MakeInstruction(BPF_RET+BPF_K, |
| Kill( |
| "Invalid audit architecture in BPF filter").err_))); |
| - // On Intel architectures, verify that system call numbers are in the |
| - // expected number range. The older i386 and x86-64 APIs clear bit 30 |
| - // on all system calls. The newer x32 API always sets bit 30. |
| -#if defined(__i386__) || defined(__x86_64__) |
| - Instruction *invalidX32 = |
| - gen->MakeInstruction(BPF_RET+BPF_K, |
| - Kill("Illegal mixing of system call ABIs").err_); |
| - Instruction *checkX32 = |
| -#if defined(__x86_64__) && defined(__ILP32__) |
| - gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, invalidX32); |
| -#else |
| - gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, invalidX32, 0); |
| -#endif |
| - gen->JoinInstructions(tail, checkX32); |
| - tail = checkX32; |
| -#endif |
| - |
| - |
| { |
| // Evaluate all possible system calls and group their ErrorCodes into |
| // ranges of identical codes. |
| @@ -406,6 +471,109 @@ void Sandbox::installFilter(bool quiet) { |
| Instruction *jumptable = |
| assembleJumpTable(gen, ranges.begin(), ranges.end()); |
| + // If there is at least one UnsafeTrap() in our program, the entire sandbox |
| + // is unsafe. We need to modify the program so that all non- |
| + // SECCOMP_RET_ALLOW ErrorCodes are handled in user-space. This will then |
| + // allow us to temporarily disable sandboxing rules inside of callbacks to |
| + // UnsafeTrap(). |
| + has_unsafe_traps_ = false; |
| + gen->Traverse(jumptable, CheckForUnsafeErrorCodes, &has_unsafe_traps_); |
| + |
| + // Grab the system call number, so that we can implement jump tables. |
| + Instruction *load_nr = |
| + gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, |
| + offsetof(struct arch_seccomp_data, nr)); |
| + |
| + // If our BPF program has unsafe jumps, enable support for them. This |
| + // test happens very early in the BPF filter program. Even before we |
| + // consider looking at system call numbers. |
| + // As support for unsafe jumps essentially defeats all the security |
| + // measures that the sandbox provides, we print a big warning message -- |
| + // and of course, we make sure to only ever enable this feature if it |
| + // is actually requested by the sandbox policy. |
| + if (has_unsafe_traps_) { |
| + if (SandboxSyscall(-1) == -1 && errno == ENOSYS) { |
| + SANDBOX_DIE("Support for UnsafeTrap() has not yet been ported to this " |
| + "architecture"); |
| + } |
| + |
| + EvaluateSyscall evaluateSyscall = evaluators_.begin()->first; |
| + void *aux = evaluators_.begin()->second; |
| + if (!evaluateSyscall(__NR_rt_sigprocmask, aux). |
| + Equals(ErrorCode(ErrorCode::ERR_ALLOWED)) || |
| + !evaluateSyscall(__NR_rt_sigreturn, aux). |
| + Equals(ErrorCode(ErrorCode::ERR_ALLOWED)) |
| +#if defined(__NR_sigprocmask) |
| + || !evaluateSyscall(__NR_sigprocmask, aux). |
| + Equals(ErrorCode(ErrorCode::ERR_ALLOWED)) |
| +#endif |
| +#if defined(__NR_sigreturn) |
| + || !evaluateSyscall(__NR_sigreturn, aux). |
| + Equals(ErrorCode(ErrorCode::ERR_ALLOWED)) |
| +#endif |
| + ) { |
| + SANDBOX_DIE("Invalid seccomp policy; if using UnsafeTrap(), you must " |
| + "unconditionally allow sigreturn() and sigprocmask()"); |
| + } |
| + |
| + SANDBOX_INFO("WARNING! Disabling sandbox for debugging purposes"); |
| + gen->Traverse(jumptable, RedirectToUserspace, NULL); |
| + |
| + // Allow system calls, if they originate from our magic return address |
| + // (which we can query by calling SandboxSyscall(-1)). |
| + uintptr_t syscall_entry_point = |
| + static_cast<uintptr_t>(SandboxSyscall(-1)); |
| + uint32_t low = static_cast<uint32_t>(syscall_entry_point); |
| +#if __SIZEOF_POINTER__ > 4 |
| + uint32_t hi = static_cast<uint32_t>(syscall_entry_point >> 32); |
| +#endif |
| + |
| + // BPF cannot do native 64bit comparisons. On 64bit architectures, we |
| +      // have to compare both 32bit halves of the instruction pointer. If they |
| + // match what we expect, we return ERR_ALLOWED. If either or both don't |
| +      // match, we continue evaluating the rest of the sandbox policy. |
| + Instruction *escape_hatch = |
| + gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, |
| + offsetof(struct arch_seccomp_data, |
| + instruction_pointer) + |
| + (__SIZEOF_POINTER__ > 4 && |
| + __BYTE_ORDER == __BIG_ENDIAN ? 4 : 0), |
| + gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, low, |
| +#if __SIZEOF_POINTER__ > 4 |
| + gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, |
| + offsetof(struct arch_seccomp_data, |
| + instruction_pointer) + |
| + (__BYTE_ORDER == __BIG_ENDIAN ? 0 : 4), |
| + gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, hi, |
| +#endif |
| + gen->MakeInstruction(BPF_RET+BPF_K, ErrorCode(ErrorCode::ERR_ALLOWED)), |
| +#if __SIZEOF_POINTER__ > 4 |
| + load_nr)), |
| +#endif |
| + load_nr)); |
| + gen->JoinInstructions(tail, escape_hatch); |
| + } else { |
| + gen->JoinInstructions(tail, load_nr); |
| + } |
| + tail = load_nr; |
| + |
| + // On Intel architectures, verify that system call numbers are in the |
| + // expected number range. The older i386 and x86-64 APIs clear bit 30 |
| + // on all system calls. The newer x32 API always sets bit 30. |
| +#if defined(__i386__) || defined(__x86_64__) |
| + Instruction *invalidX32 = |
| + gen->MakeInstruction(BPF_RET+BPF_K, |
| + Kill("Illegal mixing of system call ABIs").err_); |
| + Instruction *checkX32 = |
| +#if defined(__x86_64__) && defined(__ILP32__) |
| + gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, invalidX32); |
| +#else |
| + gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, invalidX32, 0); |
| +#endif |
| + gen->JoinInstructions(tail, checkX32); |
| + tail = checkX32; |
| +#endif |
| + |
| // Append jump table to our pre-amble |
| gen->JoinInstructions(tail, jumptable); |
| } |
| @@ -419,9 +587,22 @@ void Sandbox::installFilter(bool quiet) { |
| // correctly. Otherwise, there is an internal error in our BPF compiler. |
| // There is really nothing the caller can do until the bug is fixed. |
| #ifndef NDEBUG |
| - const char *err = NULL; |
| - if (!Verifier::VerifyBPF(*program, evaluators_, &err)) { |
| - SANDBOX_DIE(err); |
| + { |
| + // If we previously rewrote the BPF program so that it calls user-space |
| + // whenever we return an "errno" value from the filter, then we have to |
| + // wrap our system call evaluator to perform the same operation. Otherwise, |
| + // the verifier would also report a mismatch in return codes. |
| + Evaluators redirected_evaluators; |
| + redirected_evaluators.push_back( |
| + std::make_pair(RedirectToUserspaceEvalWrapper, &evaluators_)); |
| + |
| + const char *err = NULL; |
| + if (!Verifier::VerifyBPF( |
| + *program, |
| + has_unsafe_traps_ ? redirected_evaluators : evaluators_, |
| + &err)) { |
| + SANDBOX_DIE(err); |
| + } |
| } |
| #endif |
| @@ -444,7 +625,6 @@ void Sandbox::installFilter(bool quiet) { |
| // Release memory that is no longer needed |
| evaluators_.clear(); |
| - errMap_.clear(); |
| #if defined(SECCOMP_BPF_VALGRIND_HACKS) |
| // Valgrind is really not happy about our sandbox. Disable it when running |
| @@ -561,27 +741,43 @@ void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) { |
| goto sigsys_err; |
| } |
| - // Copy the seccomp-specific data into a arch_seccomp_data structure. This |
| - // is what we are showing to TrapFnc callbacks that the system call evaluator |
| - // registered with the sandbox. |
| - struct arch_seccomp_data data = { |
| - sigsys.nr, |
| - SECCOMP_ARCH, |
| - reinterpret_cast<uint64_t>(sigsys.ip), |
| - { |
| - static_cast<uint64_t>(SECCOMP_PARM1(ctx)), |
| - static_cast<uint64_t>(SECCOMP_PARM2(ctx)), |
| - static_cast<uint64_t>(SECCOMP_PARM3(ctx)), |
| - static_cast<uint64_t>(SECCOMP_PARM4(ctx)), |
| - static_cast<uint64_t>(SECCOMP_PARM5(ctx)), |
| - static_cast<uint64_t>(SECCOMP_PARM6(ctx)) |
| + intptr_t rc; |
| + if (has_unsafe_traps_ && GetIsInSigHandler(ctx)) { |
| + errno = old_errno; |
| + if (sigsys.nr == __NR_clone) { |
| + SANDBOX_DIE("Cannot call clone() from an UnsafeTrap() handler"); |
| + } |
| + rc = SandboxSyscall(sigsys.nr, |
| + SECCOMP_PARM1(ctx), SECCOMP_PARM2(ctx), |
| + SECCOMP_PARM3(ctx), SECCOMP_PARM4(ctx), |
| + SECCOMP_PARM5(ctx), SECCOMP_PARM6(ctx)); |
| + } else { |
| + const ErrorCode& err = trapArray_[info->si_errno - 1]; |
| + if (!err.safe_) { |
| + SetIsInSigHandler(); |
| } |
| - }; |
| - // Now call the TrapFnc callback associated with this particular instance |
| - // of SECCOMP_RET_TRAP. |
| - const ErrorCode& err = trapArray_[info->si_errno - 1]; |
| - intptr_t rc = err.fnc_(data, err.aux_); |
| +    // Copy the seccomp-specific data into an arch_seccomp_data structure. This |
| + // is what we are showing to TrapFnc callbacks that the system call |
| + // evaluator registered with the sandbox. |
| + struct arch_seccomp_data data = { |
| + sigsys.nr, |
| + SECCOMP_ARCH, |
| + reinterpret_cast<uint64_t>(sigsys.ip), |
| + { |
| + static_cast<uint64_t>(SECCOMP_PARM1(ctx)), |
| + static_cast<uint64_t>(SECCOMP_PARM2(ctx)), |
| + static_cast<uint64_t>(SECCOMP_PARM3(ctx)), |
| + static_cast<uint64_t>(SECCOMP_PARM4(ctx)), |
| + static_cast<uint64_t>(SECCOMP_PARM5(ctx)), |
| + static_cast<uint64_t>(SECCOMP_PARM6(ctx)) |
| + } |
| + }; |
| + |
| + // Now call the TrapFnc callback associated with this particular instance |
| + // of SECCOMP_RET_TRAP. |
| + rc = err.fnc_(data, err.aux_); |
| + } |
| // Update the CPU register that stores the return code of the system call |
| // that we just handled, and restore "errno" to the value that it had |
| @@ -592,10 +788,21 @@ void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) { |
| return; |
| } |
| -ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) { |
| +bool Sandbox::TrapKey::operator<(const Sandbox::TrapKey& o) const { |
| + if (fnc != o.fnc) { |
| + return fnc < o.fnc; |
| + } else if (aux != o.aux) { |
| + return aux < o.aux; |
| + } else { |
| + return safe < o.safe; |
| + } |
| +} |
| + |
| +ErrorCode Sandbox::MakeTrap(ErrorCode::TrapFnc fnc, const void *aux, |
| + bool safe) { |
| // Each unique pair of TrapFnc and auxiliary data make up a distinct instance |
| // of a SECCOMP_RET_TRAP. |
| - std::pair<ErrorCode::TrapFnc, const void *> key(fnc, aux); |
| + TrapKey key(fnc, aux, safe); |
| TrapIds::const_iterator iter = trapIds_.find(key); |
| uint16_t id; |
| if (iter != trapIds_.end()) { |
| @@ -618,7 +825,7 @@ ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) { |
| } |
| id = traps_->size() + 1; |
| - traps_->push_back(ErrorCode(fnc, aux, id)); |
| + traps_->push_back(ErrorCode(fnc, aux, safe, id)); |
| trapIds_[key] = id; |
| // We want to access the traps_ vector from our signal handler. But |
| @@ -629,10 +836,37 @@ ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) { |
| // signal handler, where we can safely do so. |
| trapArray_ = &(*traps_)[0]; |
| trapArraySize_ = id; |
| + return traps_->back(); |
| } |
| - ErrorCode err = ErrorCode(fnc, aux, id); |
| - return errMap_[err.err()] = err; |
| + return ErrorCode(fnc, aux, safe, id); |
| +} |
| + |
| +ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) { |
| + return MakeTrap(fnc, aux, true /* Safe Trap */); |
| +} |
| + |
| +ErrorCode Sandbox::UnsafeTrap(ErrorCode::TrapFnc fnc, const void *aux) { |
| + return MakeTrap(fnc, aux, false /* Unsafe Trap */); |
| +} |
| + |
| +intptr_t Sandbox::ForwardSyscall(const struct arch_seccomp_data& args) { |
| + return SandboxSyscall(args.nr, |
| + static_cast<intptr_t>(args.args[0]), |
|
jln (very slow on Chromium)
2012/11/21 23:04:46
Shouldn't you cast to the same type that is used i
Markus (顧孟勤)
2012/11/22 00:29:23
intptr_t should be the same width as ptr_t or as v
jln (very slow on Chromium)
2012/11/22 01:12:39
In practice yeah, in theory int could be larger th
|
| + static_cast<intptr_t>(args.args[1]), |
| + static_cast<intptr_t>(args.args[2]), |
| + static_cast<intptr_t>(args.args[3]), |
| + static_cast<intptr_t>(args.args[4]), |
| + static_cast<intptr_t>(args.args[5])); |
| +} |
| + |
| +intptr_t Sandbox::ReturnErrno(const struct arch_seccomp_data&, void *aux) { |
| + // TrapFnc functions report error by following the native kernel convention |
| + // of returning an exit code in the range of -1..-4096. They do not try to |
| + // set errno themselves. The glibc wrapper that triggered the SIGSYS will |
| + // ultimately do so for us. |
| + int err = reinterpret_cast<intptr_t>(aux) & SECCOMP_RET_DATA; |
| + return -err; |
| } |
| intptr_t Sandbox::bpfFailure(const struct arch_seccomp_data&, void *aux) { |
| @@ -646,10 +880,10 @@ ErrorCode Sandbox::Kill(const char *msg) { |
| Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN; |
| int Sandbox::proc_fd_ = -1; |
| Sandbox::Evaluators Sandbox::evaluators_; |
| -Sandbox::ErrMap Sandbox::errMap_; |
| Sandbox::Traps *Sandbox::traps_ = NULL; |
| Sandbox::TrapIds Sandbox::trapIds_; |
| ErrorCode *Sandbox::trapArray_ = NULL; |
| size_t Sandbox::trapArraySize_ = 0; |
| + bool Sandbox::has_unsafe_traps_ = false; |
| } // namespace |