| Index: sandbox/linux/seccomp-bpf/sandbox_bpf.cc
|
| diff --git a/sandbox/linux/seccomp-bpf/sandbox_bpf.cc b/sandbox/linux/seccomp-bpf/sandbox_bpf.cc
|
| index eb03995904f362cf589289bc7139577a472958e0..17ab4d25b5b289924b8f0de2373651b41d995b5f 100644
|
| --- a/sandbox/linux/seccomp-bpf/sandbox_bpf.cc
|
| +++ b/sandbox/linux/seccomp-bpf/sandbox_bpf.cc
|
| @@ -4,6 +4,7 @@
|
|
|
| #include "sandbox/linux/seccomp-bpf/codegen.h"
|
| #include "sandbox/linux/seccomp-bpf/sandbox_bpf.h"
|
| +#include "sandbox/linux/seccomp-bpf/syscall.h"
|
| #include "sandbox/linux/seccomp-bpf/syscall_iterator.h"
|
| #include "sandbox/linux/seccomp-bpf/verifier.h"
|
|
|
| @@ -319,6 +320,48 @@ void Sandbox::policySanityChecks(EvaluateSyscall syscallEvaluator,
|
| return;
|
| }
|
|
|
| +void Sandbox::CheckForUnsafeErrorCodes(Instruction *insn, void *aux) {
|
| + if (BPF_CLASS(insn->code) == BPF_RET &&
|
| + insn->k > SECCOMP_RET_TRAP &&
|
| + insn->k <= SECCOMP_RET_TRAP+trapArraySize_) {
|
| + const ErrorCode& err = trapArray_[insn->k - SECCOMP_RET_TRAP - 1];
|
| + if (!err.safe_) {
|
| + bool *is_unsafe = static_cast<bool *>(aux);
|
| + *is_unsafe = true;
|
| + }
|
| + }
|
| +}
|
| +
|
| +void Sandbox::RedirectToUserspace(Instruction *insn, void *aux) {
|
| + // When inside an UnsafeTrap() callback, we want to allow all system calls.
|
| + // This means, we must conditionally disable the sandbox -- and that's not
|
| + // something that kernel-side BPF filters can do, as they cannot inspect
|
| + // any state other than the syscall arguments.
|
| + // But if we redirect all error handlers to user-space, then we can easily
|
| + // make this decision.
|
| + // The performance penalty for this extra round-trip to user-space is not
|
| + // actually that bad, as we only ever pay it for denied system calls; and a
|
| + // typical program has very few of these.
|
| + if (BPF_CLASS(insn->code) == BPF_RET &&
|
| + (insn->k & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO) {
|
| + insn->k = Trap(ReturnErrno,
|
| + reinterpret_cast<void *>(insn->k & SECCOMP_RET_DATA)).err();
|
| + }
|
| +}
|
| +
|
| +ErrorCode Sandbox::RedirectToUserspaceEvalWrapper(int sysnum, void *aux) {
|
| + // We need to replicate the behavior of RedirectToUserspace(), so that our
|
| + // Verifier can still work correctly.
|
| + Evaluators *evaluators = reinterpret_cast<Evaluators *>(aux);
|
| + const std::pair<EvaluateSyscall, void *>& evaluator = *evaluators->begin();
|
| + ErrorCode err = evaluator.first(sysnum, evaluator.second);
|
| + if ((err.err() & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO) {
|
| + return Trap(ReturnErrno,
|
| + reinterpret_cast<void *>(err.err() & SECCOMP_RET_DATA));
|
| + }
|
| + return err;
|
| +}
|
| +
|
| void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator, void *aux) {
|
| if (status_ == STATUS_ENABLED) {
|
| SANDBOX_DIE("Cannot change policy after sandbox has started");
|
| @@ -337,8 +380,8 @@ void Sandbox::installFilter(bool quiet) {
|
| // Set new SIGSYS handler
|
| struct sigaction sa;
|
| memset(&sa, 0, sizeof(sa));
|
| - sa.sa_sigaction = &sigSys;
|
| - sa.sa_flags = SA_SIGINFO;
|
| + sa.sa_sigaction = sigSys;
|
| + sa.sa_flags = SA_SIGINFO | SA_NODEFER;
|
| if (sigaction(SIGSYS, &sa, NULL) < 0) {
|
| goto filter_failed;
|
| }
|
| @@ -369,33 +412,13 @@ void Sandbox::installFilter(bool quiet) {
|
| Instruction *head =
|
| gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
|
| offsetof(struct arch_seccomp_data, arch),
|
| - gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH,
|
| tail =
|
| - // Grab the system call number, so that we can implement jump tables.
|
| - gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
|
| - offsetof(struct arch_seccomp_data, nr)),
|
| + gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH,
|
| + NULL,
|
| gen->MakeInstruction(BPF_RET+BPF_K,
|
| Kill(
|
| "Invalid audit architecture in BPF filter").err_)));
|
|
|
| - // On Intel architectures, verify that system call numbers are in the
|
| - // expected number range. The older i386 and x86-64 APIs clear bit 30
|
| - // on all system calls. The newer x32 API always sets bit 30.
|
| -#if defined(__i386__) || defined(__x86_64__)
|
| - Instruction *invalidX32 =
|
| - gen->MakeInstruction(BPF_RET+BPF_K,
|
| - Kill("Illegal mixing of system call ABIs").err_);
|
| - Instruction *checkX32 =
|
| -#if defined(__x86_64__) && defined(__ILP32__)
|
| - gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, invalidX32);
|
| -#else
|
| - gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, invalidX32, 0);
|
| -#endif
|
| - gen->JoinInstructions(tail, checkX32);
|
| - tail = checkX32;
|
| -#endif
|
| -
|
| -
|
| {
|
| // Evaluate all possible system calls and group their ErrorCodes into
|
| // ranges of identical codes.
|
| @@ -406,6 +429,103 @@ void Sandbox::installFilter(bool quiet) {
|
| Instruction *jumptable =
|
| assembleJumpTable(gen, ranges.begin(), ranges.end());
|
|
|
| + // If there is at least one UnsafeTrap() in our program, the entire sandbox
|
| + // is unsafe. We need to modify the program so that all non-
|
| + // SECCOMP_RET_ALLOW ErrorCodes are handled in user-space. This will then
|
| + // allow us to temporarily disable sandboxing rules inside of callbacks to
|
| + // UnsafeTrap().
|
| + has_unsafe_traps_ = false;
|
| + gen->Traverse(jumptable, CheckForUnsafeErrorCodes, &has_unsafe_traps_);
|
| +
|
| + // Grab the system call number, so that we can implement jump tables.
|
| + Instruction *load_nr =
|
| + gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
|
| + offsetof(struct arch_seccomp_data, nr));
|
| +
|
| + // If our BPF program has unsafe jumps, enable support for them. This
|
| + // test happens very early in the BPF filter program. Even before we
|
| + // consider looking at system call numbers.
|
| + // As support for unsafe jumps essentially defeats all the security
|
| + // measures that the sandbox provides, we print a big warning message --
|
| + // and of course, we make sure to only ever enable this feature if it
|
| + // is actually requested by the sandbox policy.
|
| + if (has_unsafe_traps_) {
|
| + EvaluateSyscall evaluateSyscall = evaluators_.begin()->first;
|
| + void *aux = evaluators_.begin()->second;
|
| + if (!evaluateSyscall(__NR_rt_sigprocmask, aux).
|
| + Equals(ErrorCode(ErrorCode::ERR_ALLOWED)) ||
|
| + !evaluateSyscall(__NR_rt_sigreturn, aux).
|
| + Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
|
| +#if defined(__NR_sigprocmask)
|
| + || !evaluateSyscall(__NR_sigprocmask, aux).
|
| + Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
|
| +#endif
|
| +#if defined(__NR_sigreturn)
|
| + || !evaluateSyscall(__NR_sigreturn, aux).
|
| + Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
|
| +#endif
|
| + ) {
|
| + SANDBOX_DIE("Invalid seccomp policy; if using UnsafeTrap(), you must "
|
| + "unconditionally allow sigreturn() and sigprocmask()");
|
| + }
|
| +
|
| + SANDBOX_INFO("WARNING! Disabling sandbox for debugging purposes");
|
| + gen->Traverse(jumptable, RedirectToUserspace, NULL);
|
| +
|
| + // Allow system calls, if they originate from our magic return address
|
| + // (which we can query by calling Syscall(-1)).
|
| + uintptr_t syscall_entry_point = static_cast<uintptr_t>(Syscall(-1));
|
| + uint32_t low = static_cast<uint32_t>(syscall_entry_point);
|
| +#if __SIZEOF_POINTER__ > 4
|
| + uint32_t hi = static_cast<uint32_t>(syscall_entry_point >> 32);
|
| +#endif
|
| +
|
| + // BPF cannot do native 64bit comparisons. On 64bit architectures, we
|
| + // have to compare both 32bit halves of the instruction pointer. If they
|
| + // match what we expect, we return ERR_ALLOWED. If either or both don't
|
| + // match, we continue evalutating the rest of the sandbox policy.
|
| + Instruction *escape_hatch =
|
| + gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
|
| + offsetof(struct arch_seccomp_data,
|
| + instruction_pointer) +
|
| + (__SIZEOF_POINTER__ > 4 &&
|
| + __BYTE_ORDER == __BIG_ENDIAN ? 4 : 0),
|
| + gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, low,
|
| +#if __SIZEOF_POINTER__ > 4
|
| + gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
|
| + offsetof(struct arch_seccomp_data,
|
| + instruction_pointer) +
|
| + (__BYTE_ORDER == __BIG_ENDIAN ? 0 : 4),
|
| + gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, hi,
|
| +#endif
|
| + gen->MakeInstruction(BPF_RET+BPF_K, ErrorCode(ErrorCode::ERR_ALLOWED)),
|
| +#if __SIZEOF_POINTER__ > 4
|
| + load_nr)),
|
| +#endif
|
| + load_nr));
|
| + gen->JoinInstructions(tail, escape_hatch);
|
| + } else {
|
| + gen->JoinInstructions(tail, load_nr);
|
| + }
|
| + tail = load_nr;
|
| +
|
| + // On Intel architectures, verify that system call numbers are in the
|
| + // expected number range. The older i386 and x86-64 APIs clear bit 30
|
| + // on all system calls. The newer x32 API always sets bit 30.
|
| +#if defined(__i386__) || defined(__x86_64__)
|
| + Instruction *invalidX32 =
|
| + gen->MakeInstruction(BPF_RET+BPF_K,
|
| + Kill("Illegal mixing of system call ABIs").err_);
|
| + Instruction *checkX32 =
|
| +#if defined(__x86_64__) && defined(__ILP32__)
|
| + gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, invalidX32);
|
| +#else
|
| + gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, invalidX32, 0);
|
| +#endif
|
| + gen->JoinInstructions(tail, checkX32);
|
| + tail = checkX32;
|
| +#endif
|
| +
|
| // Append jump table to our pre-amble
|
| gen->JoinInstructions(tail, jumptable);
|
| }
|
| @@ -419,9 +539,20 @@ void Sandbox::installFilter(bool quiet) {
|
| // correctly. Otherwise, there is an internal error in our BPF compiler.
|
| // There is really nothing the caller can do until the bug is fixed.
|
| #ifndef NDEBUG
|
| - const char *err = NULL;
|
| - if (!Verifier::VerifyBPF(*program, evaluators_, &err)) {
|
| - SANDBOX_DIE(err);
|
| + {
|
| + // If we previously rewrote the BPF program so that it calls user-space
|
| + // whenever we return an "errno" value from the filter, then we have to
|
| + // wrap our system call evaluator to perform the same operation. Otherwise,
|
| + // the verifier would also report a mismatch in return codes.
|
| + Evaluators redirected_evaluators;
|
| + redirected_evaluators.push_back(
|
| + std::make_pair(RedirectToUserspaceEvalWrapper, &evaluators_));
|
| +
|
| + const char *err = NULL;
|
| + if (!Verifier::VerifyBPF(*program,
|
| + has_unsafe_traps_ ? redirected_evaluators : evaluators_, &err)) {
|
| + SANDBOX_DIE(err);
|
| + }
|
| }
|
| #endif
|
|
|
| @@ -444,7 +575,6 @@ void Sandbox::installFilter(bool quiet) {
|
|
|
| // Release memory that is no longer needed
|
| evaluators_.clear();
|
| - errMap_.clear();
|
|
|
| #if defined(SECCOMP_BPF_VALGRIND_HACKS)
|
| // Valgrind is really not happy about our sandbox. Disable it when running
|
| @@ -561,27 +691,53 @@ void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) {
|
| goto sigsys_err;
|
| }
|
|
|
| - // Copy the seccomp-specific data into a arch_seccomp_data structure. This
|
| - // is what we are showing to TrapFnc callbacks that the system call evaluator
|
| - // registered with the sandbox.
|
| - struct arch_seccomp_data data = {
|
| - sigsys.nr,
|
| - SECCOMP_ARCH,
|
| - reinterpret_cast<uint64_t>(sigsys.ip),
|
| - {
|
| - static_cast<uint64_t>(SECCOMP_PARM1(ctx)),
|
| - static_cast<uint64_t>(SECCOMP_PARM2(ctx)),
|
| - static_cast<uint64_t>(SECCOMP_PARM3(ctx)),
|
| - static_cast<uint64_t>(SECCOMP_PARM4(ctx)),
|
| - static_cast<uint64_t>(SECCOMP_PARM5(ctx)),
|
| - static_cast<uint64_t>(SECCOMP_PARM6(ctx))
|
| + // We need to tell whether we are performing a "normal" callback, or
|
| + // whether we were called recursively from within an UnsafeTrap() callback.
|
| + // This is a little tricky to do, because we need to somehow get access to
|
| + // per-thread data from within a signal context. Normal TLS storage is not
|
| + // safely accessible at this time. We could roll our own, but that involves
|
| + // a lot of complexity. Instead, we co-opt one bit in the signal mask.
|
| + // If SIGBUS is blocked, we assume that we have been called recursively.
|
| + // There is a possibility for collision with other code that needs to do
|
| + // this, but in practice the risks are low.
|
| + intptr_t rc;
|
| + if (has_unsafe_traps_ &&
|
| + sigismember(&ctx->uc_sigmask, SIGBUS)) {
|
| + errno = old_errno;
|
| + rc = Syscall(sigsys.nr,
|
| + SECCOMP_PARM1(ctx), SECCOMP_PARM2(ctx),
|
| + SECCOMP_PARM3(ctx), SECCOMP_PARM4(ctx),
|
| + SECCOMP_PARM5(ctx), SECCOMP_PARM6(ctx));
|
| + } else {
|
| + const ErrorCode& err = trapArray_[info->si_errno - 1];
|
| + if (!err.safe_) {
|
| + sigset_t mask;
|
| + sigemptyset(&mask);
|
| + sigaddset(&mask, SIGBUS);
|
| + sigprocmask(SIG_BLOCK, &mask, NULL);
|
| }
|
| - };
|
|
|
| - // Now call the TrapFnc callback associated with this particular instance
|
| - // of SECCOMP_RET_TRAP.
|
| - const ErrorCode& err = trapArray_[info->si_errno - 1];
|
| - intptr_t rc = err.fnc_(data, err.aux_);
|
| + // Copy the seccomp-specific data into an arch_seccomp_data structure. This
|
| + // is what we are showing to TrapFnc callbacks that the system call
|
| + // evaluator registered with the sandbox.
|
| + struct arch_seccomp_data data = {
|
| + sigsys.nr,
|
| + SECCOMP_ARCH,
|
| + reinterpret_cast<uint64_t>(sigsys.ip),
|
| + {
|
| + static_cast<uint64_t>(SECCOMP_PARM1(ctx)),
|
| + static_cast<uint64_t>(SECCOMP_PARM2(ctx)),
|
| + static_cast<uint64_t>(SECCOMP_PARM3(ctx)),
|
| + static_cast<uint64_t>(SECCOMP_PARM4(ctx)),
|
| + static_cast<uint64_t>(SECCOMP_PARM5(ctx)),
|
| + static_cast<uint64_t>(SECCOMP_PARM6(ctx))
|
| + }
|
| + };
|
| +
|
| + // Now call the TrapFnc callback associated with this particular instance
|
| + // of SECCOMP_RET_TRAP.
|
| + rc = err.fnc_(data, err.aux_);
|
| + }
|
|
|
| // Update the CPU register that stores the return code of the system call
|
| // that we just handled, and restore "errno" to the value that it had
|
| @@ -592,10 +748,21 @@ void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) {
|
| return;
|
| }
|
|
|
| -ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) {
|
| +bool Sandbox::TrapKey::operator<(const Sandbox::TrapKey& o) const {
|
| + if (fnc != o.fnc) {
|
| + return fnc < o.fnc;
|
| + } else if (aux != o.aux) {
|
| + return aux < o.aux;
|
| + } else {
|
| + return safe < o.safe;
|
| + }
|
| +}
|
| +
|
| +ErrorCode Sandbox::MakeTrap(ErrorCode::TrapFnc fnc, const void *aux,
|
| + bool safe) {
|
| // Each unique pair of TrapFnc and auxiliary data make up a distinct instance
|
| // of a SECCOMP_RET_TRAP.
|
| - std::pair<ErrorCode::TrapFnc, const void *> key(fnc, aux);
|
| + TrapKey key(fnc, aux, safe);
|
| TrapIds::const_iterator iter = trapIds_.find(key);
|
| uint16_t id;
|
| if (iter != trapIds_.end()) {
|
| @@ -618,7 +785,7 @@ ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) {
|
| }
|
| id = traps_->size() + 1;
|
|
|
| - traps_->push_back(ErrorCode(fnc, aux, id));
|
| + traps_->push_back(ErrorCode(fnc, aux, safe, id));
|
| trapIds_[key] = id;
|
|
|
| // We want to access the traps_ vector from our signal handler. But
|
| @@ -629,10 +796,23 @@ ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) {
|
| // signal handler, where we can safely do so.
|
| trapArray_ = &(*traps_)[0];
|
| trapArraySize_ = id;
|
| + return traps_->back();
|
| }
|
|
|
| - ErrorCode err = ErrorCode(fnc, aux, id);
|
| - return errMap_[err.err()] = err;
|
| + return ErrorCode(fnc, aux, safe, id);
|
| +}
|
| +
|
| +ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) {
|
| + return MakeTrap(fnc, aux, true);
|
| +}
|
| +
|
| +ErrorCode Sandbox::UnsafeTrap(ErrorCode::TrapFnc fnc, const void *aux) {
|
| + return MakeTrap(fnc, aux, false);
|
| +}
|
| +
|
| +intptr_t Sandbox::ReturnErrno(const struct arch_seccomp_data&, void *aux) {
|
| + int err = reinterpret_cast<intptr_t>(aux) & SECCOMP_RET_DATA;
|
| + return -err;
|
| }
|
|
|
| intptr_t Sandbox::bpfFailure(const struct arch_seccomp_data&, void *aux) {
|
| @@ -646,10 +826,10 @@ ErrorCode Sandbox::Kill(const char *msg) {
|
| Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN;
|
| int Sandbox::proc_fd_ = -1;
|
| Sandbox::Evaluators Sandbox::evaluators_;
|
| -Sandbox::ErrMap Sandbox::errMap_;
|
| Sandbox::Traps *Sandbox::traps_ = NULL;
|
| Sandbox::TrapIds Sandbox::trapIds_;
|
| ErrorCode *Sandbox::trapArray_ = NULL;
|
| size_t Sandbox::trapArraySize_ = 0;
|
| + bool Sandbox::has_unsafe_traps_ = false;
|
|
|
| } // namespace
|
|
|