Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(217)

Side by Side Diff: sandbox/linux/seccomp-bpf/sandbox_bpf.cc

Issue 11419121: SECCOMP-BPF: Added support for greylisting of system calls. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: More unittest coverage Created 8 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include <endian.h>
6 #if __BYTE_ORDER == __BIG_ENDIAN
7 // The BPF "struct seccomp_data" layout has to deal with storing 64bit
8 // values that need to be inspected by a virtual machine that only ever
9 // operates on 32bit values. The kernel developers decided how values
10 // should be split into two 32bit words to achieve this goal. But at this
11 // time, there is no existing BPF implementation in the kernel that uses
12 // 64bit big endian values. So, all we have to go by is the consensus
13 // from a discussion on LKML. Actual implementations, if and when they 13 // from a discussion on LKML. Actual implementations, if and when they
14 // happen, might very well differ.
15 // If this code is ever going to be used with such a kernel, you should
16 // disable the "#error" and carefully test the code (e.g. run the unit
17 // tests). If things don't work, search for all occurrences of __BYTE_ORDER
18 // and verify that the proposed implementation agrees with what the kernel
19 // actually does.
20 #error Big endian operation is untested and expected to be broken
21 #endif
22
5 #include "sandbox/linux/seccomp-bpf/codegen.h" 23 #include "sandbox/linux/seccomp-bpf/codegen.h"
6 #include "sandbox/linux/seccomp-bpf/sandbox_bpf.h" 24 #include "sandbox/linux/seccomp-bpf/sandbox_bpf.h"
25 #include "sandbox/linux/seccomp-bpf/syscall.h"
7 #include "sandbox/linux/seccomp-bpf/syscall_iterator.h" 26 #include "sandbox/linux/seccomp-bpf/syscall_iterator.h"
8 #include "sandbox/linux/seccomp-bpf/verifier.h" 27 #include "sandbox/linux/seccomp-bpf/verifier.h"
9 28
10 namespace { 29 namespace {
11 30
12 void WriteFailedStderrSetupMessage(int out_fd) { 31 void WriteFailedStderrSetupMessage(int out_fd) {
13 const char* error_string = strerror(errno); 32 const char* error_string = strerror(errno);
14 static const char msg[] = "Failed to set up stderr: "; 33 static const char msg[] = "Failed to set up stderr: ";
15 if (HANDLE_EINTR(write(out_fd, msg, sizeof(msg)-1)) > 0 && error_string && 34 if (HANDLE_EINTR(write(out_fd, msg, sizeof(msg)-1)) > 0 && error_string &&
16 HANDLE_EINTR(write(out_fd, error_string, strlen(error_string))) > 0 && 35 HANDLE_EINTR(write(out_fd, error_string, strlen(error_string))) > 0 &&
17 HANDLE_EINTR(write(out_fd, "\n", 1))) { 36 HANDLE_EINTR(write(out_fd, "\n", 1))) {
18 } 37 }
19 } 38 }
20 39
// We need to tell whether we are performing a "normal" callback, or
// whether we were called recursively from within an UnsafeTrap() callback.
// This is a little tricky to do, because we need to somehow get access to
// per-thread data from within a signal context. Normal TLS storage is not
// safely accessible at this time. We could roll our own, but that involves
// a lot of complexity. Instead, we co-opt one bit in the signal mask.
// If SIGBUS is blocked, we assume that we have been called recursively.
// There is a possibility for collision with other code that needs to do
// this, but in practice the risks are low.
// If SIGBUS turns out to be a problem, we could instead co-opt one of the
// realtime signals. There are plenty of them. Unfortunately, there is no
// way to mark a signal as allocated. So, the potential for collision is
// possibly even worse.
bool GetIsInSigHandler(const ucontext_t *ctx) {
  // A blocked SIGBUS in the interrupted context is our marker that we are
  // nested inside an (unsafe) trap handler.
  return sigismember(&ctx->uc_sigmask, SIGBUS) != 0;
}
56
// Marks the current thread as being inside a signal handler by blocking
// SIGBUS in the thread's signal mask. GetIsInSigHandler() reads this
// marker back from the saved ucontext.
void SetIsInSigHandler() {
  sigset_t sigbus_mask;
  sigemptyset(&sigbus_mask);
  sigaddset(&sigbus_mask, SIGBUS);
  sigprocmask(SIG_BLOCK, &sigbus_mask, NULL);
}
63
21 } // namespace 64 } // namespace
22 65
23 // The kernel gives us a sandbox, we turn it into a playground :-) 66 // The kernel gives us a sandbox, we turn it into a playground :-)
24 // This is version 2 of the playground; version 1 was built on top of 67 // This is version 2 of the playground; version 1 was built on top of
25 // pre-BPF seccomp mode. 68 // pre-BPF seccomp mode.
26 namespace playground2 { 69 namespace playground2 {
27 70
28 const int kExpectedExitCode = 100; 71 const int kExpectedExitCode = 100;
29 72
30 // We define a really simple sandbox policy. It is just good enough for us 73 // We define a really simple sandbox policy. It is just good enough for us
(...skipping 281 matching lines...) Expand 10 before | Expand all | Expand 10 after
312 for (SyscallIterator iter(true); !iter.Done(); ) { 355 for (SyscallIterator iter(true); !iter.Done(); ) {
313 uint32_t sysnum = iter.Next(); 356 uint32_t sysnum = iter.Next();
314 if (!isDenied(syscallEvaluator(sysnum, aux))) { 357 if (!isDenied(syscallEvaluator(sysnum, aux))) {
315 SANDBOX_DIE("Policies should deny system calls that are outside the " 358 SANDBOX_DIE("Policies should deny system calls that are outside the "
316 "expected range (typically MIN_SYSCALL..MAX_SYSCALL)"); 359 "expected range (typically MIN_SYSCALL..MAX_SYSCALL)");
317 } 360 }
318 } 361 }
319 return; 362 return;
320 } 363 }
321 364
365 void Sandbox::CheckForUnsafeErrorCodes(Instruction *insn, void *aux) {
366 if (BPF_CLASS(insn->code) == BPF_RET &&
367 insn->k > SECCOMP_RET_TRAP &&
368 insn->k - SECCOMP_RET_TRAP <= trapArraySize_) {
369 const ErrorCode& err = trapArray_[insn->k - SECCOMP_RET_TRAP - 1];
370 if (!err.safe_) {
371 bool *is_unsafe = static_cast<bool *>(aux);
372 *is_unsafe = true;
373 }
374 }
375 }
376
377 void Sandbox::RedirectToUserspace(Instruction *insn, void *aux) {
378 // When inside an UnsafeTrap() callback, we want to allow all system calls.
379 // This means, we must conditionally disable the sandbox -- and that's not
380 // something that kernel-side BPF filters can do, as they cannot inspect
381 // any state other than the syscall arguments.
382 // But if we redirect all error handlers to user-space, then we can easily
383 // make this decision.
384 // The performance penalty for this extra round-trip to user-space is not
385 // actually that bad, as we only ever pay it for denied system calls; and a
386 // typical program has very few of these.
387 if (BPF_CLASS(insn->code) == BPF_RET &&
388 (insn->k & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO) {
389 insn->k = Trap(ReturnErrno,
390 reinterpret_cast<void *>(insn->k & SECCOMP_RET_DATA)).err();
391 }
392 }
393
394 ErrorCode Sandbox::RedirectToUserspaceEvalWrapper(int sysnum, void *aux) {
395 // We need to replicate the behavior of RedirectToUserspace(), so that our
396 // Verifier can still work correctly.
397 Evaluators *evaluators = reinterpret_cast<Evaluators *>(aux);
398 const std::pair<EvaluateSyscall, void *>& evaluator = *evaluators->begin();
399 ErrorCode err = evaluator.first(sysnum, evaluator.second);
400 if ((err.err() & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO) {
401 return Trap(ReturnErrno,
402 reinterpret_cast<void *>(err.err() & SECCOMP_RET_DATA));
403 }
404 return err;
405 }
406
322 void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator, void *aux) { 407 void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator, void *aux) {
323 if (status_ == STATUS_ENABLED) { 408 if (status_ == STATUS_ENABLED) {
324 SANDBOX_DIE("Cannot change policy after sandbox has started"); 409 SANDBOX_DIE("Cannot change policy after sandbox has started");
325 } 410 }
326 policySanityChecks(syscallEvaluator, aux); 411 policySanityChecks(syscallEvaluator, aux);
327 evaluators_.push_back(std::make_pair(syscallEvaluator, aux)); 412 evaluators_.push_back(std::make_pair(syscallEvaluator, aux));
328 } 413 }
329 414
330 void Sandbox::installFilter(bool quiet) { 415 void Sandbox::installFilter(bool quiet) {
331 // Verify that the user pushed a policy. 416 // Verify that the user pushed a policy.
332 if (evaluators_.empty()) { 417 if (evaluators_.empty()) {
333 filter_failed: 418 filter_failed:
334 SANDBOX_DIE("Failed to configure system call filters"); 419 SANDBOX_DIE("Failed to configure system call filters");
335 } 420 }
336 421
337 // Set new SIGSYS handler 422 // Set new SIGSYS handler
338 struct sigaction sa; 423 struct sigaction sa;
339 memset(&sa, 0, sizeof(sa)); 424 memset(&sa, 0, sizeof(sa));
340 sa.sa_sigaction = &sigSys; 425 sa.sa_sigaction = sigSys;
341 sa.sa_flags = SA_SIGINFO; 426 sa.sa_flags = SA_SIGINFO | SA_NODEFER;
342 if (sigaction(SIGSYS, &sa, NULL) < 0) { 427 if (sigaction(SIGSYS, &sa, NULL) < 0) {
343 goto filter_failed; 428 goto filter_failed;
344 } 429 }
345 430
346 // Unmask SIGSYS 431 // Unmask SIGSYS
347 sigset_t mask; 432 sigset_t mask;
348 if (sigemptyset(&mask) || 433 if (sigemptyset(&mask) ||
349 sigaddset(&mask, SIGSYS) || 434 sigaddset(&mask, SIGSYS) ||
350 sigprocmask(SIG_UNBLOCK, &mask, NULL)) { 435 sigprocmask(SIG_UNBLOCK, &mask, NULL)) {
351 goto filter_failed; 436 goto filter_failed;
(...skipping 10 matching lines...) Expand all
362 if (!gen) { 447 if (!gen) {
363 SANDBOX_DIE("Out of memory"); 448 SANDBOX_DIE("Out of memory");
364 } 449 }
365 450
366 // If the architecture doesn't match SECCOMP_ARCH, disallow the 451 // If the architecture doesn't match SECCOMP_ARCH, disallow the
367 // system call. 452 // system call.
368 Instruction *tail; 453 Instruction *tail;
369 Instruction *head = 454 Instruction *head =
370 gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, 455 gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
371 offsetof(struct arch_seccomp_data, arch), 456 offsetof(struct arch_seccomp_data, arch),
457 tail =
372 gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 458 gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH,
373 tail = 459 NULL,
374 // Grab the system call number, so that we can implement jump tables.
375 gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
376 offsetof(struct arch_seccomp_data, nr)),
377 gen->MakeInstruction(BPF_RET+BPF_K, 460 gen->MakeInstruction(BPF_RET+BPF_K,
378 Kill( 461 Kill(
379 "Invalid audit architecture in BPF filter").err_))); 462 "Invalid audit architecture in BPF filter").err_)));
380 463
381 // On Intel architectures, verify that system call numbers are in the
382 // expected number range. The older i386 and x86-64 APIs clear bit 30
383 // on all system calls. The newer x32 API always sets bit 30.
384 #if defined(__i386__) || defined(__x86_64__)
385 Instruction *invalidX32 =
386 gen->MakeInstruction(BPF_RET+BPF_K,
387 Kill("Illegal mixing of system call ABIs").err_);
388 Instruction *checkX32 =
389 #if defined(__x86_64__) && defined(__ILP32__)
390 gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, invalidX32);
391 #else
392 gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, invalidX32, 0);
393 #endif
394 gen->JoinInstructions(tail, checkX32);
395 tail = checkX32;
396 #endif
397
398
399 { 464 {
400 // Evaluate all possible system calls and group their ErrorCodes into 465 // Evaluate all possible system calls and group their ErrorCodes into
401 // ranges of identical codes. 466 // ranges of identical codes.
402 Ranges ranges; 467 Ranges ranges;
403 findRanges(&ranges); 468 findRanges(&ranges);
404 469
405 // Compile the system call ranges to an optimized BPF jumptable 470 // Compile the system call ranges to an optimized BPF jumptable
406 Instruction *jumptable = 471 Instruction *jumptable =
407 assembleJumpTable(gen, ranges.begin(), ranges.end()); 472 assembleJumpTable(gen, ranges.begin(), ranges.end());
408 473
474 // If there is at least one UnsafeTrap() in our program, the entire sandbox
475 // is unsafe. We need to modify the program so that all non-
476 // SECCOMP_RET_ALLOW ErrorCodes are handled in user-space. This will then
477 // allow us to temporarily disable sandboxing rules inside of callbacks to
478 // UnsafeTrap().
479 has_unsafe_traps_ = false;
480 gen->Traverse(jumptable, CheckForUnsafeErrorCodes, &has_unsafe_traps_);
481
482 // Grab the system call number, so that we can implement jump tables.
483 Instruction *load_nr =
484 gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
485 offsetof(struct arch_seccomp_data, nr));
486
487 // If our BPF program has unsafe jumps, enable support for them. This
488 // test happens very early in the BPF filter program. Even before we
489 // consider looking at system call numbers.
490 // As support for unsafe jumps essentially defeats all the security
491 // measures that the sandbox provides, we print a big warning message --
492 // and of course, we make sure to only ever enable this feature if it
493 // is actually requested by the sandbox policy.
494 if (has_unsafe_traps_) {
495 if (SandboxSyscall(-1) == -1 && errno == ENOSYS) {
496 SANDBOX_DIE("Support for UnsafeTrap() has not yet been ported to this "
497 "architecture");
498 }
499
500 EvaluateSyscall evaluateSyscall = evaluators_.begin()->first;
501 void *aux = evaluators_.begin()->second;
502 if (!evaluateSyscall(__NR_rt_sigprocmask, aux).
503 Equals(ErrorCode(ErrorCode::ERR_ALLOWED)) ||
504 !evaluateSyscall(__NR_rt_sigreturn, aux).
505 Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
506 #if defined(__NR_sigprocmask)
507 || !evaluateSyscall(__NR_sigprocmask, aux).
508 Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
509 #endif
510 #if defined(__NR_sigreturn)
511 || !evaluateSyscall(__NR_sigreturn, aux).
512 Equals(ErrorCode(ErrorCode::ERR_ALLOWED))
513 #endif
514 ) {
515 SANDBOX_DIE("Invalid seccomp policy; if using UnsafeTrap(), you must "
516 "unconditionally allow sigreturn() and sigprocmask()");
517 }
518
519 SANDBOX_INFO("WARNING! Disabling sandbox for debugging purposes");
520 gen->Traverse(jumptable, RedirectToUserspace, NULL);
521
522 // Allow system calls, if they originate from our magic return address
523 // (which we can query by calling SandboxSyscall(-1)).
524 uintptr_t syscall_entry_point =
525 static_cast<uintptr_t>(SandboxSyscall(-1));
526 uint32_t low = static_cast<uint32_t>(syscall_entry_point);
527 #if __SIZEOF_POINTER__ > 4
528 uint32_t hi = static_cast<uint32_t>(syscall_entry_point >> 32);
529 #endif
530
531 // BPF cannot do native 64bit comparisons. On 64bit architectures, we
532 // have to compare both 32bit halfs of the instruction pointer. If they
533 // match what we expect, we return ERR_ALLOWED. If either or both don't
534 // match, we continue evalutating the rest of the sandbox policy.
535 Instruction *escape_hatch =
536 gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
537 offsetof(struct arch_seccomp_data,
538 instruction_pointer) +
539 (__SIZEOF_POINTER__ > 4 &&
540 __BYTE_ORDER == __BIG_ENDIAN ? 4 : 0),
541 gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, low,
542 #if __SIZEOF_POINTER__ > 4
543 gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS,
544 offsetof(struct arch_seccomp_data,
545 instruction_pointer) +
546 (__BYTE_ORDER == __BIG_ENDIAN ? 0 : 4),
547 gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, hi,
548 #endif
549 gen->MakeInstruction(BPF_RET+BPF_K, ErrorCode(ErrorCode::ERR_ALLOWED)),
550 #if __SIZEOF_POINTER__ > 4
551 load_nr)),
552 #endif
553 load_nr));
554 gen->JoinInstructions(tail, escape_hatch);
555 } else {
556 gen->JoinInstructions(tail, load_nr);
557 }
558 tail = load_nr;
559
560 // On Intel architectures, verify that system call numbers are in the
561 // expected number range. The older i386 and x86-64 APIs clear bit 30
562 // on all system calls. The newer x32 API always sets bit 30.
563 #if defined(__i386__) || defined(__x86_64__)
564 Instruction *invalidX32 =
565 gen->MakeInstruction(BPF_RET+BPF_K,
566 Kill("Illegal mixing of system call ABIs").err_);
567 Instruction *checkX32 =
568 #if defined(__x86_64__) && defined(__ILP32__)
569 gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, invalidX32);
570 #else
571 gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, invalidX32, 0);
572 #endif
573 gen->JoinInstructions(tail, checkX32);
574 tail = checkX32;
575 #endif
576
409 // Append jump table to our pre-amble 577 // Append jump table to our pre-amble
410 gen->JoinInstructions(tail, jumptable); 578 gen->JoinInstructions(tail, jumptable);
411 } 579 }
412 580
413 // Turn the DAG into a vector of instructions. 581 // Turn the DAG into a vector of instructions.
414 Program *program = new Program(); 582 Program *program = new Program();
415 gen->Compile(head, program); 583 gen->Compile(head, program);
416 delete gen; 584 delete gen;
417 585
418 // Make sure compilation resulted in BPF program that executes 586 // Make sure compilation resulted in BPF program that executes
419 // correctly. Otherwise, there is an internal error in our BPF compiler. 587 // correctly. Otherwise, there is an internal error in our BPF compiler.
420 // There is really nothing the caller can do until the bug is fixed. 588 // There is really nothing the caller can do until the bug is fixed.
421 #ifndef NDEBUG 589 #ifndef NDEBUG
422 const char *err = NULL; 590 {
423 if (!Verifier::VerifyBPF(*program, evaluators_, &err)) { 591 // If we previously rewrote the BPF program so that it calls user-space
424 SANDBOX_DIE(err); 592 // whenever we return an "errno" value from the filter, then we have to
593 // wrap our system call evaluator to perform the same operation. Otherwise,
594 // the verifier would also report a mismatch in return codes.
595 Evaluators redirected_evaluators;
596 redirected_evaluators.push_back(
597 std::make_pair(RedirectToUserspaceEvalWrapper, &evaluators_));
598
599 const char *err = NULL;
600 if (!Verifier::VerifyBPF(
601 *program,
602 has_unsafe_traps_ ? redirected_evaluators : evaluators_,
603 &err)) {
604 SANDBOX_DIE(err);
605 }
425 } 606 }
426 #endif 607 #endif
427 608
428 // We want to be very careful in not imposing any requirements on the 609 // We want to be very careful in not imposing any requirements on the
429 // policies that are set with setSandboxPolicy(). This means, as soon as 610 // policies that are set with setSandboxPolicy(). This means, as soon as
430 // the sandbox is active, we shouldn't be relying on libraries that could 611 // the sandbox is active, we shouldn't be relying on libraries that could
431 // be making system calls. This, for example, means we should avoid 612 // be making system calls. This, for example, means we should avoid
432 // using the heap and we should avoid using STL functions. 613 // using the heap and we should avoid using STL functions.
433 // Temporarily copy the contents of the "program" vector into a 614 // Temporarily copy the contents of the "program" vector into a
434 // stack-allocated array; and then explicitly destroy that object. 615 // stack-allocated array; and then explicitly destroy that object.
435 // This makes sure we don't ex- or implicitly call new/delete after we 616 // This makes sure we don't ex- or implicitly call new/delete after we
436 // installed the BPF filter program in the kernel. Depending on the 617 // installed the BPF filter program in the kernel. Depending on the
437 // system memory allocator that is in effect, these operators can result 618 // system memory allocator that is in effect, these operators can result
438 // in system calls to things like munmap() or brk(). 619 // in system calls to things like munmap() or brk().
439 struct sock_filter bpf[program->size()]; 620 struct sock_filter bpf[program->size()];
440 const struct sock_fprog prog = { 621 const struct sock_fprog prog = {
441 static_cast<unsigned short>(program->size()), bpf }; 622 static_cast<unsigned short>(program->size()), bpf };
442 memcpy(bpf, &(*program)[0], sizeof(bpf)); 623 memcpy(bpf, &(*program)[0], sizeof(bpf));
443 delete program; 624 delete program;
444 625
445 // Release memory that is no longer needed 626 // Release memory that is no longer needed
446 evaluators_.clear(); 627 evaluators_.clear();
447 errMap_.clear();
448 628
449 #if defined(SECCOMP_BPF_VALGRIND_HACKS) 629 #if defined(SECCOMP_BPF_VALGRIND_HACKS)
450 // Valgrind is really not happy about our sandbox. Disable it when running 630 // Valgrind is really not happy about our sandbox. Disable it when running
451 // in Valgrind. This feature is dangerous and should never be enabled by 631 // in Valgrind. This feature is dangerous and should never be enabled by
452 // default. We protect it behind a pre-processor option. 632 // default. We protect it behind a pre-processor option.
453 if (!RUNNING_ON_VALGRIND) 633 if (!RUNNING_ON_VALGRIND)
454 #endif 634 #endif
455 { 635 {
456 // Install BPF filter program 636 // Install BPF filter program
457 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { 637 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
(...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after
554 struct arch_sigsys sigsys; 734 struct arch_sigsys sigsys;
555 memcpy(&sigsys, &info->_sifields, sizeof(sigsys)); 735 memcpy(&sigsys, &info->_sifields, sizeof(sigsys));
556 736
557 // Some more sanity checks. 737 // Some more sanity checks.
558 if (sigsys.ip != reinterpret_cast<void *>(SECCOMP_IP(ctx)) || 738 if (sigsys.ip != reinterpret_cast<void *>(SECCOMP_IP(ctx)) ||
559 sigsys.nr != static_cast<int>(SECCOMP_SYSCALL(ctx)) || 739 sigsys.nr != static_cast<int>(SECCOMP_SYSCALL(ctx)) ||
560 sigsys.arch != SECCOMP_ARCH) { 740 sigsys.arch != SECCOMP_ARCH) {
561 goto sigsys_err; 741 goto sigsys_err;
562 } 742 }
563 743
564 // Copy the seccomp-specific data into a arch_seccomp_data structure. This 744 intptr_t rc;
565 // is what we are showing to TrapFnc callbacks that the system call evaluator 745 if (has_unsafe_traps_ && GetIsInSigHandler(ctx)) {
566 // registered with the sandbox. 746 errno = old_errno;
567 struct arch_seccomp_data data = { 747 if (sigsys.nr == __NR_clone) {
568 sigsys.nr, 748 SANDBOX_DIE("Cannot call clone() from an UnsafeTrap() handler");
569 SECCOMP_ARCH,
570 reinterpret_cast<uint64_t>(sigsys.ip),
571 {
572 static_cast<uint64_t>(SECCOMP_PARM1(ctx)),
573 static_cast<uint64_t>(SECCOMP_PARM2(ctx)),
574 static_cast<uint64_t>(SECCOMP_PARM3(ctx)),
575 static_cast<uint64_t>(SECCOMP_PARM4(ctx)),
576 static_cast<uint64_t>(SECCOMP_PARM5(ctx)),
577 static_cast<uint64_t>(SECCOMP_PARM6(ctx))
578 } 749 }
579 }; 750 rc = SandboxSyscall(sigsys.nr,
751 SECCOMP_PARM1(ctx), SECCOMP_PARM2(ctx),
752 SECCOMP_PARM3(ctx), SECCOMP_PARM4(ctx),
753 SECCOMP_PARM5(ctx), SECCOMP_PARM6(ctx));
754 } else {
755 const ErrorCode& err = trapArray_[info->si_errno - 1];
756 if (!err.safe_) {
757 SetIsInSigHandler();
758 }
580 759
581 // Now call the TrapFnc callback associated with this particular instance 760 // Copy the seccomp-specific data into a arch_seccomp_data structure. This
582 // of SECCOMP_RET_TRAP. 761 // is what we are showing to TrapFnc callbacks that the system call
583 const ErrorCode& err = trapArray_[info->si_errno - 1]; 762 // evaluator registered with the sandbox.
584 intptr_t rc = err.fnc_(data, err.aux_); 763 struct arch_seccomp_data data = {
764 sigsys.nr,
765 SECCOMP_ARCH,
766 reinterpret_cast<uint64_t>(sigsys.ip),
767 {
768 static_cast<uint64_t>(SECCOMP_PARM1(ctx)),
769 static_cast<uint64_t>(SECCOMP_PARM2(ctx)),
770 static_cast<uint64_t>(SECCOMP_PARM3(ctx)),
771 static_cast<uint64_t>(SECCOMP_PARM4(ctx)),
772 static_cast<uint64_t>(SECCOMP_PARM5(ctx)),
773 static_cast<uint64_t>(SECCOMP_PARM6(ctx))
774 }
775 };
776
777 // Now call the TrapFnc callback associated with this particular instance
778 // of SECCOMP_RET_TRAP.
779 rc = err.fnc_(data, err.aux_);
780 }
585 781
586 // Update the CPU register that stores the return code of the system call 782 // Update the CPU register that stores the return code of the system call
587 // that we just handled, and restore "errno" to the value that it had 783 // that we just handled, and restore "errno" to the value that it had
588 // before entering the signal handler. 784 // before entering the signal handler.
589 SECCOMP_RESULT(ctx) = static_cast<greg_t>(rc); 785 SECCOMP_RESULT(ctx) = static_cast<greg_t>(rc);
590 errno = old_errno; 786 errno = old_errno;
591 787
592 return; 788 return;
593 } 789 }
594 790
595 ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) { 791 bool Sandbox::TrapKey::operator<(const Sandbox::TrapKey& o) const {
792 if (fnc != o.fnc) {
793 return fnc < o.fnc;
794 } else if (aux != o.aux) {
795 return aux < o.aux;
796 } else {
797 return safe < o.safe;
798 }
799 }
800
801 ErrorCode Sandbox::MakeTrap(ErrorCode::TrapFnc fnc, const void *aux,
802 bool safe) {
596 // Each unique pair of TrapFnc and auxiliary data make up a distinct instance 803 // Each unique pair of TrapFnc and auxiliary data make up a distinct instance
597 // of a SECCOMP_RET_TRAP. 804 // of a SECCOMP_RET_TRAP.
598 std::pair<ErrorCode::TrapFnc, const void *> key(fnc, aux); 805 TrapKey key(fnc, aux, safe);
599 TrapIds::const_iterator iter = trapIds_.find(key); 806 TrapIds::const_iterator iter = trapIds_.find(key);
600 uint16_t id; 807 uint16_t id;
601 if (iter != trapIds_.end()) { 808 if (iter != trapIds_.end()) {
602 // We have seen this pair before. Return the same id that we assigned 809 // We have seen this pair before. Return the same id that we assigned
603 // earlier. 810 // earlier.
604 id = iter->second; 811 id = iter->second;
605 } else { 812 } else {
606 // This is a new pair. Remember it and assign a new id. 813 // This is a new pair. Remember it and assign a new id.
607 // Please note that we have to store traps in memory that doesn't get 814 // Please note that we have to store traps in memory that doesn't get
608 // deallocated when the program is shutting down. A memory leak is 815 // deallocated when the program is shutting down. A memory leak is
609 // intentional, because we might otherwise not be able to execute 816 // intentional, because we might otherwise not be able to execute
610 // system calls part way through the program shutting down 817 // system calls part way through the program shutting down
611 if (!traps_) { 818 if (!traps_) {
612 traps_ = new Traps(); 819 traps_ = new Traps();
613 } 820 }
614 if (traps_->size() >= SECCOMP_RET_DATA) { 821 if (traps_->size() >= SECCOMP_RET_DATA) {
615 // In practice, this is pretty much impossible to trigger, as there 822 // In practice, this is pretty much impossible to trigger, as there
616 // are other kernel limitations that restrict overall BPF program sizes. 823 // are other kernel limitations that restrict overall BPF program sizes.
617 SANDBOX_DIE("Too many SECCOMP_RET_TRAP callback instances"); 824 SANDBOX_DIE("Too many SECCOMP_RET_TRAP callback instances");
618 } 825 }
619 id = traps_->size() + 1; 826 id = traps_->size() + 1;
620 827
621 traps_->push_back(ErrorCode(fnc, aux, id)); 828 traps_->push_back(ErrorCode(fnc, aux, safe, id));
622 trapIds_[key] = id; 829 trapIds_[key] = id;
623 830
624 // We want to access the traps_ vector from our signal handler. But 831 // We want to access the traps_ vector from our signal handler. But
625 // we are not assured that doing so is async-signal safe. On the other 832 // we are not assured that doing so is async-signal safe. On the other
626 // hand, C++ guarantees that the contents of a vector is stored in a 833 // hand, C++ guarantees that the contents of a vector is stored in a
627 // contiguous C-style array. 834 // contiguous C-style array.
628 // So, we look up the address and size of this array outside of the 835 // So, we look up the address and size of this array outside of the
629 // signal handler, where we can safely do so. 836 // signal handler, where we can safely do so.
630 trapArray_ = &(*traps_)[0]; 837 trapArray_ = &(*traps_)[0];
631 trapArraySize_ = id; 838 trapArraySize_ = id;
839 return traps_->back();
632 } 840 }
633 841
634 ErrorCode err = ErrorCode(fnc, aux, id); 842 return ErrorCode(fnc, aux, safe, id);
635 return errMap_[err.err()] = err; 843 }
844
845 ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) {
846 return MakeTrap(fnc, aux, true /* Safe Trap */);
847 }
848
849 ErrorCode Sandbox::UnsafeTrap(ErrorCode::TrapFnc fnc, const void *aux) {
850 return MakeTrap(fnc, aux, false /* Unsafe Trap */);
851 }
852
853 intptr_t Sandbox::ForwardSyscall(const struct arch_seccomp_data& args) {
854 return SandboxSyscall(args.nr,
855 static_cast<intptr_t>(args.args[0]),
856 static_cast<intptr_t>(args.args[1]),
857 static_cast<intptr_t>(args.args[2]),
858 static_cast<intptr_t>(args.args[3]),
859 static_cast<intptr_t>(args.args[4]),
860 static_cast<intptr_t>(args.args[5]));
861 }
862
863 intptr_t Sandbox::ReturnErrno(const struct arch_seccomp_data&, void *aux) {
864 // TrapFnc functions report error by following the native kernel convention
865 // of returning an exit code in the range of -1..-4096. They do not try to
866 // set errno themselves. The glibc wrapper that triggered the SIGSYS will
867 // ultimately do so for us.
868 int err = reinterpret_cast<intptr_t>(aux) & SECCOMP_RET_DATA;
869 return -err;
636 } 870 }
637 871
638 intptr_t Sandbox::bpfFailure(const struct arch_seccomp_data&, void *aux) { 872 intptr_t Sandbox::bpfFailure(const struct arch_seccomp_data&, void *aux) {
639 SANDBOX_DIE(static_cast<char *>(aux)); 873 SANDBOX_DIE(static_cast<char *>(aux));
640 } 874 }
641 875
642 ErrorCode Sandbox::Kill(const char *msg) { 876 ErrorCode Sandbox::Kill(const char *msg) {
643 return Trap(bpfFailure, const_cast<char *>(msg)); 877 return Trap(bpfFailure, const_cast<char *>(msg));
644 } 878 }
645 879
646 Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN; 880 Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN;
647 int Sandbox::proc_fd_ = -1; 881 int Sandbox::proc_fd_ = -1;
648 Sandbox::Evaluators Sandbox::evaluators_; 882 Sandbox::Evaluators Sandbox::evaluators_;
649 Sandbox::ErrMap Sandbox::errMap_;
650 Sandbox::Traps *Sandbox::traps_ = NULL; 883 Sandbox::Traps *Sandbox::traps_ = NULL;
651 Sandbox::TrapIds Sandbox::trapIds_; 884 Sandbox::TrapIds Sandbox::trapIds_;
652 ErrorCode *Sandbox::trapArray_ = NULL; 885 ErrorCode *Sandbox::trapArray_ = NULL;
653 size_t Sandbox::trapArraySize_ = 0; 886 size_t Sandbox::trapArraySize_ = 0;
887 bool Sandbox::has_unsafe_traps_ = false;
654 888
655 } // namespace 889 } // namespace
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698