| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include <endian.h> |
| 6 #if __BYTE_ORDER == __BIG_ENDIAN |
| 7 // The BPF "struct seccomp_data" layout has to deal with storing 64bit |
| 8 // values that need to be inspected by a virtual machine that only ever |
| 9 // operates on 32bit values. The kernel developers decided how values |
| 10 // should be split into two 32bit words to achieve this goal. But at this |
| 11 // time, there is no existing BPF implementation in the kernel that uses |
| 12 // 64bit big endian values. So, all we have to go by is the consensus |
| 13 // from a discussion on LKML. Actual implementations, if and when they |
| 14 // happen, might very well differ. |
| 15 // If this code is ever going to be used with such a kernel, you should |
| 16 // disable the "#error" and carefully test the code (e.g. run the unit |
| 17 // tests). If things don't work, search for all occurrences of __BYTE_ORDER |
| 18 // and verify that the proposed implementation agrees with what the kernel |
| 19 // actually does. |
| 20 #error Big endian operation is untested and expected to be broken |
| 21 #endif |
| 22 |
| 5 #include "sandbox/linux/seccomp-bpf/codegen.h" | 23 #include "sandbox/linux/seccomp-bpf/codegen.h" |
| 6 #include "sandbox/linux/seccomp-bpf/sandbox_bpf.h" | 24 #include "sandbox/linux/seccomp-bpf/sandbox_bpf.h" |
| 25 #include "sandbox/linux/seccomp-bpf/syscall.h" |
| 7 #include "sandbox/linux/seccomp-bpf/syscall_iterator.h" | 26 #include "sandbox/linux/seccomp-bpf/syscall_iterator.h" |
| 8 #include "sandbox/linux/seccomp-bpf/verifier.h" | 27 #include "sandbox/linux/seccomp-bpf/verifier.h" |
| 9 | 28 |
| 10 namespace { | 29 namespace { |
| 11 | 30 |
| 12 void WriteFailedStderrSetupMessage(int out_fd) { | 31 void WriteFailedStderrSetupMessage(int out_fd) { |
| 13 const char* error_string = strerror(errno); | 32 const char* error_string = strerror(errno); |
| 14 static const char msg[] = "Failed to set up stderr: "; | 33 static const char msg[] = "Failed to set up stderr: "; |
| 15 if (HANDLE_EINTR(write(out_fd, msg, sizeof(msg)-1)) > 0 && error_string && | 34 if (HANDLE_EINTR(write(out_fd, msg, sizeof(msg)-1)) > 0 && error_string && |
| 16 HANDLE_EINTR(write(out_fd, error_string, strlen(error_string))) > 0 && | 35 HANDLE_EINTR(write(out_fd, error_string, strlen(error_string))) > 0 && |
| 17 HANDLE_EINTR(write(out_fd, "\n", 1))) { | 36 HANDLE_EINTR(write(out_fd, "\n", 1))) { |
| 18 } | 37 } |
| 19 } | 38 } |
| 20 | 39 |
| 40 // We need to tell whether we are performing a "normal" callback, or |
| 41 // whether we were called recursively from within a UnsafeTrap() callback. |
| 42 // This is a little tricky to do, because we need to somehow get access to |
| 43 // per-thread data from within a signal context. Normal TLS storage is not |
| 44 // safely accessible at this time. We could roll our own, but that involves |
| 45 // a lot of complexity. Instead, we co-opt one bit in the signal mask. |
| 46 // If SIGBUS is blocked, we assume that we have been called recursively. |
| 47 // There is a possibility for collision with other code that needs to do |
| 48 // this, but in practice the risks are low. |
| 49 // If SIGBUS turns out to be a problem, we could instead co-opt one of the |
| 50 // realtime signals. There are plenty of them. Unfortunately, there is no |
| 51 // way to mark a signal as allocated. So, the potential for collision is |
| 52 // possibly even worse. |
| 53 bool GetIsInSigHandler(const ucontext_t *ctx) { |
| 54 return sigismember(&ctx->uc_sigmask, SIGBUS); |
| 55 } |
| 56 |
| 57 void SetIsInSigHandler() { |
| 58 sigset_t mask; |
| 59 sigemptyset(&mask); |
| 60 sigaddset(&mask, SIGBUS); |
| 61 sigprocmask(SIG_BLOCK, &mask, NULL); |
| 62 } |
| 63 |
| 21 } // namespace | 64 } // namespace |
| 22 | 65 |
| 23 // The kernel gives us a sandbox, we turn it into a playground :-) | 66 // The kernel gives us a sandbox, we turn it into a playground :-) |
| 24 // This is version 2 of the playground; version 1 was built on top of | 67 // This is version 2 of the playground; version 1 was built on top of |
| 25 // pre-BPF seccomp mode. | 68 // pre-BPF seccomp mode. |
| 26 namespace playground2 { | 69 namespace playground2 { |
| 27 | 70 |
| 28 const int kExpectedExitCode = 100; | 71 const int kExpectedExitCode = 100; |
| 29 | 72 |
| 30 // We define a really simple sandbox policy. It is just good enough for us | 73 // We define a really simple sandbox policy. It is just good enough for us |
| (...skipping 281 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 312 for (SyscallIterator iter(true); !iter.Done(); ) { | 355 for (SyscallIterator iter(true); !iter.Done(); ) { |
| 313 uint32_t sysnum = iter.Next(); | 356 uint32_t sysnum = iter.Next(); |
| 314 if (!isDenied(syscallEvaluator(sysnum, aux))) { | 357 if (!isDenied(syscallEvaluator(sysnum, aux))) { |
| 315 SANDBOX_DIE("Policies should deny system calls that are outside the " | 358 SANDBOX_DIE("Policies should deny system calls that are outside the " |
| 316 "expected range (typically MIN_SYSCALL..MAX_SYSCALL)"); | 359 "expected range (typically MIN_SYSCALL..MAX_SYSCALL)"); |
| 317 } | 360 } |
| 318 } | 361 } |
| 319 return; | 362 return; |
| 320 } | 363 } |
| 321 | 364 |
| 365 void Sandbox::CheckForUnsafeErrorCodes(Instruction *insn, void *aux) { |
| 366 if (BPF_CLASS(insn->code) == BPF_RET && |
| 367 insn->k > SECCOMP_RET_TRAP && |
| 368 insn->k - SECCOMP_RET_TRAP <= trapArraySize_) { |
| 369 const ErrorCode& err = trapArray_[insn->k - SECCOMP_RET_TRAP - 1]; |
| 370 if (!err.safe_) { |
| 371 bool *is_unsafe = static_cast<bool *>(aux); |
| 372 *is_unsafe = true; |
| 373 } |
| 374 } |
| 375 } |
| 376 |
| 377 void Sandbox::RedirectToUserspace(Instruction *insn, void *aux) { |
| 378 // When inside an UnsafeTrap() callback, we want to allow all system calls. |
| 379 // This means, we must conditionally disable the sandbox -- and that's not |
| 380 // something that kernel-side BPF filters can do, as they cannot inspect |
| 381 // any state other than the syscall arguments. |
| 382 // But if we redirect all error handlers to user-space, then we can easily |
| 383 // make this decision. |
| 384 // The performance penalty for this extra round-trip to user-space is not |
| 385 // actually that bad, as we only ever pay it for denied system calls; and a |
| 386 // typical program has very few of these. |
| 387 if (BPF_CLASS(insn->code) == BPF_RET && |
| 388 (insn->k & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO) { |
| 389 insn->k = Trap(ReturnErrno, |
| 390 reinterpret_cast<void *>(insn->k & SECCOMP_RET_DATA)).err(); |
| 391 } |
| 392 } |
| 393 |
| 394 ErrorCode Sandbox::RedirectToUserspaceEvalWrapper(int sysnum, void *aux) { |
| 395 // We need to replicate the behavior of RedirectToUserspace(), so that our |
| 396 // Verifier can still work correctly. |
| 397 Evaluators *evaluators = reinterpret_cast<Evaluators *>(aux); |
| 398 const std::pair<EvaluateSyscall, void *>& evaluator = *evaluators->begin(); |
| 399 ErrorCode err = evaluator.first(sysnum, evaluator.second); |
| 400 if ((err.err() & SECCOMP_RET_ACTION) == SECCOMP_RET_ERRNO) { |
| 401 return Trap(ReturnErrno, |
| 402 reinterpret_cast<void *>(err.err() & SECCOMP_RET_DATA)); |
| 403 } |
| 404 return err; |
| 405 } |
| 406 |
| 322 void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator, void *aux) { | 407 void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator, void *aux) { |
| 323 if (status_ == STATUS_ENABLED) { | 408 if (status_ == STATUS_ENABLED) { |
| 324 SANDBOX_DIE("Cannot change policy after sandbox has started"); | 409 SANDBOX_DIE("Cannot change policy after sandbox has started"); |
| 325 } | 410 } |
| 326 policySanityChecks(syscallEvaluator, aux); | 411 policySanityChecks(syscallEvaluator, aux); |
| 327 evaluators_.push_back(std::make_pair(syscallEvaluator, aux)); | 412 evaluators_.push_back(std::make_pair(syscallEvaluator, aux)); |
| 328 } | 413 } |
| 329 | 414 |
| 330 void Sandbox::installFilter(bool quiet) { | 415 void Sandbox::installFilter(bool quiet) { |
| 331 // Verify that the user pushed a policy. | 416 // Verify that the user pushed a policy. |
| 332 if (evaluators_.empty()) { | 417 if (evaluators_.empty()) { |
| 333 filter_failed: | 418 filter_failed: |
| 334 SANDBOX_DIE("Failed to configure system call filters"); | 419 SANDBOX_DIE("Failed to configure system call filters"); |
| 335 } | 420 } |
| 336 | 421 |
| 337 // Set new SIGSYS handler | 422 // Set new SIGSYS handler |
| 338 struct sigaction sa; | 423 struct sigaction sa; |
| 339 memset(&sa, 0, sizeof(sa)); | 424 memset(&sa, 0, sizeof(sa)); |
| 340 sa.sa_sigaction = &sigSys; | 425 sa.sa_sigaction = sigSys; |
| 341 sa.sa_flags = SA_SIGINFO; | 426 sa.sa_flags = SA_SIGINFO | SA_NODEFER; |
| 342 if (sigaction(SIGSYS, &sa, NULL) < 0) { | 427 if (sigaction(SIGSYS, &sa, NULL) < 0) { |
| 343 goto filter_failed; | 428 goto filter_failed; |
| 344 } | 429 } |
| 345 | 430 |
| 346 // Unmask SIGSYS | 431 // Unmask SIGSYS |
| 347 sigset_t mask; | 432 sigset_t mask; |
| 348 if (sigemptyset(&mask) || | 433 if (sigemptyset(&mask) || |
| 349 sigaddset(&mask, SIGSYS) || | 434 sigaddset(&mask, SIGSYS) || |
| 350 sigprocmask(SIG_UNBLOCK, &mask, NULL)) { | 435 sigprocmask(SIG_UNBLOCK, &mask, NULL)) { |
| 351 goto filter_failed; | 436 goto filter_failed; |
| (...skipping 10 matching lines...) Expand all Loading... |
| 362 if (!gen) { | 447 if (!gen) { |
| 363 SANDBOX_DIE("Out of memory"); | 448 SANDBOX_DIE("Out of memory"); |
| 364 } | 449 } |
| 365 | 450 |
| 366 // If the architecture doesn't match SECCOMP_ARCH, disallow the | 451 // If the architecture doesn't match SECCOMP_ARCH, disallow the |
| 367 // system call. | 452 // system call. |
| 368 Instruction *tail; | 453 Instruction *tail; |
| 369 Instruction *head = | 454 Instruction *head = |
| 370 gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, | 455 gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, |
| 371 offsetof(struct arch_seccomp_data, arch), | 456 offsetof(struct arch_seccomp_data, arch), |
| 457 tail = |
| 372 gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, | 458 gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, |
| 373 tail = | 459 NULL, |
| 374 // Grab the system call number, so that we can implement jump tables. | |
| 375 gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, | |
| 376 offsetof(struct arch_seccomp_data, nr)), | |
| 377 gen->MakeInstruction(BPF_RET+BPF_K, | 460 gen->MakeInstruction(BPF_RET+BPF_K, |
| 378 Kill( | 461 Kill( |
| 379 "Invalid audit architecture in BPF filter").err_))); | 462 "Invalid audit architecture in BPF filter").err_))); |
| 380 | 463 |
| 381 // On Intel architectures, verify that system call numbers are in the | |
| 382 // expected number range. The older i386 and x86-64 APIs clear bit 30 | |
| 383 // on all system calls. The newer x32 API always sets bit 30. | |
| 384 #if defined(__i386__) || defined(__x86_64__) | |
| 385 Instruction *invalidX32 = | |
| 386 gen->MakeInstruction(BPF_RET+BPF_K, | |
| 387 Kill("Illegal mixing of system call ABIs").err_); | |
| 388 Instruction *checkX32 = | |
| 389 #if defined(__x86_64__) && defined(__ILP32__) | |
| 390 gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, invalidX32); | |
| 391 #else | |
| 392 gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, invalidX32, 0); | |
| 393 #endif | |
| 394 gen->JoinInstructions(tail, checkX32); | |
| 395 tail = checkX32; | |
| 396 #endif | |
| 397 | |
| 398 | |
| 399 { | 464 { |
| 400 // Evaluate all possible system calls and group their ErrorCodes into | 465 // Evaluate all possible system calls and group their ErrorCodes into |
| 401 // ranges of identical codes. | 466 // ranges of identical codes. |
| 402 Ranges ranges; | 467 Ranges ranges; |
| 403 findRanges(&ranges); | 468 findRanges(&ranges); |
| 404 | 469 |
| 405 // Compile the system call ranges to an optimized BPF jumptable | 470 // Compile the system call ranges to an optimized BPF jumptable |
| 406 Instruction *jumptable = | 471 Instruction *jumptable = |
| 407 assembleJumpTable(gen, ranges.begin(), ranges.end()); | 472 assembleJumpTable(gen, ranges.begin(), ranges.end()); |
| 408 | 473 |
| 474 // If there is at least one UnsafeTrap() in our program, the entire sandbox |
| 475 // is unsafe. We need to modify the program so that all non- |
| 476 // SECCOMP_RET_ALLOW ErrorCodes are handled in user-space. This will then |
| 477 // allow us to temporarily disable sandboxing rules inside of callbacks to |
| 478 // UnsafeTrap(). |
| 479 has_unsafe_traps_ = false; |
| 480 gen->Traverse(jumptable, CheckForUnsafeErrorCodes, &has_unsafe_traps_); |
| 481 |
| 482 // Grab the system call number, so that we can implement jump tables. |
| 483 Instruction *load_nr = |
| 484 gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, |
| 485 offsetof(struct arch_seccomp_data, nr)); |
| 486 |
| 487 // If our BPF program has unsafe jumps, enable support for them. This |
| 488 // test happens very early in the BPF filter program. Even before we |
| 489 // consider looking at system call numbers. |
| 490 // As support for unsafe jumps essentially defeats all the security |
| 491 // measures that the sandbox provides, we print a big warning message -- |
| 492 // and of course, we make sure to only ever enable this feature if it |
| 493 // is actually requested by the sandbox policy. |
| 494 if (has_unsafe_traps_) { |
| 495 if (SandboxSyscall(-1) == -1 && errno == ENOSYS) { |
| 496 SANDBOX_DIE("Support for UnsafeTrap() has not yet been ported to this " |
| 497 "architecture"); |
| 498 } |
| 499 |
| 500 EvaluateSyscall evaluateSyscall = evaluators_.begin()->first; |
| 501 void *aux = evaluators_.begin()->second; |
| 502 if (!evaluateSyscall(__NR_rt_sigprocmask, aux). |
| 503 Equals(ErrorCode(ErrorCode::ERR_ALLOWED)) || |
| 504 !evaluateSyscall(__NR_rt_sigreturn, aux). |
| 505 Equals(ErrorCode(ErrorCode::ERR_ALLOWED)) |
| 506 #if defined(__NR_sigprocmask) |
| 507 || !evaluateSyscall(__NR_sigprocmask, aux). |
| 508 Equals(ErrorCode(ErrorCode::ERR_ALLOWED)) |
| 509 #endif |
| 510 #if defined(__NR_sigreturn) |
| 511 || !evaluateSyscall(__NR_sigreturn, aux). |
| 512 Equals(ErrorCode(ErrorCode::ERR_ALLOWED)) |
| 513 #endif |
| 514 ) { |
| 515 SANDBOX_DIE("Invalid seccomp policy; if using UnsafeTrap(), you must " |
| 516 "unconditionally allow sigreturn() and sigprocmask()"); |
| 517 } |
| 518 |
| 519 SANDBOX_INFO("WARNING! Disabling sandbox for debugging purposes"); |
| 520 gen->Traverse(jumptable, RedirectToUserspace, NULL); |
| 521 |
| 522 // Allow system calls, if they originate from our magic return address |
| 523 // (which we can query by calling SandboxSyscall(-1)). |
| 524 uintptr_t syscall_entry_point = |
| 525 static_cast<uintptr_t>(SandboxSyscall(-1)); |
| 526 uint32_t low = static_cast<uint32_t>(syscall_entry_point); |
| 527 #if __SIZEOF_POINTER__ > 4 |
| 528 uint32_t hi = static_cast<uint32_t>(syscall_entry_point >> 32); |
| 529 #endif |
| 530 |
| 531 // BPF cannot do native 64bit comparisons. On 64bit architectures, we |
| 532 // have to compare both 32bit halves of the instruction pointer. If they |
| 533 // match what we expect, we return ERR_ALLOWED. If either or both don't |
| 534 // match, we continue evaluating the rest of the sandbox policy. |
| 535 Instruction *escape_hatch = |
| 536 gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, |
| 537 offsetof(struct arch_seccomp_data, |
| 538 instruction_pointer) + |
| 539 (__SIZEOF_POINTER__ > 4 && |
| 540 __BYTE_ORDER == __BIG_ENDIAN ? 4 : 0), |
| 541 gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, low, |
| 542 #if __SIZEOF_POINTER__ > 4 |
| 543 gen->MakeInstruction(BPF_LD+BPF_W+BPF_ABS, |
| 544 offsetof(struct arch_seccomp_data, |
| 545 instruction_pointer) + |
| 546 (__BYTE_ORDER == __BIG_ENDIAN ? 0 : 4), |
| 547 gen->MakeInstruction(BPF_JMP+BPF_JEQ+BPF_K, hi, |
| 548 #endif |
| 549 gen->MakeInstruction(BPF_RET+BPF_K, ErrorCode(ErrorCode::ERR_ALLOWED)), |
| 550 #if __SIZEOF_POINTER__ > 4 |
| 551 load_nr)), |
| 552 #endif |
| 553 load_nr)); |
| 554 gen->JoinInstructions(tail, escape_hatch); |
| 555 } else { |
| 556 gen->JoinInstructions(tail, load_nr); |
| 557 } |
| 558 tail = load_nr; |
| 559 |
| 560 // On Intel architectures, verify that system call numbers are in the |
| 561 // expected number range. The older i386 and x86-64 APIs clear bit 30 |
| 562 // on all system calls. The newer x32 API always sets bit 30. |
| 563 #if defined(__i386__) || defined(__x86_64__) |
| 564 Instruction *invalidX32 = |
| 565 gen->MakeInstruction(BPF_RET+BPF_K, |
| 566 Kill("Illegal mixing of system call ABIs").err_); |
| 567 Instruction *checkX32 = |
| 568 #if defined(__x86_64__) && defined(__ILP32__) |
| 569 gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, invalidX32); |
| 570 #else |
| 571 gen->MakeInstruction(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, invalidX32, 0); |
| 572 #endif |
| 573 gen->JoinInstructions(tail, checkX32); |
| 574 tail = checkX32; |
| 575 #endif |
| 576 |
| 409 // Append jump table to our pre-amble | 577 // Append jump table to our pre-amble |
| 410 gen->JoinInstructions(tail, jumptable); | 578 gen->JoinInstructions(tail, jumptable); |
| 411 } | 579 } |
| 412 | 580 |
| 413 // Turn the DAG into a vector of instructions. | 581 // Turn the DAG into a vector of instructions. |
| 414 Program *program = new Program(); | 582 Program *program = new Program(); |
| 415 gen->Compile(head, program); | 583 gen->Compile(head, program); |
| 416 delete gen; | 584 delete gen; |
| 417 | 585 |
| 418 // Make sure compilation resulted in BPF program that executes | 586 // Make sure compilation resulted in BPF program that executes |
| 419 // correctly. Otherwise, there is an internal error in our BPF compiler. | 587 // correctly. Otherwise, there is an internal error in our BPF compiler. |
| 420 // There is really nothing the caller can do until the bug is fixed. | 588 // There is really nothing the caller can do until the bug is fixed. |
| 421 #ifndef NDEBUG | 589 #ifndef NDEBUG |
| 422 const char *err = NULL; | 590 { |
| 423 if (!Verifier::VerifyBPF(*program, evaluators_, &err)) { | 591 // If we previously rewrote the BPF program so that it calls user-space |
| 424 SANDBOX_DIE(err); | 592 // whenever we return an "errno" value from the filter, then we have to |
| 593 // wrap our system call evaluator to perform the same operation. Otherwise, |
| 594 // the verifier would also report a mismatch in return codes. |
| 595 Evaluators redirected_evaluators; |
| 596 redirected_evaluators.push_back( |
| 597 std::make_pair(RedirectToUserspaceEvalWrapper, &evaluators_)); |
| 598 |
| 599 const char *err = NULL; |
| 600 if (!Verifier::VerifyBPF( |
| 601 *program, |
| 602 has_unsafe_traps_ ? redirected_evaluators : evaluators_, |
| 603 &err)) { |
| 604 SANDBOX_DIE(err); |
| 605 } |
| 425 } | 606 } |
| 426 #endif | 607 #endif |
| 427 | 608 |
| 428 // We want to be very careful in not imposing any requirements on the | 609 // We want to be very careful in not imposing any requirements on the |
| 429 // policies that are set with setSandboxPolicy(). This means, as soon as | 610 // policies that are set with setSandboxPolicy(). This means, as soon as |
| 430 // the sandbox is active, we shouldn't be relying on libraries that could | 611 // the sandbox is active, we shouldn't be relying on libraries that could |
| 431 // be making system calls. This, for example, means we should avoid | 612 // be making system calls. This, for example, means we should avoid |
| 432 // using the heap and we should avoid using STL functions. | 613 // using the heap and we should avoid using STL functions. |
| 433 // Temporarily copy the contents of the "program" vector into a | 614 // Temporarily copy the contents of the "program" vector into a |
| 434 // stack-allocated array; and then explicitly destroy that object. | 615 // stack-allocated array; and then explicitly destroy that object. |
| 435 // This makes sure we don't ex- or implicitly call new/delete after we | 616 // This makes sure we don't ex- or implicitly call new/delete after we |
| 436 // installed the BPF filter program in the kernel. Depending on the | 617 // installed the BPF filter program in the kernel. Depending on the |
| 437 // system memory allocator that is in effect, these operators can result | 618 // system memory allocator that is in effect, these operators can result |
| 438 // in system calls to things like munmap() or brk(). | 619 // in system calls to things like munmap() or brk(). |
| 439 struct sock_filter bpf[program->size()]; | 620 struct sock_filter bpf[program->size()]; |
| 440 const struct sock_fprog prog = { | 621 const struct sock_fprog prog = { |
| 441 static_cast<unsigned short>(program->size()), bpf }; | 622 static_cast<unsigned short>(program->size()), bpf }; |
| 442 memcpy(bpf, &(*program)[0], sizeof(bpf)); | 623 memcpy(bpf, &(*program)[0], sizeof(bpf)); |
| 443 delete program; | 624 delete program; |
| 444 | 625 |
| 445 // Release memory that is no longer needed | 626 // Release memory that is no longer needed |
| 446 evaluators_.clear(); | 627 evaluators_.clear(); |
| 447 errMap_.clear(); | |
| 448 | 628 |
| 449 #if defined(SECCOMP_BPF_VALGRIND_HACKS) | 629 #if defined(SECCOMP_BPF_VALGRIND_HACKS) |
| 450 // Valgrind is really not happy about our sandbox. Disable it when running | 630 // Valgrind is really not happy about our sandbox. Disable it when running |
| 451 // in Valgrind. This feature is dangerous and should never be enabled by | 631 // in Valgrind. This feature is dangerous and should never be enabled by |
| 452 // default. We protect it behind a pre-processor option. | 632 // default. We protect it behind a pre-processor option. |
| 453 if (!RUNNING_ON_VALGRIND) | 633 if (!RUNNING_ON_VALGRIND) |
| 454 #endif | 634 #endif |
| 455 { | 635 { |
| 456 // Install BPF filter program | 636 // Install BPF filter program |
| 457 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { | 637 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { |
| (...skipping 96 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 554 struct arch_sigsys sigsys; | 734 struct arch_sigsys sigsys; |
| 555 memcpy(&sigsys, &info->_sifields, sizeof(sigsys)); | 735 memcpy(&sigsys, &info->_sifields, sizeof(sigsys)); |
| 556 | 736 |
| 557 // Some more sanity checks. | 737 // Some more sanity checks. |
| 558 if (sigsys.ip != reinterpret_cast<void *>(SECCOMP_IP(ctx)) || | 738 if (sigsys.ip != reinterpret_cast<void *>(SECCOMP_IP(ctx)) || |
| 559 sigsys.nr != static_cast<int>(SECCOMP_SYSCALL(ctx)) || | 739 sigsys.nr != static_cast<int>(SECCOMP_SYSCALL(ctx)) || |
| 560 sigsys.arch != SECCOMP_ARCH) { | 740 sigsys.arch != SECCOMP_ARCH) { |
| 561 goto sigsys_err; | 741 goto sigsys_err; |
| 562 } | 742 } |
| 563 | 743 |
| 564 // Copy the seccomp-specific data into an arch_seccomp_data structure. This | 744 intptr_t rc; |
| 565 // is what we are showing to TrapFnc callbacks that the system call evaluator | 745 if (has_unsafe_traps_ && GetIsInSigHandler(ctx)) { |
| 566 // registered with the sandbox. | 746 errno = old_errno; |
| 567 struct arch_seccomp_data data = { | 747 if (sigsys.nr == __NR_clone) { |
| 568 sigsys.nr, | 748 SANDBOX_DIE("Cannot call clone() from an UnsafeTrap() handler"); |
| 569 SECCOMP_ARCH, | |
| 570 reinterpret_cast<uint64_t>(sigsys.ip), | |
| 571 { | |
| 572 static_cast<uint64_t>(SECCOMP_PARM1(ctx)), | |
| 573 static_cast<uint64_t>(SECCOMP_PARM2(ctx)), | |
| 574 static_cast<uint64_t>(SECCOMP_PARM3(ctx)), | |
| 575 static_cast<uint64_t>(SECCOMP_PARM4(ctx)), | |
| 576 static_cast<uint64_t>(SECCOMP_PARM5(ctx)), | |
| 577 static_cast<uint64_t>(SECCOMP_PARM6(ctx)) | |
| 578 } | 749 } |
| 579 }; | 750 rc = SandboxSyscall(sigsys.nr, |
| 751 SECCOMP_PARM1(ctx), SECCOMP_PARM2(ctx), |
| 752 SECCOMP_PARM3(ctx), SECCOMP_PARM4(ctx), |
| 753 SECCOMP_PARM5(ctx), SECCOMP_PARM6(ctx)); |
| 754 } else { |
| 755 const ErrorCode& err = trapArray_[info->si_errno - 1]; |
| 756 if (!err.safe_) { |
| 757 SetIsInSigHandler(); |
| 758 } |
| 580 | 759 |
| 581 // Now call the TrapFnc callback associated with this particular instance | 760 // Copy the seccomp-specific data into an arch_seccomp_data structure. This |
| 582 // of SECCOMP_RET_TRAP. | 761 // is what we are showing to TrapFnc callbacks that the system call |
| 583 const ErrorCode& err = trapArray_[info->si_errno - 1]; | 762 // evaluator registered with the sandbox. |
| 584 intptr_t rc = err.fnc_(data, err.aux_); | 763 struct arch_seccomp_data data = { |
| 764 sigsys.nr, |
| 765 SECCOMP_ARCH, |
| 766 reinterpret_cast<uint64_t>(sigsys.ip), |
| 767 { |
| 768 static_cast<uint64_t>(SECCOMP_PARM1(ctx)), |
| 769 static_cast<uint64_t>(SECCOMP_PARM2(ctx)), |
| 770 static_cast<uint64_t>(SECCOMP_PARM3(ctx)), |
| 771 static_cast<uint64_t>(SECCOMP_PARM4(ctx)), |
| 772 static_cast<uint64_t>(SECCOMP_PARM5(ctx)), |
| 773 static_cast<uint64_t>(SECCOMP_PARM6(ctx)) |
| 774 } |
| 775 }; |
| 776 |
| 777 // Now call the TrapFnc callback associated with this particular instance |
| 778 // of SECCOMP_RET_TRAP. |
| 779 rc = err.fnc_(data, err.aux_); |
| 780 } |
| 585 | 781 |
| 586 // Update the CPU register that stores the return code of the system call | 782 // Update the CPU register that stores the return code of the system call |
| 587 // that we just handled, and restore "errno" to the value that it had | 783 // that we just handled, and restore "errno" to the value that it had |
| 588 // before entering the signal handler. | 784 // before entering the signal handler. |
| 589 SECCOMP_RESULT(ctx) = static_cast<greg_t>(rc); | 785 SECCOMP_RESULT(ctx) = static_cast<greg_t>(rc); |
| 590 errno = old_errno; | 786 errno = old_errno; |
| 591 | 787 |
| 592 return; | 788 return; |
| 593 } | 789 } |
| 594 | 790 |
| 595 ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) { | 791 bool Sandbox::TrapKey::operator<(const Sandbox::TrapKey& o) const { |
| 792 if (fnc != o.fnc) { |
| 793 return fnc < o.fnc; |
| 794 } else if (aux != o.aux) { |
| 795 return aux < o.aux; |
| 796 } else { |
| 797 return safe < o.safe; |
| 798 } |
| 799 } |
| 800 |
| 801 ErrorCode Sandbox::MakeTrap(ErrorCode::TrapFnc fnc, const void *aux, |
| 802 bool safe) { |
| 596 // Each unique pair of TrapFnc and auxiliary data make up a distinct instance | 803 // Each unique pair of TrapFnc and auxiliary data make up a distinct instance |
| 597 // of a SECCOMP_RET_TRAP. | 804 // of a SECCOMP_RET_TRAP. |
| 598 std::pair<ErrorCode::TrapFnc, const void *> key(fnc, aux); | 805 TrapKey key(fnc, aux, safe); |
| 599 TrapIds::const_iterator iter = trapIds_.find(key); | 806 TrapIds::const_iterator iter = trapIds_.find(key); |
| 600 uint16_t id; | 807 uint16_t id; |
| 601 if (iter != trapIds_.end()) { | 808 if (iter != trapIds_.end()) { |
| 602 // We have seen this pair before. Return the same id that we assigned | 809 // We have seen this pair before. Return the same id that we assigned |
| 603 // earlier. | 810 // earlier. |
| 604 id = iter->second; | 811 id = iter->second; |
| 605 } else { | 812 } else { |
| 606 // This is a new pair. Remember it and assign a new id. | 813 // This is a new pair. Remember it and assign a new id. |
| 607 // Please note that we have to store traps in memory that doesn't get | 814 // Please note that we have to store traps in memory that doesn't get |
| 608 // deallocated when the program is shutting down. A memory leak is | 815 // deallocated when the program is shutting down. A memory leak is |
| 609 // intentional, because we might otherwise not be able to execute | 816 // intentional, because we might otherwise not be able to execute |
| 610 // system calls part way through the program shutting down | 817 // system calls part way through the program shutting down |
| 611 if (!traps_) { | 818 if (!traps_) { |
| 612 traps_ = new Traps(); | 819 traps_ = new Traps(); |
| 613 } | 820 } |
| 614 if (traps_->size() >= SECCOMP_RET_DATA) { | 821 if (traps_->size() >= SECCOMP_RET_DATA) { |
| 615 // In practice, this is pretty much impossible to trigger, as there | 822 // In practice, this is pretty much impossible to trigger, as there |
| 616 // are other kernel limitations that restrict overall BPF program sizes. | 823 // are other kernel limitations that restrict overall BPF program sizes. |
| 617 SANDBOX_DIE("Too many SECCOMP_RET_TRAP callback instances"); | 824 SANDBOX_DIE("Too many SECCOMP_RET_TRAP callback instances"); |
| 618 } | 825 } |
| 619 id = traps_->size() + 1; | 826 id = traps_->size() + 1; |
| 620 | 827 |
| 621 traps_->push_back(ErrorCode(fnc, aux, id)); | 828 traps_->push_back(ErrorCode(fnc, aux, safe, id)); |
| 622 trapIds_[key] = id; | 829 trapIds_[key] = id; |
| 623 | 830 |
| 624 // We want to access the traps_ vector from our signal handler. But | 831 // We want to access the traps_ vector from our signal handler. But |
| 625 // we are not assured that doing so is async-signal safe. On the other | 832 // we are not assured that doing so is async-signal safe. On the other |
| 626 // hand, C++ guarantees that the contents of a vector is stored in a | 833 // hand, C++ guarantees that the contents of a vector is stored in a |
| 627 // contiguous C-style array. | 834 // contiguous C-style array. |
| 628 // So, we look up the address and size of this array outside of the | 835 // So, we look up the address and size of this array outside of the |
| 629 // signal handler, where we can safely do so. | 836 // signal handler, where we can safely do so. |
| 630 trapArray_ = &(*traps_)[0]; | 837 trapArray_ = &(*traps_)[0]; |
| 631 trapArraySize_ = id; | 838 trapArraySize_ = id; |
| 839 return traps_->back(); |
| 632 } | 840 } |
| 633 | 841 |
| 634 ErrorCode err = ErrorCode(fnc, aux, id); | 842 return ErrorCode(fnc, aux, safe, id); |
| 635 return errMap_[err.err()] = err; | 843 } |
| 844 |
| 845 ErrorCode Sandbox::Trap(ErrorCode::TrapFnc fnc, const void *aux) { |
| 846 return MakeTrap(fnc, aux, true /* Safe Trap */); |
| 847 } |
| 848 |
| 849 ErrorCode Sandbox::UnsafeTrap(ErrorCode::TrapFnc fnc, const void *aux) { |
| 850 return MakeTrap(fnc, aux, false /* Unsafe Trap */); |
| 851 } |
| 852 |
| 853 intptr_t Sandbox::ForwardSyscall(const struct arch_seccomp_data& args) { |
| 854 return SandboxSyscall(args.nr, |
| 855 args.args[0], args.args[1], args.args[2], |
| 856 args.args[3], args.args[4], args.args[5]); |
| 857 } |
| 858 |
| 859 intptr_t Sandbox::ReturnErrno(const struct arch_seccomp_data&, void *aux) { |
| 860 // TrapFnc functions report error by following the native kernel convention |
| 861 // of returning an exit code in the range of -1..-4096. They do not try to |
| 862 // set errno themselves. The glibc wrapper that triggered the SIGSYS will |
| 863 // ultimately do so for us. |
| 864 int err = reinterpret_cast<intptr_t>(aux) & SECCOMP_RET_DATA; |
| 865 return -err; |
| 636 } | 866 } |
| 637 | 867 |
| 638 intptr_t Sandbox::bpfFailure(const struct arch_seccomp_data&, void *aux) { | 868 intptr_t Sandbox::bpfFailure(const struct arch_seccomp_data&, void *aux) { |
| 639 SANDBOX_DIE(static_cast<char *>(aux)); | 869 SANDBOX_DIE(static_cast<char *>(aux)); |
| 640 } | 870 } |
| 641 | 871 |
| 642 ErrorCode Sandbox::Kill(const char *msg) { | 872 ErrorCode Sandbox::Kill(const char *msg) { |
| 643 return Trap(bpfFailure, const_cast<char *>(msg)); | 873 return Trap(bpfFailure, const_cast<char *>(msg)); |
| 644 } | 874 } |
| 645 | 875 |
| 646 Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN; | 876 Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN; |
| 647 int Sandbox::proc_fd_ = -1; | 877 int Sandbox::proc_fd_ = -1; |
| 648 Sandbox::Evaluators Sandbox::evaluators_; | 878 Sandbox::Evaluators Sandbox::evaluators_; |
| 649 Sandbox::ErrMap Sandbox::errMap_; | |
| 650 Sandbox::Traps *Sandbox::traps_ = NULL; | 879 Sandbox::Traps *Sandbox::traps_ = NULL; |
| 651 Sandbox::TrapIds Sandbox::trapIds_; | 880 Sandbox::TrapIds Sandbox::trapIds_; |
| 652 ErrorCode *Sandbox::trapArray_ = NULL; | 881 ErrorCode *Sandbox::trapArray_ = NULL; |
| 653 size_t Sandbox::trapArraySize_ = 0; | 882 size_t Sandbox::trapArraySize_ = 0; |
| 883 bool Sandbox::has_unsafe_traps_ = false; |
| 654 | 884 |
| 655 } // namespace | 885 } // namespace |
| OLD | NEW |