sandbox/linux/seccomp_bpf/sandbox_bpf.cc - Issue 10458040: Initial snapshot of the new BPF-enabled seccomp sandbox. This code is

Side by Side Diff: sandbox/linux/seccomp_bpf/sandbox_bpf.cc

Issue 10458040: Initial snapshot of the new BPF-enabled seccomp sandbox. This code is (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: Created 8 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "sandbox/linux/seccomp_bpf/sandbox_bpf.h"

	6

	7 // The kernel gives us a sandbox, we turn it into a playground :-)

	8 // This is version 2 of the playground; version 1 was built on top of

	9 // pre-BPF seccomp mode.

	10 namespace playground2 {

	11

	12 Sandbox::ErrorCode Sandbox::probeEvaluator(int signo) {

	13 switch (signo) {

	14 case __NR_getpid:

	15 // Return EPERM so that we can check that the filter actually ran.

	16 return (ErrorCode)EPERM;

	17 case __NR_exit_group:

	18 // Allow exit() with a non-default return code.

	19 return SB_ALLOWED;

	20 default:

	21 // Make everything else fail in an easily recognizable way.

	22 return (ErrorCode)EINVAL;

	23 }

	24 }

	25

	26 bool Sandbox::kernelSupportSeccompBPF(int proc_fd) {

	27 // Block all signals before forking a child process. This prevents an

	28 // attacker from manipulating our test by sending us an unexpected signal.

	29 sigset_t oldMask, newMask;

	30 if (sigfillset(&newMask) \|\|

	31 sigprocmask(SIG_BLOCK, &newMask, &oldMask)) {

	32 die("sigprocmask() failed");

	33 }

	34

	35 pid_t pid = fork();

	36 if (pid < 0) {

	37 // Die if we cannot fork(). We would probably fail a little later

	38 // anyway, as the machine is likely very close to running out of

	39 // memory.

	40 // But what we don't want to do is return "false", as a crafty

	41 // attacker might cause fork() to fail at will and could trick us

	42 // into running without a sandbox.

	43 sigprocmask(SIG_SETMASK, &oldMask, NULL); // OK, if it fails

	44 die("fork() failed unexpectedly");

	45 }

	46

	47 // In the child process

	48 if (!pid) {

	49 // Test a very simple sandbox policy to verify that we can

	50 // successfully turn on sandboxing.

	51 suppressLogging_ = true;

	52 evaluators_.clear();

	53 setSandboxPolicy(probeEvaluator, NULL);

	54 setProcFd(proc_fd);

	55 startSandbox();

	56 if (syscall(__NR_getpid) < 0 && errno == EPERM) {

	57 syscall(__NR_exit_group, (intptr_t)100);

	58 }

	59 die(NULL);

	60 }

	61

	62 // In the parent process

	63 if (sigprocmask(SIG_SETMASK, &oldMask, NULL)) {

	64 die("sigprocmask() failed");

	65 }

	66 int status;

	67 if (HANDLE_EINTR(waitpid(pid, &status, 0)) != pid) {

	68 die("waitpid() failed unexpectedly");

	69 }

	70 return WIFEXITED(status) && WEXITSTATUS(status) == 100;

	71 }

	72

	73 Sandbox::SandboxStatus Sandbox::supportsSeccompSandbox(int proc_fd) {

	74 // It the sandbox is currently active, we clearly must have support for

	75 // sandboxing.

	76 if (status_ == STATUS_ENABLED) {

	77 return status_;

	78 }

	79

	80 // Even if the sandbox was previously available, something might have

	81 // changed in our run-time environment. Check one more time.

	82 if (status_ == STATUS_AVAILABLE) {

	83 if (!isSingleThreaded(proc_fd)) {

	84 status_ = STATUS_UNAVAILABLE;

	85 }

	86 return status_;

	87 }

	88

	89 if (status_ == STATUS_UNAVAILABLE && isSingleThreaded(proc_fd)) {

	90 // All state transitions resulting in STATUS_UNAVAILABLE are immediately

	91 // preceded by STATUS_AVAILABLE. Furthermore, these transitions all

	92 // happen, if and only if they are triggered by the process being multi-

	93 // threaded.

	94 // In other words, if a single-threaded process is currently in the

	95 // STATUS_UNAVAILABLE state, it is safe to assume that sandboxing is

	96 // actually available.

	97 status_ == STATUS_AVAILABLE;

	98 return status_;

	99 }

	100

	101 // If we have not previously checked for availability of the sandbox or if

	102 // we otherwise don't believe to have a good cached value, we have to

	103 // perform a thorough check now.

	104 if (status_ == STATUS_UNKNOWN) {

	105 status_ = kernelSupportSeccompBPF(proc_fd)

	106 ? STATUS_AVAILABLE : STATUS_UNSUPPORTED;

	107

	108 // As we are performing our tests from a child process, the run-time

	109 // environment that is visible to the sandbox is always guaranteed to be

	110 // single-threaded. Let's check here whether the caller is single-

	111 // threaded. Otherwise, we mark the sandbox as temporarily unavailable.

	112 if (status_ == STATUS_AVAILABLE && !isSingleThreaded(proc_fd)) {

	113 status_ = STATUS_UNAVAILABLE;

	114 }

	115 }

	116 return status_;

	117 }

	118

	119 void Sandbox::setProcFd(int proc_fd) {

	120 proc_fd_ = proc_fd;

	121 }

	122

	123 void Sandbox::startSandbox() {

	124 if (status_ == STATUS_UNSUPPORTED \|\| status_ == STATUS_UNAVAILABLE) {

	125 die("Trying to start sandbox, even though it is known to be unavailable");

	126 } else if (status_ == STATUS_ENABLED) {

	127 die("Cannot start sandbox recursively. Use multiple calls to "

	128 "setSandboxPolicy() to stack policies instead");

	129 }

	130 if (proc_fd_ < 0) {

	131 proc_fd_ = open("/proc", O_RDONLY\|O_DIRECTORY);

	132 }

	133 if (proc_fd_ < 0) {

	134 // For now, continue in degraded mode, if we can't access /proc.

	135 // In the future, we might want to tighten this requirement.

	136 }

	137 if (!isSingleThreaded(proc_fd_)) {

	138 die("Cannot start sandbox, if process is already multi-threaded");

	139 }

	140

	141 // We no longer need access to any files in /proc. We want to do this

	142 // before installing the filters, just in case that our policy denies

	143 // close().

	144 if (proc_fd_ >= 0) {

	145 if (HANDLE_EINTR(close(proc_fd_))) {

	146 die("Failed to close file descriptor for /proc");

	147 }

	148 proc_fd_ = -1;

	149 }

	150

	151 // Install the filters.

	152 installFilter();

	153

	154 // We are now inside the sandbox.

	155 status_ = STATUS_ENABLED;

	156 }

	157

	158 bool Sandbox::isSingleThreaded(int proc_fd) {

	159 if (proc_fd < 0) {

	160 // Cannot determine whether program is single-threaded. Hope for

	161 // the best...

	162 return true;

	163 }

	164

	165 struct stat sb;

	166 int task = -1;

	167 if ((task = openat(proc_fd, "self/task", O_RDONLY\|O_DIRECTORY)) < 0 \|\|

	168 fstat(task, &sb) != 0 \|\|

	169 sb.st_nlink != 3 \|\|

	170 HANDLE_EINTR(close(task))) {

	171 if (task >= 0) {

	172 HANDLE_EINTR(close(task));

	173 }

	174 return false;

	175 }

	176 return true;

	177 }

	178

	179 void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator,

	180 EvaluateArguments argumentEvaluator) {

	181 evaluators_.push_back(std::make_pair(syscallEvaluator, argumentEvaluator));

	182 }

	183

	184 void Sandbox::installFilter() {

	185 // Verify that the user pushed a policy.

	186 if (evaluators_.empty()) {

	187 filter_failed:

	188 die("Failed to configure system call filters");

	189 }

	190

	191 // Set new SIGSYS handler

	192 struct sigaction sa;

	193 memset(&sa, 0, sizeof(sa));

	194 sa.sa_sigaction = &sigSys;

	195 sa.sa_flags = SA_SIGINFO;

	196 if (sigaction(SIGSYS, &sa, NULL) < 0) {

	197 goto filter_failed;

	198 }

	199

	200 // Unmask SIGSYS

	201 sigset_t mask;

	202 sigemptyset(&mask);

	203 sigaddset(&mask, SIGSYS);
	Chris Evans 2012/06/04 22:21:55 Nit: we took the trouble to check the sigfillset() Nit: we took the trouble to check the sigfillset() return value above, so we should probably check these too.
	204 if (sigprocmask(SIG_UNBLOCK, &mask, NULL)) {

	205 goto filter_failed;

	206 }

	207

	208 // We can't handle stacked evaluators, yet. We'll get there eventually

	209 // though. Hang tight.

	210 if (evaluators_.size() != 1) {

	211 die("Not implemented");

	212 }

	213

	214 // If the architecture doesn't match SECCOMP_ARCH, disallow the

	215 // system call.

	216 std::vector<struct sock_filter> program;

	217 program.push_back((struct sock_filter)

	218 BPF_STMT(BPF_LD+BPF_W+BPF_ABS,

	219 offsetof(struct arch_seccomp_data, arch)));

	220 program.push_back((struct sock_filter)

	221 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 1, 0));

	222 program.push_back((struct sock_filter)

	223 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO + SECCOMP_DENY_ERRNO));

	224

	225 // Grab the system call number, so that we can implement jump tables.

	226 program.push_back((struct sock_filter)

	227 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, nr)));

	228

	229 // Evaluate all possible system calls and depending on their

	230 // exit codes generate a BPF filter.

	231 // This is very inefficient right now. We need to be much smarter

	232 // eventually.
	Jorge Lucangeli Obes 2012/06/04 22:16:14 We'll probably want to fix this (to at least avoid We'll probably want to fix this (to at least avoid walking all the syscalls every time) before switching from the previous implementation. It's great that it's already planned though =). I don't think it should block submitting though. Chris Evans 2012/06/04 22:21:55 Nit: note the actual run time in a comment. Julien Nit: note the actual run time in a comment. Julien and I both (initially) had a different impression so it'd be great to add a comment (and keep that comment in sync as we improve things). Looks like current run time for evaluating system call N is N? (Current run time in sandbox_init_linux.cc is more proportional to the number of syscalls _permitted_, call this P, and best we can do without BPF enhancements is probably log2(P) or so)
	233 EvaluateSyscall evaluateSyscall = evaluators_.begin()->first;

	234 for (int sysnum = MIN_SYSCALL; sysnum <= MAX_SYSCALL; ++sysnum) {

	235 ErrorCode err = evaluateSyscall(sysnum);

	236 int ret;

	237 switch (err) {

	238 case SB_INSPECT_ARG_1...SB_INSPECT_ARG_6:

	239 die("Not implemented");

	240 case SB_TRAP:

	241 ret = SECCOMP_RET_TRAP;

	242 break;

	243 case SB_ALLOWED:

	244 ret = SECCOMP_RET_ALLOW;

	245 break;

	246 default:

	247 if (err >= static_cast<ErrorCode>(1) &&

	248 err <= static_cast<ErrorCode>(4096)) {

	249 // We limit errno values to a reasonable range. In fact, the Linux ABI

	250 // doesn't support errno values outside of this range.

	251 ret = SECCOMP_RET_ERRNO + err;

	252 } else {

	253 die("Invalid ErrorCode reported by sandbox system call evaluator");

	254 }

	255 break;

	256 }

	257 program.push_back((struct sock_filter)

	258 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, sysnum, 0, 1));

	259 program.push_back((struct sock_filter)

	260 BPF_STMT(BPF_RET+BPF_K, ret));

	261 }

	262

	263 // Everything that isn't allowed is forbidden. Eventually, we would

	264 // like to have a way to log forbidden calls, when in debug mode.

	265 program.push_back((struct sock_filter)

	266 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO + SECCOMP_DENY_ERRNO));

	267

	268 // Install BPF filter program

	269 const struct sock_fprog prog = { program.size(), &program[0] };

	270 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) \|\|

	271 prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {

	272 goto filter_failed;

	273 }

	274

	275 return;

	276 }

	277

	278 void Sandbox::sigSys(int nr, siginfo_t info, void void_context) {

	279 if (nr != SIGSYS \|\| info->si_code != SYS_SECCOMP \|\| !void_context) {
	Chris Evans 2012/06/04 22:21:55 Nit: add a comment that die() might call LOG(FATAL Nit: add a comment that die() might call LOG(FATAL), which might non be async safe. One final idea: if we're worried about making sure we see a distinctive log here, why not just dereference 0x1 or some such <1 page constant? It would result in a very clearly readable crash report.
	280 die("Unexpected SIGSYS received");

	281 }

	282 ucontext_t ctx = reinterpret_cast<ucontext_t >(void_context);

	283 int old_errno = errno;

	284

	285 // In case of error, set the REG_RESULT CPU register to the default

	286 // errno value (i.e. EPERM).

	287 // We need to be very careful when doing this, as some of our target

	288 // platforms have pointer types and CPU registers that are wider than

	289 // ints. Furthermore, the kernel ABI requires us to return a negative

	290 // value, but errno values are usually positive. And in fact, it would

	291 // be perfectly reasonable for somebody to have defined them as unsigned

	292 // properties. This makes the correct incantation of type casts rather

	293 // subtle. Sometimes, C++ is just too smart for its own good.

	294 void rc = (void )(intptr_t)-(int)SECCOMP_DENY_ERRNO;

	295

	296 // This is where we can add extra code to handle complex system calls.

	297 // ...

	298

	299 ctx->uc_mcontext.gregs[REG_RESULT] = reinterpret_cast<greg_t>(rc);

	300 errno = old_errno;

	301 return;

	302 }

	303

	304

	305 bool Sandbox::suppressLogging_ = false;

	306 Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN;

	307 int Sandbox::proc_fd_ = -1;

	308 std::vector<std::pair<Sandbox::EvaluateSyscall,

	309 Sandbox::EvaluateArguments> > Sandbox::evaluators_;

	310

	311 } // namespace

OLD	NEW

« no previous file with comments | « sandbox/linux/seccomp_bpf/sandbox_bpf.h ('k') | sandbox/linux/seccomp_bpf/util.h » ('j') | no next file with comments »