sandbox/linux/seccomp_bpf/sandbox_bpf.cc - Issue 10458040: Initial snapshot of the new BPF-enabled seccomp sandbox. This code is

Side by Side Diff: sandbox/linux/seccomp_bpf/sandbox_bpf.cc

Issue 10458040: Initial snapshot of the new BPF-enabled seccomp sandbox. This code is (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/

Patch Set: Created 8 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

Property Changes:

Added: svn:eol-style
+ LF

OLD	NEW
(Empty)
	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "sandbox_bpf.h"

	6

	7

	8 namespace playground2 {

	9

	10 int Sandbox::supportsSeccompSandbox(int proc_fd) {
	jln (very slow on Chromium) 2012/05/30 20:31:21 It looks like content/common/sandbox_init_linux.cc It looks like content/common/sandbox_init_linux.cc is the right place for all sandbox initialization code to go. I think we need to get seccomp initialization out of the Zygote, which means that giving proc_fd will be difficult (we don't want to have a /proc fd that far down the stack). We can solve this later, but I really would prefer to have the signature be int supportsSeccompSandbox(void) for now. It's perhaps ok if isSingleThreaded only works in DEBUG mode without when the setuid sandbox is not here (which is what Chris did). Otherwise, I think you can do a proper check with the ChildProcess singleton: ChildProcess::current() == NULL. Need to finish understanding all the code to be sure. Of course it only works for Chrome code, if glibc or some other decides to create new threads on us that won't work. It's nice to detect it, but it's not worth the trouble for the first iteration of the patch.
	11 if (status_ == STATUS_UNKNOWN) {

	12 if (!isSingleThreaded(proc_fd)) {

	13 status_ = STATUS_UNSUPPORTED;

	14 } else {

	15 pid_t pid = fork();

	16 if (pid < 0) {

	17 die("Failed to check for sandbox support");

	18 }

	19 if (!pid) {

	20 static const struct sock_filter filter[] = {

	21 // If the architecture doesn't match SECCOMP_ARCH, disallow the

	22 // system call.

	23 BPF_STMT(BPF_LD+BPF_W+BPF_ABS,

	24 offsetof(struct arch_seccomp_data, arch)),

	25 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 0, 3),

	26

	27 // Check the system call number. The only allowed call are getpid()

	28 // and exit_group()

	29 BPF_STMT(BPF_LD+BPF_W+BPF_ABS,

	30 offsetof(struct arch_seccomp_data, nr)),

	31 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_getpid, 2, 1),

	32 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit_group, 0, 2),

	33 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),

	34 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO \| EPERM),

	35 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),

	36 };

	37

	38 // Try to install filter. If we succeed, return success.

	39 const struct sock_fprog prog = {

	40 ARRAYSIZE(filter),

	41 (struct sock_filter *)filter

	42 };

	43 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0 &&

	44 prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == 0 &&

	45 syscall(__NR_getpid) == -1 && errno == EPERM) {

	46 syscall(__NR_exit_group, (intptr_t)0);

	47 }

	48 _exit(1);

	49 }

	50 int status;

	51 TEMP_FAILURE_RETRY(waitpid(pid, &status, 0));

	52 status_ = WIFEXITED(status) && !WEXITSTATUS(status)

	53 ? STATUS_AVAILABLE : STATUS_UNSUPPORTED;

	54 }

	55 }

	56 return status_ == STATUS_AVAILABLE;

	57 }

	58

	59 void Sandbox::setProcFd(int proc_fd) {

	60 proc_fd_ = proc_fd;

	61 }

	62

	63 void Sandbox::startSandbox() {

	64 if (status_ == STATUS_UNSUPPORTED) {

	65 die("Trying to start sandbox, even though it is known to be unavailable");

	66 }

	67 if (proc_fd_ < 0) {

	68 proc_fd_ = open("/proc", O_RDONLY\|O_DIRECTORY);

	69 }

	70 if (proc_fd_ < 0) {

	71 die("Cannot access /proc");

	72 }

	73 if (!isSingleThreaded(proc_fd_)) {

	74 die("Cannot start sandbox, if process is already multi-threaded");

	75 }

	76 disableFilesystem();

	77 installFilter();

	78

	79 // We no longer need access to any files in /proc

	80 if (proc_fd_ >= 0) {

	81 if (TEMP_FAILURE_RETRY(close(proc_fd_))) {

	82 die("Failed to close file descriptor for /proc");

	83 }

	84 proc_fd_ = -1;

	85 }

	86 }

	87

	88 bool Sandbox::isSingleThreaded(int proc_fd) {

	89 struct stat sb;

	90 int task = -1;

	91 if (proc_fd < 0 \|\|

	92 (task = openat(proc_fd, "self/task", O_RDONLY\|O_DIRECTORY)) < 0 \|\|

	93 fstat(task, &sb) != 0 \|\|

	94 sb.st_nlink != 3 \|\|

	95 TEMP_FAILURE_RETRY(close(task))) {

	96 if (task >= 0) {

	97 TEMP_FAILURE_RETRY(close(task));

	98 }

	99 return false;

	100 }

	101 return true;

	102 }

	103

	104 bool Sandbox::disableFilesystem() {
	jln (very slow on Chromium) 2012/05/30 20:31:21 Looks good, but this should be kept independent of Looks good, but this should be kept independent of seccomp BPF IMO. Could be dropped for now to reduce the initial patch complexity ?
	105 // Some versions of PR_SET_NO_NEW_PRIVS allow unprivileged processes

	106 // to call chroot(). If this feature is available in the kernel, move

	107 // us into a non-existent directory.

	108 // This is slightly more difficult than it sounds. We don't want

	109 // to actually create a directory anywhere, as that is difficult

	110 // to do securely. Instead, we rely on the /proc filesystem to

	111 // give us a directory for our child process. We can then remove

	112 // this directory by terminating the process.

	113 // Also, pass the file descriptor from the child process to the

	114 // parent rather than opening the directory by "/proc/${PID}". The

	115 // latter doesn't necessarily work, if somebody already pushed us

	116 // into a new pid namespace. Access by "/proc/self" is more reliable.

	117 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {

	118 return false;

	119 }

	120 int fds[2];

	121 pid_t pid;

	122 if (socketpair(AF_UNIX, SOCK_STREAM \| SOCK_CLOEXEC, 0, fds) < 0 \|\|

	123 (pid = fork()) < 0) {

	124 chroot_failed:

	125 die("Failed to isolate file system accesses");

	126 }

	127 if (!pid) {

	128 TEMP_FAILURE_RETRY(close(fds[1]));

	129 prctl(PR_SET_DUMPABLE, 1);

	130 fds[1] = openat(proc_fd_, "self/fdinfo", O_RDONLY\|O_DIRECTORY);

	131 if (fds[1] >= 0) {

	132 Util::sendFds(fds[0], NULL, 0, fds[1], -1);

	133 }

	134 _exit(0);

	135 }

	136 TEMP_FAILURE_RETRY(close(fds[0]));

	137 if (!Util::getFds(fds[1], NULL, 0, &fds[0], NULL)) {

	138 goto chroot_failed;

	139 }

	140 bool rc = false;

	141 if (fchdir(fds[0]) == 0 && chroot(".") == 0) {

	142 rc = true;

	143 }

	144 TEMP_FAILURE_RETRY(close(fds[0]));

	145 TEMP_FAILURE_RETRY(close(fds[1]));

	146 TEMP_FAILURE_RETRY(waitpid(pid, NULL, 0));

	147 return rc;

	148 }

	149

	150 int Sandbox::jumpTableSize(int numSyscalls, bool recursing) {

	151 int ret = 0;

	152 if (numSyscalls <= 0) {

	153 // Nothing to do

	154 } else if (numSyscalls > 160) {

	155 for (int i = 0; i < numSyscalls; i += 160) {

	156 ret += jumpTableSize(std::min(160, numSyscalls-i));

	157 }

	158 } else {

	159 if (numSyscalls <= 3) {

	160 ret += numSyscalls;

	161 } else {

	162 int m = numSyscalls/2;

	163 ret += 1 + jumpTableSize(m, true) + jumpTableSize(numSyscalls-m, true);

	164 }

	165 if (!recursing) {

	166 ++ret;

	167 }

	168 }

	169 return ret;

	170 }

	171

	// Self-check for a generated jump table: interprets the numInsn instructions
	// starting at `filter` for every candidate value from one below the first
	// list entry to one above the last, and calls die() if the table disagrees
	// with syscallList.
	//
	// filter:      first instruction of the table to check.
	// numInsn:     number of instructions in the table.
	// syscallList: system call numbers the table should match; the forward-only
	//              scan with `j` below requires ascending order.
	// numSyscalls: number of entries in syscallList.
	//
	// A value in the list must reach a BPF_RET; a value not in the list must run
	// off the end of the table without reaching one. Only JEQ, JGE and RET
	// opcodes are accepted.
	172 void Sandbox::verifyJumpTable(struct sock_filter *filter, int numInsn,

	173 const int *syscallList, int numSyscalls) {

	// An empty list must correspond to an empty table.
	174 if (numSyscalls <= 0) {

	175 if (numInsn != 0) {

	// Shared failure exit: the checks further down jump back to this label.
	176 failed:

	177 die("Failed to assemble jump table");

	178 }

	179 return;

	180 }

	// j trails i through the list, so each candidate costs amortized O(1).
	181 int j = 0;

	182 for (int i = syscallList[0]-1; i <= syscallList[numSyscalls-1]+1; ++i) {

	183 for (; j < numSyscalls && syscallList[j] < i; ++j) { }

	184 bool present = j < numSyscalls && syscallList[j] == i;

	// Interpret the table for candidate i. Jump offsets are added here and the
	// loop's own ++ip then steps to the instruction after the target offset.
	185 for (int ip = 0; ip < numInsn; ++ip) {

	186 if (filter[ip].code == BPF_JMP+BPF_JEQ+BPF_K) {

	187 ip += i == (int)filter[ip].k ? filter[ip].jt : filter[ip].jf;

	188 } else if (filter[ip].code == BPF_JMP+BPF_JGE+BPF_K) {

	189 ip += i >= (int)filter[ip].k ? filter[ip].jt : filter[ip].jf;

	190 } else if (filter[ip].code == BPF_RET+BPF_K) {

	// Reaching a return is correct only for values that are in the list.
	191 if (!present) {

	192 goto failed;

	193 } else {

	194 goto ok;

	195 }

	196 } else {

	// Any other opcode does not belong in a jump table.
	197 goto failed;

	198 }

	// Checked before the loop increment: ip == numInsn-1 is still accepted and
	// then leaves the loop normally.
	199 if (ip >= numInsn) {

	200 goto failed;

	201 }

	202 }

	// Ran off the end without a return: wrong if the value was in the list.
	203 if (present) {

	204 goto failed;

	205 }

	206 ok:;

	207 }

	208 return;

	209 }

	210

	// qsort() comparator that orders two ints ascending.
	//
	// a, b: pointers to the int elements being compared.
	// Returns a negative value, zero, or a positive value when *a is less than,
	// equal to, or greater than *b. The result comes from two comparisons, not
	// a subtraction, so operands that are far apart cannot overflow.
	static int cmp(const void *a, const void *b) {
	  const int lhs = *static_cast<const int *>(a);
	  const int rhs = *static_cast<const int *>(b);
	  return (lhs > rhs) - (lhs < rhs);
	}

	214

	// Emits BPF instructions into `filter`, starting at *idx, that reach a
	// BPF_RET with value `ret` for every system call number in syscallList, and
	// advances *idx past what was written.
	//
	// filter:      instruction buffer being filled.
	// idx:         in/out position of the next free slot in `filter`.
	// syscallList: system call numbers to match.
	// numSyscalls: number of entries in syscallList.
	// ret:         value placed in the BPF_RET instruction(s).
	// sorted:      true if syscallList is already ascending; otherwise a sorted
	//              copy is made first.
	// recursing:   true for inner calls, which leave jump fix-up and the return
	//              instruction to the outermost call.
	// Returns the number of instructions written by this call.
	//
	// NOTE(review): `filter` and `idx` are dereferenced and indexed below, so
	// they must be pointers; the `*` declarators look to have been lost in this
	// rendering of the signature -- confirm against the real file.
	215 int Sandbox::jumpTable(struct sock_filter filter, int idx,

	216 const int *syscallList, int numSyscalls,

	217 int ret, bool sorted, bool recursing) {

	218 const int origIdx = *idx;

	219

	220 // If the list of system calls is not yet sorted, we have to do that now.

	221 const int *list;

	// Variable-length array (a compiler extension in C++); sized zero when the
	// caller's list can be used directly.
	222 int l[sorted ? 0 : numSyscalls];

	223 if (sorted) {

	224 list = syscallList;

	225 } else {

	226 memcpy(l, syscallList, sizeof(int)*numSyscalls);

	227 qsort(l, numSyscalls, sizeof(int), cmp);

	228 list = l;

	229 }

	230

	231 // If the list of system calls is too big, we have to split it. That allows

	232 // us to avoid jumps that are longer than 256 instructions.

	233 if (numSyscalls <= 0) {

	234 // Nothing to do

	235 } else if (numSyscalls > 160) {

	// Each chunk is a complete table with its own return instruction (the
	// `recursing` argument is omitted here), followed by one check of the whole.
	236 for (int i = 0; i < numSyscalls; i += 160) {

	237 jumpTable(filter, idx, list+i, std::min(160, numSyscalls-i), ret, true);

	238 }

	239 verifyJumpTable(filter + origIdx, *idx - origIdx, list, numSyscalls);

	240 } else {

	// Up to three entries: a plain chain of equality tests.
	241 if (numSyscalls <= 3) {

	242 for (int i = 0; i < numSyscalls; ++i) {

	243 // If outputting more than one comparison, only mark the very last one

	244 // with BPF_JMP. When fixing up jump targets, we use this information

	245 // to generate the correct if..else.. sequence of jumps. And at that

	246 // point, we add the missing BPF_JMP into the filter.

	247 filter[(*idx)++] = (struct sock_filter)

	248 BPF_JUMP((i == numSyscalls-1 ? BPF_JMP : 0)+BPF_JEQ+BPF_K,

	249 list[i], 0, 0);

	250 }

	251 } else {

	// More than three entries: split at the median. The JGE falls through into
	// the lower half and jumps over it (jt) to reach the upper half.
	252 int m = numSyscalls/2;

	253 int x = (*idx)++;

	254 filter[x] = (struct sock_filter)

	255 BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, list[m], 0, 0);

	256 jumpTable(filter, idx, list, m, ret, true, true);

	// jt/jf hold at most 255, so a longer lower half cannot be jumped over.
	257 if (*idx - x - 1 > 255) {

	258 die("Failed to assemble jump table");

	259 }

	260 filter[x].jt = *idx - x - 1;

	261 jumpTable(filter, idx, list+m, numSyscalls-m, ret, true, true);

	262 }

	263

	264 // If we are done recursing, fix up jump targets and insert the

	265 // return statement.

	266 if (!recursing) {

	// *idx is where the return instruction is about to go. Every JEQ jumps to it
	// on a match; the last JEQ of each chain (already marked BPF_JMP) also jumps
	// past it on a mismatch, while the others fall through to the next test.
	267 for (int i = origIdx; i < *idx; ++i) {

	268 if (BPF_OP(filter[i].code) == BPF_JEQ) {

	269 if (*idx - i > 255) {

	270 die("Failed to assemble jump table");

	271 }

	272 filter[i].jt = *idx - i - 1;

	273 if (BPF_CLASS(filter[i].code) == BPF_JMP) {

	274 filter[i].jf = *idx - i;

	275 } else {

	276 filter[i].code += BPF_JMP;

	277 }

	278 }

	279 }

	280 filter[(*idx)++] = (struct sock_filter)BPF_STMT(BPF_RET+BPF_K, ret);

	281 verifyJumpTable(filter + origIdx, *idx - origIdx, list, numSyscalls);

	282 }

	283 }

	284

	285 return *idx - origIdx;

	286 }

	287

	288 void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator,

	289 EvaluateArguments argumentEvaluator) {

	290 evaluators_.push_back(std::make_pair<EvaluateSyscall, EvaluateArguments>(

	291 syscallEvaluator, argumentEvaluator));

	292 }

	293

	// Installs the SIGSYS handler and then, for every registered policy, walks
	// the system call number space, records runs of identical verdicts through
	// addRange(), and asks the kernel to install a seccomp filter program.
	// Every failure is reported through die().
	//
	// NOTE(review): this is a work-in-progress snapshot. The `/***/` markers and
	// the sock_fprog initializer below look like placeholders for code that is
	// missing or was lost when the page was rendered -- confirm against the real
	// file before relying on any of it.
	294 void Sandbox::installFilter() {

	295 // Set new SIGSYS handler

	296 struct sigaction sa;

	297 memset(&sa, 0, sizeof(sa));

	298 sa.sa_sigaction = &sigSys;

	299 sa.sa_flags = SA_SIGINFO;

	300 if (sigaction(SIGSYS, &sa, NULL) < 0) {

	// Shared failure exit: later checks jump back to this label.
	301 filter_failed:

	302 die("Failed to configure system call filters");

	303 }

	304

	305 // Unmask SIGSYS

	306 sigset_t mask;

	307 sigemptyset(&mask);

	308 sigaddset(&mask, SIGSYS);

	309 if (sigprocmask(SIG_UNBLOCK, &mask, NULL)) {

	310 goto filter_failed;

	311 }

	312

	// The preamble below is disabled in this snapshot.
	313 // Static preamble at the beginning of the filter program

	314 // static const struct sock_filter filterPreamble[] = {

	315 // // If the architecture doesn't match SECCOMP_ARCH, disallow the

	316 // // system call.

	317 // BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, arch)),

	318 // BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 1, 0),

	319 // BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_DENY),

	320 //

	321 // // Grab the system call number, so that we can implement jump tables.

	322 // BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, nr)),

	323 // };

	324

	// One pass per registered (syscall evaluator, argument evaluator) pair.
	325 for (std::vector<std::pair<EvaluateSyscall, EvaluateArguments> >::

	326 const_iterator iter = evaluators_.begin();

	327 iter != evaluators_.end();

	328 ++iter) {

	329 EvaluateSyscall evaluateSyscall = iter->first;

	// evaluateArgs is fetched but not used anywhere in the code shown here.
	330 EvaluateArguments evaluateArgs = iter->second;

	// The policy must give the same verdict for INT32_MIN and -1, and that
	// verdict must not be one of the SB_INSPECT_ARG_* codes.
	331 int oldSysnum = INT32_MIN;

	332 ErrorCode oldErr = evaluateSyscall(oldSysnum);

	333 if (oldErr != evaluateSyscall(-1) \|\|

	334 (oldErr >= SB_INSPECT_ARG_1 && oldErr <= SB_INSPECT_ARG_6)) {

	// Shared failure exit for policy errors; the check after the scan jumps here.
	335 policyErr:

	336 die("Invalid sandbox policy");

	337 }

	// Scan 0..MAX_SYSCALL and emit a range each time the verdict changes.
	338 for (int sysnum = 0; sysnum <= MAX_SYSCALL; ++sysnum) {

	339 ErrorCode err = evaluateSyscall(sysnum);

	340 if (err != oldErr) {

	341 addRange(oldSysnum, sysnum-1, oldErr);

	342 oldSysnum = sysnum;

	343 oldErr = err;

	344 }

	345 }

	// The final run must extend to INT32_MAX with the same verdict, and again
	// must not be an SB_INSPECT_ARG_* code.
	346 if (oldErr != evaluateSyscall(INT32_MAX) \|\|

	347 (oldErr >= SB_INSPECT_ARG_1 && oldErr <= SB_INSPECT_ARG_6)) {

	348 goto policyErr;

	349 }

	350 addRange(oldSysnum, INT32_MAX, oldErr);

	351

	352 /***/

	353

	354 // Install BPF filter program

	// As written, everything between the first "/*" and the last "*/" on the
	// next line is a comment, so only the first member is given a value (0).
	355 const struct sock_fprog prog = { 0 /*/, 0 /*/ };

	356 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) \|\|

	357 prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {

	358 goto filter_failed;

	359 }

	360 }

	361

	362 return;

	363 }

	364

	365 void Sandbox::sigSys(int nr, siginfo_t info, void void_context) {
	jln (very slow on Chromium) 2012/05/30 20:31:21 For the purpose of merging with Chris code, would For the purpose of merging with Chris code, would you mind doing the same thing as he does for now (access memory with an address that leaks the syscall number)? We can review it later, and I agree that we need a better way to log eventually (I think we should just have a helper process take care of that and write to it via a pipe in the handler).
	366 if (info->si_code != SYS_SECCOMP \|\| !void_context) {

	367 die("Unexpected SIGSYS received");

	368 }

	369 ucontext_t ctx = (ucontext_t )void_context;

	370 int old_errno = errno;

	371 void *rc =

	372 (void *)(intptr_t)-(int)(SECCOMP_RET_DENY & SECCOMP_RET_DATA);

	373

	374 if (rc == (void *)(intptr_t)-(int)(SECCOMP_RET_DENY & SECCOMP_RET_DATA)) {

	375 // sprintf() is not technically async-signal safe. But in glibc it

	376 // tends to be much safer than calling fprintf() or any other higher-

	377 // level I/O function.

	378 /***/

	379 char buf[80];

	380 sprintf(buf, "Seccomp policy denies system call %ld\n",

	381 (long int)ctx->uc_mcontext.gregs[REG_SYSCALL]);

	382 if (TEMP_FAILURE_RETRY(write(2, buf, strlen(buf)))) {}

	383 }

	384

	385 ctx->uc_mcontext.gregs[REG_RESULT] = (greg_t)rc;

	386 errno = old_errno;

	387 return;

	388 }

	389

	390

	// Definitions of Sandbox's static data members.
	//
	// Cached result of the support probe; STATUS_UNKNOWN until
	// supportsSeccompSandbox() has run.
	391 Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN;

	// Descriptor for /proc; -1 until setProcFd() or startSandbox() provides one.
	392 int Sandbox::proc_fd_ = -1;

	// Policies registered through setSandboxPolicy(), in registration order.
	393 std::vector<std::pair<Sandbox::EvaluateSyscall,

	394 Sandbox::EvaluateArguments> > Sandbox::evaluators_;

	395

	396 } // namespace

OLD	NEW

« no previous file with comments | « sandbox/linux/seccomp_bpf/sandbox_bpf.h ('k') | sandbox/linux/seccomp_bpf/util.h » ('j') | no next file with comments »