|
OLD | NEW |
---|---|
(Empty) | |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "sandbox_bpf.h" | |
6 | |
7 | |
8 namespace playground2 { | |
9 | |
10 int Sandbox::supportsSeccompSandbox(int proc_fd) { | |
jln (very slow on Chromium)
2012/05/30 20:31:21
It looks like content/common/sandbox_init_linux.cc
| |
11 if (status_ == STATUS_UNKNOWN) { | |
12 if (!isSingleThreaded(proc_fd)) { | |
13 status_ = STATUS_UNSUPPORTED; | |
14 } else { | |
15 pid_t pid = fork(); | |
16 if (pid < 0) { | |
17 die("Failed to check for sandbox support"); | |
18 } | |
19 if (!pid) { | |
20 static const struct sock_filter filter[] = { | |
21 // If the architecture doesn't match SECCOMP_ARCH, disallow the | |
22 // system call. | |
23 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, | |
24 offsetof(struct arch_seccomp_data, arch)), | |
25 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 0, 3), | |
26 | |
27 // Check the system call number. The only allowed call are getpid() | |
28 // and exit_group() | |
29 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, | |
30 offsetof(struct arch_seccomp_data, nr)), | |
31 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_getpid, 2, 1), | |
32 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_exit_group, 0, 2), | |
33 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW), | |
34 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO | EPERM), | |
35 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL), | |
36 }; | |
37 | |
38 // Try to install filter. If we succeed, return success. | |
39 const struct sock_fprog prog = { | |
40 ARRAYSIZE(filter), | |
41 (struct sock_filter *)filter | |
42 }; | |
43 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == 0 && | |
44 prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) == 0 && | |
45 syscall(__NR_getpid) == -1 && errno == EPERM) { | |
46 syscall(__NR_exit_group, (intptr_t)0); | |
47 } | |
48 _exit(1); | |
49 } | |
50 int status; | |
51 TEMP_FAILURE_RETRY(waitpid(pid, &status, 0)); | |
52 status_ = WIFEXITED(status) && !WEXITSTATUS(status) | |
53 ? STATUS_AVAILABLE : STATUS_UNSUPPORTED; | |
54 } | |
55 } | |
56 return status_ == STATUS_AVAILABLE; | |
57 } | |
58 | |
59 void Sandbox::setProcFd(int proc_fd) { | |
60 proc_fd_ = proc_fd; | |
61 } | |
62 | |
63 void Sandbox::startSandbox() { | |
64 if (status_ == STATUS_UNSUPPORTED) { | |
65 die("Trying to start sandbox, even though it is known to be unavailable"); | |
66 } | |
67 if (proc_fd_ < 0) { | |
68 proc_fd_ = open("/proc", O_RDONLY|O_DIRECTORY); | |
69 } | |
70 if (proc_fd_ < 0) { | |
71 die("Cannot access /proc"); | |
72 } | |
73 if (!isSingleThreaded(proc_fd_)) { | |
74 die("Cannot start sandbox, if process is already multi-threaded"); | |
75 } | |
76 disableFilesystem(); | |
77 installFilter(); | |
78 | |
79 // We no longer need access to any files in /proc | |
80 if (proc_fd_ >= 0) { | |
81 if (TEMP_FAILURE_RETRY(close(proc_fd_))) { | |
82 die("Failed to close file descriptor for /proc"); | |
83 } | |
84 proc_fd_ = -1; | |
85 } | |
86 } | |
87 | |
88 bool Sandbox::isSingleThreaded(int proc_fd) { | |
89 struct stat sb; | |
90 int task = -1; | |
91 if (proc_fd < 0 || | |
92 (task = openat(proc_fd, "self/task", O_RDONLY|O_DIRECTORY)) < 0 || | |
93 fstat(task, &sb) != 0 || | |
94 sb.st_nlink != 3 || | |
95 TEMP_FAILURE_RETRY(close(task))) { | |
96 if (task >= 0) { | |
97 TEMP_FAILURE_RETRY(close(task)); | |
98 } | |
99 return false; | |
100 } | |
101 return true; | |
102 } | |
103 | |
104 bool Sandbox::disableFilesystem() { | |
jln (very slow on Chromium)
2012/05/30 20:31:21
Looks good, but this should be kept independent of
| |
105 // Some versions of PR_SET_NO_NEW_PRIVS allow unprivileged processes | |
106 // to call chroot(). If this feature is available in the kernel, move | |
107 // us into a non-existent directory. | |
108 // This is slightly more difficult than it sounds. We don't want | |
109 // to actually create a directory anywhere, as that is difficult | |
110 // to do securely. Instead, we rely on the /proc filesystem to | |
111 // give us a directory for our child process. We can then remove | |
112 // this directory by terminating the process. | |
113 // Also, pass the file descriptor from the child process to the | |
114 // parent rather than opening the directory by "/proc/${PID}". The | |
115 // latter doesn't necessarily work, if somebody already pushed us | |
116 // into a new pid namespace. Access by "/proc/self" is more reliable. | |
117 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) { | |
118 return false; | |
119 } | |
120 int fds[2]; | |
121 pid_t pid; | |
122 if (socketpair(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0, fds) < 0 || | |
123 (pid = fork()) < 0) { | |
124 chroot_failed: | |
125 die("Failed to isolate file system accesses"); | |
126 } | |
127 if (!pid) { | |
128 TEMP_FAILURE_RETRY(close(fds[1])); | |
129 prctl(PR_SET_DUMPABLE, 1); | |
130 fds[1] = openat(proc_fd_, "self/fdinfo", O_RDONLY|O_DIRECTORY); | |
131 if (fds[1] >= 0) { | |
132 Util::sendFds(fds[0], NULL, 0, fds[1], -1); | |
133 } | |
134 _exit(0); | |
135 } | |
136 TEMP_FAILURE_RETRY(close(fds[0])); | |
137 if (!Util::getFds(fds[1], NULL, 0, &fds[0], NULL)) { | |
138 goto chroot_failed; | |
139 } | |
140 bool rc = false; | |
141 if (fchdir(fds[0]) == 0 && chroot(".") == 0) { | |
142 rc = true; | |
143 } | |
144 TEMP_FAILURE_RETRY(close(fds[0])); | |
145 TEMP_FAILURE_RETRY(close(fds[1])); | |
146 TEMP_FAILURE_RETRY(waitpid(pid, NULL, 0)); | |
147 return rc; | |
148 } | |
149 | |
150 int Sandbox::jumpTableSize(int numSyscalls, bool recursing) { | |
151 int ret = 0; | |
152 if (numSyscalls <= 0) { | |
153 // Nothing to do | |
154 } else if (numSyscalls > 160) { | |
155 for (int i = 0; i < numSyscalls; i += 160) { | |
156 ret += jumpTableSize(std::min(160, numSyscalls-i)); | |
157 } | |
158 } else { | |
159 if (numSyscalls <= 3) { | |
160 ret += numSyscalls; | |
161 } else { | |
162 int m = numSyscalls/2; | |
163 ret += 1 + jumpTableSize(m, true) + jumpTableSize(numSyscalls-m, true); | |
164 } | |
165 if (!recursing) { | |
166 ++ret; | |
167 } | |
168 } | |
169 return ret; | |
170 } | |
171 | |
172 void Sandbox::verifyJumpTable(struct sock_filter *filter, int numInsn, | |
173 const int *syscallList, int numSyscalls) { | |
174 if (numSyscalls <= 0) { | |
175 if (numInsn != 0) { | |
176 failed: | |
177 die("Failed to assemble jump table"); | |
178 } | |
179 return; | |
180 } | |
181 int j = 0; | |
182 for (int i = syscallList[0]-1; i <= syscallList[numSyscalls-1]+1; ++i) { | |
183 for (; j < numSyscalls && syscallList[j] < i; ++j) { } | |
184 bool present = j < numSyscalls && syscallList[j] == i; | |
185 for (int ip = 0; ip < numInsn; ++ip) { | |
186 if (filter[ip].code == BPF_JMP+BPF_JEQ+BPF_K) { | |
187 ip += i == (int)filter[ip].k ? filter[ip].jt : filter[ip].jf; | |
188 } else if (filter[ip].code == BPF_JMP+BPF_JGE+BPF_K) { | |
189 ip += i >= (int)filter[ip].k ? filter[ip].jt : filter[ip].jf; | |
190 } else if (filter[ip].code == BPF_RET+BPF_K) { | |
191 if (!present) { | |
192 goto failed; | |
193 } else { | |
194 goto ok; | |
195 } | |
196 } else { | |
197 goto failed; | |
198 } | |
199 if (ip >= numInsn) { | |
200 goto failed; | |
201 } | |
202 } | |
203 if (present) { | |
204 goto failed; | |
205 } | |
206 ok:; | |
207 } | |
208 return; | |
209 } | |
210 | |
// qsort() comparator that orders two ints ascending.
//
// a, b: pointers to the ints being compared.
// Returns a negative value, zero or a positive value if *a is less than,
// equal to or greater than *b. The result comes from comparisons rather than
// from subtraction, because "*a - *b" overflows (and can report the wrong
// sign) when the operands are far apart, e.g. INT_MIN and 1.
static int cmp(const void *a, const void *b) {
  const int lhs = *static_cast<const int *>(a);
  const int rhs = *static_cast<const int *>(b);
  return (lhs > rhs) - (lhs < rhs);
}
214 | |
215 int Sandbox::jumpTable(struct sock_filter *filter, int *idx, | |
216 const int *syscallList, int numSyscalls, | |
217 int ret, bool sorted, bool recursing) { | |
218 const int origIdx = *idx; | |
219 | |
220 // If the list of system calls is not yet sorted, we have to do that now. | |
221 const int *list; | |
222 int l[sorted ? 0 : numSyscalls]; | |
223 if (sorted) { | |
224 list = syscallList; | |
225 } else { | |
226 memcpy(l, syscallList, sizeof(int)*numSyscalls); | |
227 qsort(l, numSyscalls, sizeof(int), cmp); | |
228 list = l; | |
229 } | |
230 | |
231 // If the list of system calls is too big, we have to split it. That allows | |
232 // us to avoid jumps that are longer than 256 instructions. | |
233 if (numSyscalls <= 0) { | |
234 // Nothing to do | |
235 } else if (numSyscalls > 160) { | |
236 for (int i = 0; i < numSyscalls; i += 160) { | |
237 jumpTable(filter, idx, list+i, std::min(160, numSyscalls-i), ret, true); | |
238 } | |
239 verifyJumpTable(filter + origIdx, *idx - origIdx, list, numSyscalls); | |
240 } else { | |
241 if (numSyscalls <= 3) { | |
242 for (int i = 0; i < numSyscalls; ++i) { | |
243 // If outputting more than one comparison, only mark the very last one | |
244 // with BPF_JMP. When fixing up jump targets, we use this information | |
245 // to generate the correct if..else.. sequence of jumps. And at that | |
246 // point, we add the missing BPF_JMP into the filter. | |
247 filter[(*idx)++] = (struct sock_filter) | |
248 BPF_JUMP((i == numSyscalls-1 ? BPF_JMP : 0)+BPF_JEQ+BPF_K, | |
249 list[i], 0, 0); | |
250 } | |
251 } else { | |
252 int m = numSyscalls/2; | |
253 int x = (*idx)++; | |
254 filter[x] = (struct sock_filter) | |
255 BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, list[m], 0, 0); | |
256 jumpTable(filter, idx, list, m, ret, true, true); | |
257 if (*idx - x - 1 > 255) { | |
258 die("Failed to assemble jump table"); | |
259 } | |
260 filter[x].jt = *idx - x - 1; | |
261 jumpTable(filter, idx, list+m, numSyscalls-m, ret, true, true); | |
262 } | |
263 | |
264 // If we are done recursing, fix up jump targets and insert the | |
265 // return statement. | |
266 if (!recursing) { | |
267 for (int i = origIdx; i < *idx; ++i) { | |
268 if (BPF_OP(filter[i].code) == BPF_JEQ) { | |
269 if (*idx - i > 255) { | |
270 die("Failed to assemble jump table"); | |
271 } | |
272 filter[i].jt = *idx - i - 1; | |
273 if (BPF_CLASS(filter[i].code) == BPF_JMP) { | |
274 filter[i].jf = *idx - i; | |
275 } else { | |
276 filter[i].code += BPF_JMP; | |
277 } | |
278 } | |
279 } | |
280 filter[(*idx)++] = (struct sock_filter)BPF_STMT(BPF_RET+BPF_K, ret); | |
281 verifyJumpTable(filter + origIdx, *idx - origIdx, list, numSyscalls); | |
282 } | |
283 } | |
284 | |
285 return *idx - origIdx; | |
286 } | |
287 | |
288 void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator, | |
289 EvaluateArguments argumentEvaluator) { | |
290 evaluators_.push_back(std::make_pair<EvaluateSyscall, EvaluateArguments>( | |
291 syscallEvaluator, argumentEvaluator)); | |
292 } | |
293 | |
294 void Sandbox::installFilter() { | |
295 // Set new SIGSYS handler | |
296 struct sigaction sa; | |
297 memset(&sa, 0, sizeof(sa)); | |
298 sa.sa_sigaction = &sigSys; | |
299 sa.sa_flags = SA_SIGINFO; | |
300 if (sigaction(SIGSYS, &sa, NULL) < 0) { | |
301 filter_failed: | |
302 die("Failed to configure system call filters"); | |
303 } | |
304 | |
305 // Unmask SIGSYS | |
306 sigset_t mask; | |
307 sigemptyset(&mask); | |
308 sigaddset(&mask, SIGSYS); | |
309 if (sigprocmask(SIG_UNBLOCK, &mask, NULL)) { | |
310 goto filter_failed; | |
311 } | |
312 | |
313 // Static preamble at the beginning of the filter program | |
314 // static const struct sock_filter filterPreamble[] = { | |
315 // // If the architecture doesn't match SECCOMP_ARCH, disallow the | |
316 // // system call. | |
317 // BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, arch)), | |
318 // BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 1, 0), | |
319 // BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_DENY), | |
320 // | |
321 // // Grab the system call number, so that we can implement jump tables. | |
322 // BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, nr)), | |
323 // }; | |
324 | |
325 for (std::vector<std::pair<EvaluateSyscall, EvaluateArguments> >:: | |
326 const_iterator iter = evaluators_.begin(); | |
327 iter != evaluators_.end(); | |
328 ++iter) { | |
329 EvaluateSyscall evaluateSyscall = iter->first; | |
330 EvaluateArguments evaluateArgs = iter->second; | |
331 int oldSysnum = INT32_MIN; | |
332 ErrorCode oldErr = evaluateSyscall(oldSysnum); | |
333 if (oldErr != evaluateSyscall(-1) || | |
334 (oldErr >= SB_INSPECT_ARG_1 && oldErr <= SB_INSPECT_ARG_6)) { | |
335 policyErr: | |
336 die("Invalid sandbox policy"); | |
337 } | |
338 for (int sysnum = 0; sysnum <= MAX_SYSCALL; ++sysnum) { | |
339 ErrorCode err = evaluateSyscall(sysnum); | |
340 if (err != oldErr) { | |
341 addRange(oldSysnum, sysnum-1, oldErr); | |
342 oldSysnum = sysnum; | |
343 oldErr = err; | |
344 } | |
345 } | |
346 if (oldErr != evaluateSyscall(INT32_MAX) || | |
347 (oldErr >= SB_INSPECT_ARG_1 && oldErr <= SB_INSPECT_ARG_6)) { | |
348 goto policyErr; | |
349 } | |
350 addRange(oldSysnum, INT32_MAX, oldErr); | |
351 | |
352 /***/ | |
353 | |
354 // Install BPF filter program | |
355 const struct sock_fprog prog = { 0 /***/, 0 /***/ }; | |
356 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || | |
357 prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { | |
358 goto filter_failed; | |
359 } | |
360 } | |
361 | |
362 return; | |
363 } | |
364 | |
365 void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) { | |
jln (very slow on Chromium)
2012/05/30 20:31:21
For the purpose of merging with Chris code, would
| |
366 if (info->si_code != SYS_SECCOMP || !void_context) { | |
367 die("Unexpected SIGSYS received"); | |
368 } | |
369 ucontext_t *ctx = (ucontext_t *)void_context; | |
370 int old_errno = errno; | |
371 void *rc = | |
372 (void *)(intptr_t)-(int)(SECCOMP_RET_DENY & SECCOMP_RET_DATA); | |
373 | |
374 if (rc == (void *)(intptr_t)-(int)(SECCOMP_RET_DENY & SECCOMP_RET_DATA)) { | |
375 // sprintf() is not technically async-signal safe. But in glibc it | |
376 // tends to be much safer than calling fprintf() or any other higher- | |
377 // level I/O function. | |
378 /***/ | |
379 char buf[80]; | |
380 sprintf(buf, "Seccomp policy denies system call %ld\n", | |
381 (long int)ctx->uc_mcontext.gregs[REG_SYSCALL]); | |
382 if (TEMP_FAILURE_RETRY(write(2, buf, strlen(buf)))) {} | |
383 } | |
384 | |
385 ctx->uc_mcontext.gregs[REG_RESULT] = (greg_t)rc; | |
386 errno = old_errno; | |
387 return; | |
388 } | |
389 | |
390 | |
391 Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN; | |
392 int Sandbox::proc_fd_ = -1; | |
393 std::vector<std::pair<Sandbox::EvaluateSyscall, | |
394 Sandbox::EvaluateArguments> > Sandbox::evaluators_; | |
395 | |
396 } // namespace | |
OLD | NEW |