OLD | NEW |
(Empty) | |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "sandbox/linux/seccomp_bpf/sandbox_bpf.h" |
| 6 |
| 7 // The kernel gives us a sandbox, we turn it into a playground :-) |
| 8 // This is version 2 of the playground; version 1 was built on top of |
| 9 // pre-BPF seccomp mode. |
| 10 namespace playground2 { |
| 11 |
| 12 Sandbox::ErrorCode Sandbox::probeEvaluator(int signo) { |
| 13 switch (signo) { |
| 14 case __NR_getpid: |
| 15 // Return EPERM so that we can check that the filter actually ran. |
| 16 return (ErrorCode)EPERM; |
| 17 case __NR_exit_group: |
| 18 // Allow exit() with a non-default return code. |
| 19 return SB_ALLOWED; |
| 20 default: |
| 21 // Make everything else fail in an easily recognizable way. |
| 22 return (ErrorCode)EINVAL; |
| 23 } |
| 24 } |
| 25 |
| 26 bool Sandbox::kernelSupportSeccompBPF(int proc_fd) { |
| 27 // Block all signals before forking a child process. This prevents an |
| 28 // attacker from manipulating our test by sending us an unexpected signal. |
| 29 sigset_t oldMask, newMask; |
| 30 if (sigfillset(&newMask) || |
| 31 sigprocmask(SIG_BLOCK, &newMask, &oldMask)) { |
| 32 die("sigprocmask() failed"); |
| 33 } |
| 34 |
| 35 pid_t pid = fork(); |
| 36 if (pid < 0) { |
| 37 // Die if we cannot fork(). We would probably fail a little later |
| 38 // anyway, as the machine is likely very close to running out of |
| 39 // memory. |
| 40 // But what we don't want to do is return "false", as a crafty |
| 41 // attacker might cause fork() to fail at will and could trick us |
| 42 // into running without a sandbox. |
| 43 sigprocmask(SIG_SETMASK, &oldMask, NULL); // OK, if it fails |
| 44 die("fork() failed unexpectedly"); |
| 45 } |
| 46 |
| 47 // In the child process |
| 48 if (!pid) { |
| 49 // Test a very simple sandbox policy to verify that we can |
| 50 // successfully turn on sandboxing. |
| 51 suppressLogging_ = true; |
| 52 evaluators_.clear(); |
| 53 setSandboxPolicy(probeEvaluator, NULL); |
| 54 setProcFd(proc_fd); |
| 55 startSandbox(); |
| 56 if (syscall(__NR_getpid) < 0 && errno == EPERM) { |
| 57 syscall(__NR_exit_group, (intptr_t)100); |
| 58 } |
| 59 die(NULL); |
| 60 } |
| 61 |
| 62 // In the parent process |
| 63 if (sigprocmask(SIG_SETMASK, &oldMask, NULL)) { |
| 64 die("sigprocmask() failed"); |
| 65 } |
| 66 int status; |
| 67 if (HANDLE_EINTR(waitpid(pid, &status, 0)) != pid) { |
| 68 die("waitpid() failed unexpectedly"); |
| 69 } |
| 70 return WIFEXITED(status) && WEXITSTATUS(status) == 100; |
| 71 } |
| 72 |
| 73 Sandbox::SandboxStatus Sandbox::supportsSeccompSandbox(int proc_fd) { |
| 74 // It the sandbox is currently active, we clearly must have support for |
| 75 // sandboxing. |
| 76 if (status_ == STATUS_ENABLED) { |
| 77 return status_; |
| 78 } |
| 79 |
| 80 // Even if the sandbox was previously available, something might have |
| 81 // changed in our run-time environment. Check one more time. |
| 82 if (status_ == STATUS_AVAILABLE) { |
| 83 if (!isSingleThreaded(proc_fd)) { |
| 84 status_ = STATUS_UNAVAILABLE; |
| 85 } |
| 86 return status_; |
| 87 } |
| 88 |
| 89 if (status_ == STATUS_UNAVAILABLE && isSingleThreaded(proc_fd)) { |
| 90 // All state transitions resulting in STATUS_UNAVAILABLE are immediately |
| 91 // preceded by STATUS_AVAILABLE. Furthermore, these transitions all |
| 92 // happen, if and only if they are triggered by the process being multi- |
| 93 // threaded. |
| 94 // In other words, if a single-threaded process is currently in the |
| 95 // STATUS_UNAVAILABLE state, it is safe to assume that sandboxing is |
| 96 // actually available. |
| 97 status_ == STATUS_AVAILABLE; |
| 98 return status_; |
| 99 } |
| 100 |
| 101 // If we have not previously checked for availability of the sandbox or if |
| 102 // we otherwise don't believe to have a good cached value, we have to |
| 103 // perform a thorough check now. |
| 104 if (status_ == STATUS_UNKNOWN) { |
| 105 status_ = kernelSupportSeccompBPF(proc_fd) |
| 106 ? STATUS_AVAILABLE : STATUS_UNSUPPORTED; |
| 107 |
| 108 // As we are performing our tests from a child process, the run-time |
| 109 // environment that is visible to the sandbox is always guaranteed to be |
| 110 // single-threaded. Let's check here whether the caller is single- |
| 111 // threaded. Otherwise, we mark the sandbox as temporarily unavailable. |
| 112 if (status_ == STATUS_AVAILABLE && !isSingleThreaded(proc_fd)) { |
| 113 status_ = STATUS_UNAVAILABLE; |
| 114 } |
| 115 } |
| 116 return status_; |
| 117 } |
| 118 |
// Stores |proc_fd| in proc_fd_. startSandbox() reads proc_fd_, opens /proc
// itself if the stored value is negative, and closes the descriptor before
// installing the filter.
void Sandbox::setProcFd(int proc_fd) {
  proc_fd_ = proc_fd;
}
| 122 |
| 123 void Sandbox::startSandbox() { |
| 124 if (status_ == STATUS_UNSUPPORTED || status_ == STATUS_UNAVAILABLE) { |
| 125 die("Trying to start sandbox, even though it is known to be unavailable"); |
| 126 } else if (status_ == STATUS_ENABLED) { |
| 127 die("Cannot start sandbox recursively. Use multiple calls to " |
| 128 "setSandboxPolicy() to stack policies instead"); |
| 129 } |
| 130 if (proc_fd_ < 0) { |
| 131 proc_fd_ = open("/proc", O_RDONLY|O_DIRECTORY); |
| 132 } |
| 133 if (proc_fd_ < 0) { |
| 134 // For now, continue in degraded mode, if we can't access /proc. |
| 135 // In the future, we might want to tighten this requirement. |
| 136 } |
| 137 if (!isSingleThreaded(proc_fd_)) { |
| 138 die("Cannot start sandbox, if process is already multi-threaded"); |
| 139 } |
| 140 |
| 141 // We no longer need access to any files in /proc. We want to do this |
| 142 // before installing the filters, just in case that our policy denies |
| 143 // close(). |
| 144 if (proc_fd_ >= 0) { |
| 145 if (HANDLE_EINTR(close(proc_fd_))) { |
| 146 die("Failed to close file descriptor for /proc"); |
| 147 } |
| 148 proc_fd_ = -1; |
| 149 } |
| 150 |
| 151 // Install the filters. |
| 152 installFilter(); |
| 153 |
| 154 // We are now inside the sandbox. |
| 155 status_ = STATUS_ENABLED; |
| 156 } |
| 157 |
| 158 bool Sandbox::isSingleThreaded(int proc_fd) { |
| 159 if (proc_fd < 0) { |
| 160 // Cannot determine whether program is single-threaded. Hope for |
| 161 // the best... |
| 162 return true; |
| 163 } |
| 164 |
| 165 struct stat sb; |
| 166 int task = -1; |
| 167 if ((task = openat(proc_fd, "self/task", O_RDONLY|O_DIRECTORY)) < 0 || |
| 168 fstat(task, &sb) != 0 || |
| 169 sb.st_nlink != 3 || |
| 170 HANDLE_EINTR(close(task))) { |
| 171 if (task >= 0) { |
| 172 HANDLE_EINTR(close(task)); |
| 173 } |
| 174 return false; |
| 175 } |
| 176 return true; |
| 177 } |
| 178 |
| 179 void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator, |
| 180 EvaluateArguments argumentEvaluator) { |
| 181 evaluators_.push_back(std::make_pair(syscallEvaluator, argumentEvaluator)); |
| 182 } |
| 183 |
| 184 void Sandbox::installFilter() { |
| 185 // Verify that the user pushed a policy. |
| 186 if (evaluators_.empty()) { |
| 187 filter_failed: |
| 188 die("Failed to configure system call filters"); |
| 189 } |
| 190 |
| 191 // Set new SIGSYS handler |
| 192 struct sigaction sa; |
| 193 memset(&sa, 0, sizeof(sa)); |
| 194 sa.sa_sigaction = &sigSys; |
| 195 sa.sa_flags = SA_SIGINFO; |
| 196 if (sigaction(SIGSYS, &sa, NULL) < 0) { |
| 197 goto filter_failed; |
| 198 } |
| 199 |
| 200 // Unmask SIGSYS |
| 201 sigset_t mask; |
| 202 if (sigemptyset(&mask) || |
| 203 sigaddset(&mask, SIGSYS) || |
| 204 sigprocmask(SIG_UNBLOCK, &mask, NULL)) { |
| 205 goto filter_failed; |
| 206 } |
| 207 |
| 208 // We can't handle stacked evaluators, yet. We'll get there eventually |
| 209 // though. Hang tight. |
| 210 if (evaluators_.size() != 1) { |
| 211 die("Not implemented"); |
| 212 } |
| 213 |
| 214 // If the architecture doesn't match SECCOMP_ARCH, disallow the |
| 215 // system call. |
| 216 std::vector<struct sock_filter> program; |
| 217 program.push_back((struct sock_filter) |
| 218 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, |
| 219 offsetof(struct arch_seccomp_data, arch))); |
| 220 program.push_back((struct sock_filter) |
| 221 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 1, 0)); |
| 222 program.push_back((struct sock_filter) |
| 223 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO + SECCOMP_DENY_ERRNO)); |
| 224 |
| 225 // Grab the system call number, so that we can implement jump tables. |
| 226 program.push_back((struct sock_filter) |
| 227 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, nr))); |
| 228 |
| 229 // Evaluate all possible system calls and depending on their |
| 230 // exit codes generate a BPF filter. |
| 231 // This is very inefficient right now. We need to be much smarter |
| 232 // eventually. |
| 233 // We currently incur a O(N) overhead on each system call, with N |
| 234 // being the number of system calls. It is easy to get this down to |
| 235 // O(log_2(M)) with M being the number of system calls that need special |
| 236 // treatment. |
| 237 EvaluateSyscall evaluateSyscall = evaluators_.begin()->first; |
| 238 for (int sysnum = MIN_SYSCALL; sysnum <= MAX_SYSCALL; ++sysnum) { |
| 239 ErrorCode err = evaluateSyscall(sysnum); |
| 240 int ret; |
| 241 switch (err) { |
| 242 case SB_INSPECT_ARG_1...SB_INSPECT_ARG_6: |
| 243 die("Not implemented"); |
| 244 case SB_TRAP: |
| 245 ret = SECCOMP_RET_TRAP; |
| 246 break; |
| 247 case SB_ALLOWED: |
| 248 ret = SECCOMP_RET_ALLOW; |
| 249 break; |
| 250 default: |
| 251 if (err >= static_cast<ErrorCode>(1) && |
| 252 err <= static_cast<ErrorCode>(4096)) { |
| 253 // We limit errno values to a reasonable range. In fact, the Linux ABI |
| 254 // doesn't support errno values outside of this range. |
| 255 ret = SECCOMP_RET_ERRNO + err; |
| 256 } else { |
| 257 die("Invalid ErrorCode reported by sandbox system call evaluator"); |
| 258 } |
| 259 break; |
| 260 } |
| 261 program.push_back((struct sock_filter) |
| 262 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, sysnum, 0, 1)); |
| 263 program.push_back((struct sock_filter) |
| 264 BPF_STMT(BPF_RET+BPF_K, ret)); |
| 265 } |
| 266 |
| 267 // Everything that isn't allowed is forbidden. Eventually, we would |
| 268 // like to have a way to log forbidden calls, when in debug mode. |
| 269 program.push_back((struct sock_filter) |
| 270 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO + SECCOMP_DENY_ERRNO)); |
| 271 |
| 272 // Install BPF filter program |
| 273 const struct sock_fprog prog = { program.size(), &program[0] }; |
| 274 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) || |
| 275 prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) { |
| 276 goto filter_failed; |
| 277 } |
| 278 |
| 279 return; |
| 280 } |
| 281 |
| 282 void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) { |
| 283 if (nr != SIGSYS || info->si_code != SYS_SECCOMP || !void_context) { |
| 284 // die() can call LOG(FATAL). This is not normally async-signal safe |
| 285 // and can lead to bugs. We should eventually implement a different |
| 286 // logging and reporting mechanism that is safe to be called from |
| 287 // the sigSys() handler. |
| 288 die("Unexpected SIGSYS received"); |
| 289 } |
| 290 ucontext_t *ctx = reinterpret_cast<ucontext_t *>(void_context); |
| 291 int old_errno = errno; |
| 292 |
| 293 // In case of error, set the REG_RESULT CPU register to the default |
| 294 // errno value (i.e. EPERM). |
| 295 // We need to be very careful when doing this, as some of our target |
| 296 // platforms have pointer types and CPU registers that are wider than |
| 297 // ints. Furthermore, the kernel ABI requires us to return a negative |
| 298 // value, but errno values are usually positive. And in fact, it would |
| 299 // be perfectly reasonable for somebody to have defined them as unsigned |
| 300 // properties. This makes the correct incantation of type casts rather |
| 301 // subtle. Sometimes, C++ is just too smart for its own good. |
| 302 void *rc = (void *)(intptr_t)-(int)SECCOMP_DENY_ERRNO; |
| 303 |
| 304 // This is where we can add extra code to handle complex system calls. |
| 305 // ... |
| 306 |
| 307 ctx->uc_mcontext.gregs[REG_RESULT] = reinterpret_cast<greg_t>(rc); |
| 308 errno = old_errno; |
| 309 return; |
| 310 } |
| 311 |
| 312 |
// Definitions of the static data members of Sandbox.

// Set to true by the probe child in kernelSupportSeccompBPF().
bool Sandbox::suppressLogging_ = false;
// Cached availability state; updated by supportsSeccompSandbox() and set to
// STATUS_ENABLED by startSandbox().
Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN;
// Descriptor for /proc, or -1 if none is held.
int Sandbox::proc_fd_ = -1;
// Policies registered through setSandboxPolicy(), in registration order.
std::vector<std::pair<Sandbox::EvaluateSyscall,
                      Sandbox::EvaluateArguments> > Sandbox::evaluators_;
| 318 |
| 319 } // namespace |
OLD | NEW |