sandbox/linux/seccomp-bpf/sandbox_bpf.cc - Issue 10536048: Instead of outputting one BPF check per possible system call. Coalesce

Side by Side Diff: sandbox/linux/seccomp-bpf/sandbox_bpf.cc

Issue 10536048: Instead of outputting one BPF check per possible system call. Coalesce (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Simplified the asserts Created 8 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "sandbox/linux/seccomp-bpf/sandbox_bpf.h"	5 #include "sandbox/linux/seccomp-bpf/sandbox_bpf.h"

6 #include "sandbox/linux/seccomp-bpf/verifier.h"	6 #include "sandbox/linux/seccomp-bpf/verifier.h"

7	7

8 // The kernel gives us a sandbox, we turn it into a playground :-)	8 // The kernel gives us a sandbox, we turn it into a playground :-)

9 // This is version 2 of the playground; version 1 was built on top of	9 // This is version 2 of the playground; version 1 was built on top of

10 // pre-BPF seccomp mode.	10 // pre-BPF seccomp mode.

(...skipping 159 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
170 sb.st_nlink != 3 \|\|	170 sb.st_nlink != 3 \|\|

171 HANDLE_EINTR(close(task))) {	171 HANDLE_EINTR(close(task))) {

172 if (task >= 0) {	172 if (task >= 0) {

173 (void) HANDLE_EINTR(close(task));	173 (void) HANDLE_EINTR(close(task));

174 }	174 }

175 return false;	175 return false;

176 }	176 }

177 return true;	177 return true;

178 }	178 }

179	179

	180 static bool isDenied(Sandbox::ErrorCode code) {

	181 return code == Sandbox::SB_TRAP \|\|

	182 (code >= (Sandbox::ErrorCode)1 &&

	183 code <= (Sandbox::ErrorCode)4095); // errno value

	184 }

	185

180 void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator,	186 void Sandbox::setSandboxPolicy(EvaluateSyscall syscallEvaluator,

181 EvaluateArguments argumentEvaluator) {	187 EvaluateArguments argumentEvaluator) {

	188 // Do some sanity checks on the policy. This will warn users if they do

	189 // things that are likely unsafe and unintended.

	190 // We also have similar checks later, when we actually compile the BPF

	191 // program. That catches problems with incorrectly stacked evaluators.

	192 if (!isDenied(syscallEvaluator(-1))) {

	193 die("Negative system calls should always be disallowed by policy");

	194 }

	195 #if defined(__i386__) \|\| defined(__x86_64__)

	196 #if defined(__x86_64__) && defined(__ILP32__)

	197 for (unsigned int sysnum = MIN_SYSCALL & ~0x40000000u;

	198 sysnum <= (MAX_SYSCALL & ~0x40000000u);

	199 ++sysnum) {

	200 if (!isDenied(syscallEvaluator(sysnum))) {

	201 die("In x32 mode, you should not allow any non-x32 system calls");

	202 }

	203 }

	204 #else

	205 for (unsigned int sysnum = MIN_SYSCALL \| 0x40000000u;

	206 sysnum <= (MAX_SYSCALL \| 0x40000000u);

	207 ++sysnum) {

	208 if (!isDenied(syscallEvaluator(sysnum))) {

	209 die("x32 system calls should be explicitly disallowed");

	210 }

	211 }

	212 #endif

	213 #endif

	214 // Check interesting boundary values just outside of the valid system call

	215 // range: 0x7FFFFFFF, 0x80000000, 0xFFFFFFFF, MIN_SYSCALL-1, MAX_SYSCALL+1.

	216 // They all should be denied.

	217 if (!isDenied(syscallEvaluator(std::numeric_limits<int>::max())) \|\|

	218 !isDenied(syscallEvaluator(std::numeric_limits<int>::min())) \|\|

	219 !isDenied(syscallEvaluator(-1)) \|\|

	220 !isDenied(syscallEvaluator(static_cast<int>(MIN_SYSCALL) - 1)) \|\|

	221 !isDenied(syscallEvaluator(static_cast<int>(MAX_SYSCALL) + 1))) {

	222 die("Even for default-allow policies, you must never allow system calls "

	223 "outside of the standard system call range");

	224 }

	225
	Jorge Lucangeli Obes 2012/06/11 19:39:10 Since this method is called "setSandboxPolicy", doesn't it make sense to extract the sanity checks to another method? Markus (顧孟勤) 2012/06/11 19:56:50 On 2012/06/11 19:39:10, Jorge Lucangeli Obes wrote: > Since this method is called "setSandboxPolicy", doesn't it make sense to extract > the sanity checks to another method? Done.
182 evaluators_.push_back(std::make_pair(syscallEvaluator, argumentEvaluator));	226 evaluators_.push_back(std::make_pair(syscallEvaluator, argumentEvaluator));

183 }	227 }

184	228

185 void Sandbox::installFilter() {	229 void Sandbox::installFilter() {

186 // Verify that the user pushed a policy.	230 // Verify that the user pushed a policy.

187 if (evaluators_.empty()) {	231 if (evaluators_.empty()) {

188 filter_failed:	232 filter_failed:

189 die("Failed to configure system call filters");	233 die("Failed to configure system call filters");

190 }	234 }

191	235

(...skipping 15 matching lines...) Expand all Loading...
207 }	251 }

208	252

209 // We can't handle stacked evaluators, yet. We'll get there eventually	253 // We can't handle stacked evaluators, yet. We'll get there eventually

210 // though. Hang tight.	254 // though. Hang tight.

211 if (evaluators_.size() != 1) {	255 if (evaluators_.size() != 1) {

212 die("Not implemented");	256 die("Not implemented");

213 }	257 }

214	258

215 // If the architecture doesn't match SECCOMP_ARCH, disallow the	259 // If the architecture doesn't match SECCOMP_ARCH, disallow the

216 // system call.	260 // system call.

217 std::vector<struct sock_filter> program;	261 Program program;

218 program.push_back((struct sock_filter)	262 program.push_back((struct sock_filter)

219 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, arch)));	263 BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct arch_seccomp_data, arch)));

220 program.push_back((struct sock_filter)	264 program.push_back((struct sock_filter)

221 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 1, 0));	265 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, SECCOMP_ARCH, 1, 0));

222	266

223 // TODO: Instead of killing outright, we should raise a SIGSYS and	267 // TODO: Instead of killing outright, we should raise a SIGSYS and

224 // report a useful error message. SIGKILL cannot be trapped by the	268 // report a useful error message. SIGKILL cannot be trapped by the

225 // debugger and essentially makes the program fail in a way that is	269 // debugger and essentially makes the program fail in a way that is

226 // almost impossible to debug.	270 // almost impossible to debug.

227 program.push_back((struct sock_filter)	271 program.push_back((struct sock_filter)

(...skipping 12 matching lines...) Expand all Loading...
240 BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 1, 0));	284 BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 1, 0));

241 #else	285 #else

242 program.push_back((struct sock_filter)	286 program.push_back((struct sock_filter)

243 BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, 1));	287 BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, 0x40000000, 0, 1));

244 #endif	288 #endif

245 // TODO: raise a suitable SIGSYS signal	289 // TODO: raise a suitable SIGSYS signal

246 program.push_back((struct sock_filter)	290 program.push_back((struct sock_filter)

247 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL));	291 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL));

248 #endif	292 #endif

249	293

250 // Evaluate all possible system calls and depending on their	294 // Evaluate all possible system calls and group their ErrorCodes into

251 // exit codes generate a BPF filter.	295 // ranges of identical codes.

252 // This is very inefficient right now. We need to be much smarter	296 Ranges ranges;

253 // eventually.	297 findRanges(&ranges);

254 // We currently incur a O(N) overhead on each system call, with N	298

255 // being the number of system calls. It is easy to get this down to	299 // Compile the system call ranges to an optimized BPF program

256 // O(log_2(M)) with M being the number of system calls that need special	300 rangesToBPF(&program, ranges);

257 // treatment.	301

	302 // Everything that isn't allowed is forbidden. Eventually, we would

	303 // like to have a way to log forbidden calls, when in debug mode.

	304 program.push_back((struct sock_filter)

	305 BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO + SECCOMP_DENY_ERRNO));

	306

	307 // Make sure compilation resulted in BPF program that executes

	308 // correctly. Otherwise, there is an internal error in our BPF compiler.

	309 // There is really nothing the caller can do until the bug is fixed.

	310 const char *err;

	311 if (!Verifier::verifyBPF(program, evaluators_, &err)) {

	312 die(err);

	313 }

	314

	315 // Install BPF filter program

	316 const struct sock_fprog prog = { program.size(), &program[0] };

	317 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) \|\|

	318 prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {

	319 goto filter_failed;

	320 }

	321

	322 return;

	323 }

	324

	325 void Sandbox::findRanges(Ranges *ranges) {

	326 // Please note that "struct seccomp_data" defines system calls as a signed

	327 // int32_t, but BPF instructions always operate on unsigned quantities. We

	328 // deal with this disparity by enumerating from MIN_SYSCALL to MAX_SYSCALL,

	329 // and then verifying that the rest of the number range (both positive and

	330 // negative) all return the same ErrorCode.

258 EvaluateSyscall evaluateSyscall = evaluators_.begin()->first;	331 EvaluateSyscall evaluateSyscall = evaluators_.begin()->first;

259 for (int sysnum = MIN_SYSCALL; sysnum <= MAX_SYSCALL+1; ++sysnum) {	332 uint32_t oldSysnum = 0;

260 ErrorCode err = evaluateSyscall(sysnum);	333 ErrorCode oldErr = evaluateSyscall(oldSysnum);

	334 for (uint32_t sysnum = std::max(1u, MIN_SYSCALL);

	335 sysnum <= MAX_SYSCALL + 1;

	336 ++sysnum) {

	337 ErrorCode err = evaluateSyscall(static_cast<int>(sysnum));

	338 if (err != oldErr) {

	339 ranges->push_back(Range(oldSysnum, sysnum-1, oldErr));

	340 oldSysnum = sysnum;

	341 oldErr = err;

	342 }

	343 }
	Jorge Lucangeli Obes 2012/06/11 19:39:10 This code only coalesces contiguous system calls with the same return value, right? We still cannot have a concept of "hot" syscalls. But if we merge the binary search support, it shouldn't be a problem. Markus (顧孟勤) 2012/06/11 19:56:50 Have you actually done any benchmarks and does this cost show up anywhere? I am quite curious. My gut feeling is that the number of executed BPF instructions is pretty much negligible on most CPUs, even if the kernel decides to interpret the BPF filter instead of using a JIT. What I do suspect will matter is the overall size of the BPF filter program and the overall density of the code. These two factors determine how frequently the CPU needs to reach out to memory instead of being able to load the BPF program from cache. Coalescing of ranges already helps somewhat with this goal, and the binary tree helps even more. For the vast majority of applications, cache footprint is the limiting factor; clock cycles per instruction are not, and haven't been for about ten years or so. These numbers might have changed again, but a few years ago, you could literally execute several thousand instructions while waiting for a single memory load. Of course, once memory starts loading, it will then try to aggressively stream some more memory. That's where code density helps us. And that's where BPF's forward-only jumps are quite nice.
	344

	345 // As we looped all the way past the valid system calls (i.e. MAX_SYSCALL+1),

	346 // "oldErr" should at this point be the "default" policy for all system call

	347 // numbers that don't have an explicit handler in the system call evaluator.

	348 // But as we are quite paranoid, we perform some more sanity checks to verify

	349 // that there actually is a consistent "default" policy in the first place.

	350 // We don't actually iterate over all possible 2^32 values, though. We just

	351 // perform spot checks at the boundaries.

	352 // The cases that we test are: 0x7FFFFFFF, 0x80000000, 0xFFFFFFFF.

	353 if (oldErr != evaluateSyscall(std::numeric_limits<int>::max()) \|\|

	354 oldErr != evaluateSyscall(std::numeric_limits<int>::min()) \|\|

	355 oldErr != evaluateSyscall(-1)) {

	356 die("Invalid seccomp policy");

	357 }

	358 ranges->push_back(

	359 Range(oldSysnum, std::numeric_limits<unsigned>::max(), oldErr));

	360 }

	361

	362 void Sandbox::rangesToBPF(Program *program, const Ranges& ranges) {

	363 // TODO: We currently search linearly through all ranges. An improved

	364 // algorithm should be doing a binary search.

	365

	366 // System call ranges must cover the entire number range.

	367 if (ranges.empty() \|\|

	368 ranges.begin()->from != 0 \|\|

	369 ranges.back().to != std::numeric_limits<unsigned>::max()) {

	370 rangeError:

	371 die("Invalid set of system call ranges");

	372 }

	373 uint32_t from = 0;

	374 for (Ranges::const_iterator iter = ranges.begin();

	375 iter != ranges.end();

	376 ++iter) {

	377 // Ranges must be contiguous and monotonically increasing.

	378 if (iter->from > iter->to \|\|

	379 iter->from != from) {

	380 goto rangeError;

	381 }

	382 from = iter->to + 1;

	383

	384 // Convert ErrorCodes to return values that are acceptable for

	385 // BPF filters.

261 int ret;	386 int ret;

262 switch (err) {	387 switch (iter->err) {

263 case SB_INSPECT_ARG_1...SB_INSPECT_ARG_6:	388 case SB_INSPECT_ARG_1...SB_INSPECT_ARG_6:

264 die("Not implemented");	389 die("Not implemented");

265 case SB_TRAP:	390 case SB_TRAP:

266 ret = SECCOMP_RET_TRAP;	391 ret = SECCOMP_RET_TRAP;

267 break;	392 break;

268 case SB_ALLOWED:	393 case SB_ALLOWED:

269 ret = SECCOMP_RET_ALLOW;	394 ret = SECCOMP_RET_ALLOW;

270 break;	395 break;

271 default:	396 default:

272 if (err >= static_cast<ErrorCode>(1) &&	397 if (iter->err >= static_cast<ErrorCode>(1) &&

273 err <= static_cast<ErrorCode>(4096)) {	398 iter->err <= static_cast<ErrorCode>(4096)) {

274 // We limit errno values to a reasonable range. In fact, the Linux ABI	399 // We limit errno values to a reasonable range. In fact, the Linux ABI

275 // doesn't support errno values outside of this range.	400 // doesn't support errno values outside of this range.

276 ret = SECCOMP_RET_ERRNO + err;	401 ret = SECCOMP_RET_ERRNO + iter->err;

277 } else {	402 } else {

278 die("Invalid ErrorCode reported by sandbox system call evaluator");	403 die("Invalid ErrorCode reported by sandbox system call evaluator");

279 }	404 }

280 break;	405 break;

281 }	406 }

282 if (sysnum <= MAX_SYSCALL) {	407

283 // We compute the default behavior (e.g. fail open or fail closed) by	408 // Emit BPF instructions matching this range.

284 // calling the system call evaluator with a system call bigger than	409 if (iter->to != std::numeric_limits<unsigned>::max()) {

285 // MAX_SYSCALL.	410 program->push_back((struct sock_filter)

286 // In other words, the very last iteration in our loop becomes the	411 BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, iter->to, 1, 0));

287 // fallback case and we don't need to do any comparisons.

288 program.push_back((struct sock_filter)

289 BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, sysnum, 0, 1));

290 }	412 }

291 program.push_back((struct sock_filter)	413 program->push_back((struct sock_filter)

292 BPF_STMT(BPF_RET+BPF_K, ret));	414 BPF_STMT(BPF_RET+BPF_K, ret));

293 }	415 }

294

295 // Make sure compilation resulted in BPF program that executes

296 // correctly. Otherwise, there is an internal error in our BPF compiler.

297 // There is really nothing the caller can do until the bug is fixed.

298 const char *err;

299 if (!Verifier::verifyBPF(program, evaluators_, &err)) {

300 die(err);

301 }

302

303 // Install BPF filter program

304 const struct sock_fprog prog = { program.size(), &program[0] };

305 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) \|\|

306 prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)) {

307 goto filter_failed;

308 }

309

310 return;	416 return;

311 }	417 }

312	418

313 void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) {	419 void Sandbox::sigSys(int nr, siginfo_t *info, void *void_context) {

314 if (nr != SIGSYS \|\| info->si_code != SYS_SECCOMP \|\| !void_context) {	420 if (nr != SIGSYS \|\| info->si_code != SYS_SECCOMP \|\| !void_context) {

315 // die() can call LOG(FATAL). This is not normally async-signal safe	421 // die() can call LOG(FATAL). This is not normally async-signal safe

316 // and can lead to bugs. We should eventually implement a different	422 // and can lead to bugs. We should eventually implement a different

317 // logging and reporting mechanism that is safe to be called from	423 // logging and reporting mechanism that is safe to be called from

318 // the sigSys() handler.	424 // the sigSys() handler.

319 die("Unexpected SIGSYS received");	425 die("Unexpected SIGSYS received");

(...skipping 20 matching lines...) Expand all Loading...
340 return;	446 return;

341 }	447 }

342	448

343	449

344 bool Sandbox::suppressLogging_ = false;	450 bool Sandbox::suppressLogging_ = false;

345 Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN;	451 Sandbox::SandboxStatus Sandbox::status_ = STATUS_UNKNOWN;

346 int Sandbox::proc_fd_ = -1;	452 int Sandbox::proc_fd_ = -1;

347 Sandbox::Evaluators Sandbox::evaluators_;	453 Sandbox::Evaluators Sandbox::evaluators_;

348	454

349 } // namespace	455 } // namespace

OLD	NEW

« no previous file with comments | « sandbox/linux/seccomp-bpf/sandbox_bpf.h ('k') | sandbox/linux/seccomp-bpf/verifier.cc » ('j') | no next file with comments »