1 .\" Copyright (C) 2014 Kees Cook <keescook@chromium.org>
2 .\" and Copyright (C) 2012 Will Drewry <wad@chromium.org>
3 .\" and Copyright (C) 2008, 2014 Michael Kerrisk <mtk.manpages@gmail.com>
5 .\" %%%LICENSE_START(VERBATIM)
6 .\" Permission is granted to make and distribute verbatim copies of this
7 .\" manual provided the copyright notice and this permission notice are
8 .\" preserved on all copies.
10 .\" Permission is granted to copy and distribute modified versions of this
11 .\" manual under the conditions for verbatim copying, provided that the
12 .\" entire resulting derived work is distributed under the terms of a
13 .\" permission notice identical to this one.
15 .\" Since the Linux kernel and libraries are constantly changing, this
16 .\" manual page may be incorrect or out-of-date. The author(s) assume no
17 .\" responsibility for errors or omissions, or for damages resulting from
18 .\" the use of the information contained herein. The author(s) may not
19 .\" have taken the same level of care in the production of this manual,
20 .\" which is licensed free of charge, as they might when working
23 .\" Formatted or processed versions of this manual, if unaccompanied by
24 .\" the source, must acknowledge the copyright and authors of this work.
27 .TH SECCOMP 2 2015-01-10 "Linux" "Linux Programmer's Manual"
29 seccomp \- operate on Secure Computing state of the process
32 .B #include <linux/seccomp.h>
33 .B #include <linux/filter.h>
34 .B #include <linux/audit.h>
35 .B #include <linux/signal.h>
36 .B #include <sys/ptrace.h>
37 .\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will
38 .\" need <sys/ptrace.h>
40 .BI "int seccomp(unsigned int " operation ", unsigned int " flags \
46 system call operates on the Secure Computing (seccomp) state of the
49 Currently, Linux supports the following
53 .BR SECCOMP_SET_MODE_STRICT
54 The only system calls that the calling thread is permitted to make are
60 Other system calls result in the delivery of a
63 Strict secure computing mode is useful for number-crunching
64 applications that may need to execute untrusted byte code, perhaps
65 obtained by reading from a pipe or socket.
67 This operation is available only if the kernel is configured with
77 This operation is functionally identical to the call:
79 prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
81 .BR SECCOMP_SET_MODE_FILTER
82 The system calls allowed are defined by a pointer to a Berkeley Packet
83 Filter (BPF) passed via
85 This argument is a pointer to a
86 .IR "struct\ sock_fprog" ;
87 it can be designed to filter arbitrary system calls and system call
89 If the filter is invalid,
100 is allowed by the filter, any child processes will be constrained to
101 the same system call filters as the parent.
105 the existing filters will be preserved across a call to
109 .BR SECCOMP_SET_MODE_FILTER
110 operation, either the caller must have the
112 capability, or the thread must already have the
115 If that bit was not already set by an ancestor of this thread,
116 the thread must make the following call:
118 prctl(PR_SET_NO_NEW_PRIVS, 1);
121 .BR SECCOMP_SET_MODE_FILTER
122 operation will fail and return
126 This requirement ensures that an unprivileged process cannot apply
127 a malicious filter and then invoke a set-user-ID or
128 other privileged program using
130 thus potentially compromising that program.
131 (Such a malicious filter might, for example, cause an attempt to use
133 to set the caller's user IDs to nonzero values to instead
134 return 0 without actually making the system call.
135 Thus, the program might be tricked into retaining superuser privileges
136 in circumstances where it is possible to influence it to do
137 dangerous things because it did not actually drop privileges.)
143 is allowed by the attached filter, further filters may be added.
144 This will increase evaluation time, but allows for further reduction of
145 the attack surface during execution of a thread.
148 .BR SECCOMP_SET_MODE_FILTER
149 operation is available only if the kernel is configured with
150 .BR CONFIG_SECCOMP_FILTER
155 is 0, this operation is functionally identical to the call:
157 prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);
164 .BR SECCOMP_FILTER_FLAG_TSYNC
165 When adding a new filter, synchronize all other threads of the calling
166 process to the same seccomp filter tree.
167 A "filter tree" is the ordered list of filters attached to a thread.
168 (Attaching identical filters in separate
170 calls results in different filters from this perspective.)
172 If any thread cannot synchronize to the same filter tree,
173 the call will not attach the new seccomp filter,
174 and will fail, returning the first thread ID found that cannot synchronize.
175 Synchronization will fail if another thread in the same process is in
176 .BR SECCOMP_MODE_STRICT
177 or if it has attached new seccomp filters to itself,
178 diverging from the calling thread's filter tree.
181 When adding filters via
182 .BR SECCOMP_SET_MODE_FILTER ,
184 points to a filter program:
189 unsigned short len; /* Number of BPF instructions */
190 struct sock_filter *filter; /* Pointer to array of
196 Each program must contain one or more BPF instructions:
200 struct sock_filter { /* Filter block */
201 __u16 code; /* Actual filter code */
202 __u8 jt; /* Jump true */
203 __u8 jf; /* Jump false */
204 __u32 k; /* Generic multiuse field */
209 When executing the instructions, the BPF program operates on the
210 system call information made available (i.e., use the
212 addressing mode) as a buffer of the following form:
216 struct seccomp_data {
217 int nr; /* System call number */
218 __u32 arch; /* AUDIT_ARCH_* value
219 (see <linux/audit.h>) */
220 __u64 instruction_pointer; /* CPU instruction pointer */
221 __u64 args[6]; /* Up to 6 system call arguments */
226 A seccomp filter returns a 32-bit value consisting of two parts:
227 the most significant 16 bits
228 (corresponding to the mask defined by the constant
229 .BR SECCOMP_RET_ACTION )
230 contain one of the "action" values listed below;
231 the least significant 16 bits (defined by the constant
232 .BR SECCOMP_RET_DATA )
233 are "data" to be associated with this return value.
235 If multiple filters exist, they are all executed,
236 in reverse order of their addition to the filter tree
237 (i.e., the most recently installed filter is executed first).
238 The return value for the evaluation of a given system call is the first-seen
239 .BR SECCOMP_RET_ACTION
240 value of highest precedence (along with its accompanying data)
241 returned by execution of all of the filters.
243 In decreasing order of precedence,
244 the values that may be returned by a seccomp filter are:
247 This value results in the process exiting immediately
248 without executing the system call.
249 The process terminates as though killed by a
256 This value results in the kernel sending a
258 signal to the triggering process without executing the system call.
259 Various fields will be set in the
263 associated with the signal:
271 will show the address of the system call instruction.
276 will indicate which system call was attempted.
285 portion of the filter return value.
288 The program counter will be as though the system call happened
289 (i.e., it will not point to the system call instruction).
290 The return value register will contain an architecture\-dependent value;
291 if resuming execution, set it to something appropriate for the system call.
292 (The architecture dependency is because replacing it with
294 could overwrite some useful information.)
296 .BR SECCOMP_RET_ERRNO
297 This value results in the
299 portion of the filter's return value being passed to user space as the
301 value without executing the system call.
303 .BR SECCOMP_RET_TRACE
304 When returned, this value will cause the kernel to attempt to notify a
306 tracer prior to executing the system call.
307 If there is no tracer present,
308 the system call is not executed and returns a failure status with
313 A tracer will be notified if it requests
314 .BR PTRACE_O_TRACESECCOMP
316 .IR ptrace(PTRACE_SETOPTIONS) .
317 The tracer will be notified of a
318 .BR PTRACE_EVENT_SECCOMP
321 portion of the filter's return value will be available to the tracer via
322 .BR PTRACE_GETEVENTMSG .
324 The tracer can skip the system call by changing the system call number
326 Alternatively, the tracer can change the system call
327 requested by changing the system call to a valid system call number.
328 If the tracer asks to skip the system call, then the system call will
329 appear to return the value that the tracer puts in the return value register.
331 The seccomp check will not be run again after the tracer is notified.
332 (This means that seccomp-based sandboxes
335 .BR ptrace (2)\(emeven
337 sandboxed processes\(emwithout extreme care;
338 ptracers can use this mechanism to escape from the seccomp sandbox.)
340 .BR SECCOMP_RET_ALLOW
341 This value results in the system call being executed.
347 .BR SECCOMP_FILTER_FLAG_TSYNC
349 the return value is the ID of the thread
350 that caused the synchronization failure.
351 (This ID is a kernel thread ID of the type returned by
355 On other errors, \-1 is returned, and
357 is set to indicate the cause of the error.
360 can fail for the following reasons:
363 The caller did not have the
365 capability, or had not set
368 .BR SECCOMP_SET_MODE_FILTER .
372 was not a valid address.
378 are invalid for the given
385 but the specified offset was not aligned to a 32-bit boundary or exceeded
386 .IR "sizeof(struct\ seccomp_data)" .
389 .\" See kernel/seccomp.c::seccomp_may_assign_mode() in 3.18 sources
390 A secure computing mode has already been set, and
392 differs from the existing setting.
395 .\" See stub kernel/seccomp.c::seccomp_set_mode_filter() in 3.18 sources
398 .BR SECCOMP_SET_MODE_FILTER ,
399 but the kernel was not built with
400 .B CONFIG_SECCOMP_FILTER
406 .BR SECCOMP_SET_MODE_FILTER ,
407 but the filter program pointed to by
409 was not valid or the length of the filter program was zero or exceeded
418 .\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in 3.18 sources
419 The total length of all filter programs attached
420 to the calling thread would exceed
421 .B MAX_INSNS_PER_PATH
422 (32768) instructions.
423 Note that for the purposes of calculating this limit,
424 each already existing filter program incurs an
425 overhead penalty of 4 instructions.
428 Another thread caused a failure during thread sync, but its ID could not
433 system call first appeared in Linux 3.17.
434 .\" FIXME . Add glibc version
438 system call is a nonstandard Linux extension.
443 .IR /proc/[pid]/status
444 file provides a method of viewing the seccomp mode of a process; see
448 provides a superset of the functionality provided by the
451 operation (which does not support
453 .SS Seccomp-specific BPF details
454 Note the following BPF details specific to seccomp filters:
460 size modifiers are not supported: all operations must load and store
464 To access the contents of the
468 addressing mode modifier.
472 addressing mode modifier yields an immediate mode operand
473 whose value is the size of the
477 The program below accepts four or more arguments.
478 The first three arguments are a system call number,
479 a numeric architecture identifier, and an error number.
480 The program uses these values to construct a BPF filter
481 that is used at run time to perform the following checks:
483 If the program is not running on the specified architecture,
484 the BPF filter causes system calls to fail with the error
487 If the program attempts to execute the system call with the specified number,
488 the BPF filter causes the system call to fail, with
490 being set to the specified error number.
492 The remaining command-line arguments specify
493 the pathname and additional arguments of a program
494 that the example program should attempt to execute using
496 (a library function that employs the
499 Some example runs of the program are shown below.
501 First, we display the architecture that we are running on (x86-64)
502 and then construct a shell function that looks up system call
503 numbers on this architecture:
510 cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \\
511 awk '$2 != "x32" && $3 == "'$1'" { print $1 }'
516 When the BPF filter rejects a system call (case [2] above),
517 it causes the system call to fail with the error number
518 specified on the command line.
519 In the experiments shown here, we'll use error number 99:
524 EADDRNOTAVAIL 99 Cannot assign requested address
528 In the following example, we attempt to run the command
530 but the BPF filter rejects the
532 system call, so that the command is not even executed:
536 $ \fBsyscall_nr execve\fP
539 Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>]
540 Hint for <arch>: AUDIT_ARCH_I386: 0x40000003
541 AUDIT_ARCH_X86_64: 0xC000003E
542 $ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP
543 execv: Cannot assign requested address
547 In the next example, the BPF filter rejects the
549 system call, so that, although it is successfully started, the
551 command is not able to write output:
555 $ \fBsyscall_nr write\fP
557 $ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP
561 In the final example,
562 the BPF filter rejects a system call that is not used by the
564 command, so it is able to successfully execute and produce output:
568 $ \fBsyscall_nr preadv\fP
570 $ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP
582 #include <linux/audit.h>
583 #include <linux/filter.h>
584 #include <linux/seccomp.h>
585 #include <sys/prctl.h>
588 install_filter(int syscall_nr, int t_arch, int f_errno)
590 struct sock_filter filter[] = {
591 /* [0] Load architecture from 'seccomp_data' buffer into
593 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
594 (offsetof(struct seccomp_data, arch))),
596 /* [1] Jump forward 4 instructions if architecture does not
598 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 4),
600 /* [2] Load system call number from 'seccomp_data' buffer into
602 BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
603 (offsetof(struct seccomp_data, nr))),
605 /* [3] Jump forward 1 instruction if system call number
606 does not match 'syscall_nr' */
607 BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1),
609 /* [4] Matching architecture and system call: don't execute
610 the system call, and return 'f_errno' in 'errno' */
611 BPF_STMT(BPF_RET | BPF_K,
612 SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)),
614 /* [5] Destination of system call number mismatch: allow other
616 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
618 /* [6] Destination of architecture mismatch: kill process */
619 BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
622 struct sock_fprog prog = {
623 .len = (unsigned short) (sizeof(filter) / sizeof(filter[0])),
627 if (seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog)) {
636 main(int argc, char **argv)
639 fprintf(stderr, "Usage: "
640 "%s <syscall_nr> <arch> <errno> <prog> [<args>]\\n"
641 "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\\n"
642 " AUDIT_ARCH_X86_64: 0x%X\\n"
643 "\\n", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
647 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
652 if (install_filter(strtol(argv[1], NULL, 0),
653 strtol(argv[2], NULL, 0),
654 strtol(argv[3], NULL, 0)))
657 execv(argv[4], &argv[4]);
669 The kernel source files
670 .IR Documentation/networking/filter.txt
672 .IR Documentation/prctl/seccomp_filter.txt .
674 McCanne, S. and Jacobson, V. (1992)
675 .IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" ,
676 Proceedings of the USENIX Winter 1993 Conference
677 .UR http://www.tcpdump.org/papers/bpf-usenix93.pdf
680 This page is part of release 3.79 of the Linux
683 A description of the project,
684 information about reporting bugs,
685 and the latest version of this page,
687 \%http://www.kernel.org/doc/man\-pages/.