LDP: Update original to LDP v3.77

[linuxjm/LDP_man-pages.git] / original / man2 / seccomp.2
diff --git a/original/man2/seccomp.2 b/original/man2/seccomp.2

new file mode 100644 (file)

index 0000000..a21f9fa
--- /dev/null
+++ b/original/man2/seccomp.2
@@ -0,0 +1,686 @@
+.\" Copyright (C) 2014 Kees Cook <keescook@chromium.org>
+.\" and Copyright (C) 2012 Will Drewry <wad@chromium.org>
+.\" and Copyright (C) 2008, 2014 Michael Kerrisk <mtk.manpages@gmail.com>
+.\"
+.\" %%%LICENSE_START(VERBATIM)
+.\" Permission is granted to make and distribute verbatim copies of this
+.\" manual provided the copyright notice and this permission notice are
+.\" preserved on all copies.
+.\"
+.\" Permission is granted to copy and distribute modified versions of this
+.\" manual under the conditions for verbatim copying, provided that the
+.\" entire resulting derived work is distributed under the terms of a
+.\" permission notice identical to this one.
+.\"
+.\" Since the Linux kernel and libraries are constantly changing, this
+.\" manual page may be incorrect or out-of-date.  The author(s) assume no
+.\" responsibility for errors or omissions, or for damages resulting from
+.\" the use of the information contained herein.  The author(s) may not
+.\" have taken the same level of care in the production of this manual,
+.\" which is licensed free of charge, as they might when working
+.\" professionally.
+.\"
+.\" Formatted or processed versions of this manual, if unaccompanied by
+.\" the source, must acknowledge the copyright and authors of this work.
+.\" %%%LICENSE_END
+.\"
+.TH SECCOMP 2 2015-01-10 "Linux" "Linux Programmer's Manual"
+.SH NAME
+seccomp \- operate on Secure Computing state of the process
+.SH SYNOPSIS
+.nf
+.B #include <linux/seccomp.h>
+.B #include <linux/filter.h>
+.B #include <linux/audit.h>
+.B #include <linux/signal.h>
+.B #include <sys/ptrace.h>
+.\" Kees Cook noted: Anything that uses SECCOMP_RET_TRACE returns will
+.\"                  need <sys/ptrace.h>
+
+.BI "int seccomp(unsigned int " operation ", unsigned int " flags \
+", void *" args );
+.fi
+.SH DESCRIPTION
+The
+.BR seccomp ()
+system call operates on the Secure Computing (seccomp) state of the
+calling process.
+
+Currently, Linux supports the following
+.IR operation
+values:
+.TP
+.BR SECCOMP_SET_MODE_STRICT
+The only system calls that the calling thread is permitted to make are
+.BR read (2),
+.BR write (2),
+.BR _exit (2),
+and
+.BR sigreturn (2).
+Other system calls result in the delivery of a
+.BR SIGKILL
+signal.
+Strict secure computing mode is useful for number-crunching
+applications that may need to execute untrusted byte code, perhaps
+obtained by reading from a pipe or socket.
+
+This operation is available only if the kernel is configured with
+.BR CONFIG_SECCOMP
+enabled.
+
+The value of
+.IR flags
+must be 0, and
+.IR args
+must be NULL.
+
+This operation is functionally identical to the call:
+
+    prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
+.TP
+.BR SECCOMP_SET_MODE_FILTER
+The system calls allowed are defined by a pointer to a Berkeley Packet
+Filter (BPF) passed via
+.IR args .
+This argument is a pointer to a
+.IR "struct\ sock_fprog" ;
+it can be designed to filter arbitrary system calls and system call
+arguments.
+If the filter is invalid,
+.BR seccomp ()
+fails, returning
+.BR EINVAL
+in
+.IR errno .
+
+If
+.BR fork (2)
+or
+.BR clone (2)
+is allowed by the filter, any child processes will be constrained to
+the same system call filters as the parent.
+If
+.BR execve (2)
+is allowed,
+the existing filters will be preserved across a call to
+.BR execve (2).
+
+In order to use the
+.BR SECCOMP_SET_MODE_FILTER
+operation, either the caller must have the
+.BR CAP_SYS_ADMIN
+capability, or the thread must already have the
+.I no_new_privs
+bit set.
+If that bit was not already set by an ancestor of this thread,
+the thread must make the following call:
+
+    prctl(PR_SET_NO_NEW_PRIVS, 1);
+
+Otherwise, the
+.BR SECCOMP_SET_MODE_FILTER
+operation will fail and return
+.BR EACCES
+in
+.IR errno .
+This requirement ensures that an unprivileged process cannot apply
+a malicious filter and then invoke a set-user-ID or
+other privileged program using
+.BR execve (2),
+thus potentially compromising that program.
+(Such a malicious filter might, for example, cause an attempt to use
+.BR setuid (2)
+to set the caller's user IDs to non-zero values to instead
+return 0 without actually making the system call.
+Thus, the program might be tricked into retaining superuser privileges
+in circumstances where it is possible to influence it to do
+dangerous things because it did not actually drop privileges.)
+
+If
+.BR prctl (2)
+or
+.BR seccomp (2)
+is allowed by the attached filter, further filters may be added.
+This will increase evaluation time, but allows for further reduction of
+the attack surface during execution of a thread.
+
+The
+.BR SECCOMP_SET_MODE_FILTER
+operation is available only if the kernel is configured with
+.BR CONFIG_SECCOMP_FILTER
+enabled.
+
+When
+.IR flags
+is 0, this operation is functionally identical to the call:
+
+    prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, args);
+
+The recognized
+.IR flags
+are:
+.RS
+.TP
+.BR SECCOMP_FILTER_FLAG_TSYNC
+When adding a new filter, synchronize all other threads of the calling
+process to the same seccomp filter tree.
+A "filter tree" is the ordered list of filters attached to a thread.
+(Attaching identical filters in separate
+.BR seccomp ()
+calls results in different filters from this perspective.)
+
+If any thread cannot synchronize to the same filter tree,
+the call will not attach the new seccomp filter,
+and will fail, returning the first thread ID found that cannot synchronize.
+Synchronization will fail if another thread in the same process is in
+.BR SECCOMP_MODE_STRICT
+or if it has attached new seccomp filters to itself,
+diverging from the calling thread's filter tree.
+.RE
+.SS Filters
+When adding filters via
+.BR SECCOMP_SET_MODE_FILTER ,
+.IR args
+points to a filter program:
+
+.in +4n
+.nf
+struct sock_fprog {
+    unsigned short      len;    /* Number of BPF instructions */
+    struct sock_filter *filter; /* Pointer to array of
+                                   BPF instructions */
+};
+.fi
+.in
+
+Each program must contain one or more BPF instructions:
+
+.in +4n
+.nf
+struct sock_filter {            /* Filter block */
+    __u16 code;                 /* Actual filter code */
+    __u8  jt;                   /* Jump true */
+    __u8  jf;                   /* Jump false */
+    __u32 k;                    /* Generic multiuse field */
+};
+.fi
+.in
+
+When executing the instructions, the BPF program operates on the
+system call information made available (i.e., use the
+.BR BPF_ABS
+addressing mode) as a buffer of the following form:
+
+.in +4n
+.nf
+struct seccomp_data {
+    int   nr;                   /* System call number */
+    __u32 arch;                 /* AUDIT_ARCH_* value
+                                   (see <linux/audit.h>) */
+    __u64 instruction_pointer;  /* CPU instruction pointer */
+    __u64 args[6];              /* Up to 6 system call arguments */
+};
+.fi
+.in
+
+A seccomp filter returns a 32-bit value consisting of two parts:
+the most significant 16 bits
+(corresponding to the mask defined by the constant
+.BR SECCOMP_RET_ACTION )
+contain one of the "action" values listed below;
+the least significant 16-bits (defined by the constant
+.BR SECCOMP_RET_DATA )
+are "data" to be associated with this return value.
+
+If multiple filters exist, they are all executed,
+in reverse order of their addition to the filter tree
+(i.e., the most recently installed filter is executed first).
+The return value for the evaluation of a given system call is the first-seen
+.BR SECCOMP_RET_ACTION
+value of highest precedence (along with its accompanying data)
+returned by execution of all of the filters.
+
+In decreasing order of precedence,
+the values that may be returned by a seccomp filter are:
+.TP
+.BR SECCOMP_RET_KILL
+This value results in the process exiting immediately
+without executing the system call.
+The process terminates as though killed by a
+.B SIGSYS
+signal
+.RI ( not
+.BR SIGKILL ).
+.TP
+.BR SECCOMP_RET_TRAP
+This value results in the kernel sending a
+.BR SIGSYS
+signal to the triggering process without executing the system call.
+Various fields will be set in the
+.I siginfo_t
+structure (see
+.BR sigaction (2))
+associated with signal:
+.RS
+.IP * 3
+.I si_signo
+will contain
+.BR SIGSYS .
+.IP *
+.IR si_call_addr
+will show the address of the system call instruction.
+.IP *
+.IR si_syscall
+and
+.IR si_arch
+will indicate which system call was attempted.
+.IP *
+.I si_code
+will contain
+.BR SYS_SECCOMP .
+.IP *
+.I si_errno
+will contain the
+.BR SECCOMP_RET_DATA
+portion of the filter return value.
+.RE
+.IP
+The program counter will be as though the system call happened
+(i.e., it will not point to the system call instruction).
+The return value register will contain an architecture\-dependent value;
+if resuming execution, set it to something appropriate for the system call.
+(The architecture dependency is because replacing it with
+.BR ENOSYS
+could overwrite some useful information.)
+.TP
+.BR SECCOMP_RET_ERRNO
+This value results in the
+.B SECCOMP_RET_DATA
+portion of the filter's return value being passed to user space as the
+.IR errno
+value without executing the system call.
+.TP
+.BR SECCOMP_RET_TRACE
+When returned, this value will cause the kernel to attempt to notify a
+.BR ptrace (2)-based
+tracer prior to executing the system call.
+If there is no tracer present,
+the system call is not executed and returns a failure status with
+.I errno
+set to
+.BR ENOSYS .
+
+A tracer will be notified if it requests
+.BR PTRACE_O_TRACESECCOMP
+using
+.IR ptrace(PTRACE_SETOPTIONS) .
+The tracer will be notified of a
+.BR PTRACE_EVENT_SECCOMP
+and the
+.BR SECCOMP_RET_DATA
+portion of the filter's return value will be available to the tracer via
+.BR PTRACE_GETEVENTMSG .
+
+The tracer can skip the system call by changing the system call number
+to \-1.
+Alternatively, the tracer can change the system call
+requested by changing the system call to a valid system call number.
+If the tracer asks to skip the system call, then the system call will
+appear to return the value that the tracer puts in the return value register.
+
+The seccomp check will not be run again after the tracer is notified.
+(This means that seccomp-based sandboxes
+.B "must not"
+allow use of
+.BR ptrace (2)\(emeven
+of other
+sandboxed processes\(emwithout extreme care;
+ptracers can use this mechanism to escape from the seccomp sandbox.)
+.TP
+.BR SECCOMP_RET_ALLOW
+This value results in the system call being executed.
+.SH RETURN VALUE
+On success,
+.BR seccomp ()
+returns 0.
+On error, if
+.BR SECCOMP_FILTER_FLAG_TSYNC
+was used,
+the return value is the ID of the thread
+that caused the synchronization failure.
+(This ID is a kernel thread ID of the type returned by
+.BR clone (2)
+and
+.BR gettid (2).)
+On other errors, \-1 is returned, and
+.IR errno
+is set to indicate the cause of the error.
+.SH ERRORS
+.BR seccomp ()
+can fail for the following reasons:
+.TP
+.BR EACCESS
+The caller did not have the
+.BR CAP_SYS_ADMIN
+capability, or had not set
+.IR no_new_privs
+before using
+.BR SECCOMP_SET_MODE_FILTER .
+.TP
+.BR EFAULT
+.IR args
+was not a valid address.
+.TP
+.BR EINVAL
+.IR operation
+is unknown; or
+.IR flags
+are invalid for the given
+.IR operation .
+.TP
+.BR EINVAL
+.I operation
+included
+.BR BPF_ABS ,
+but the specified offset was not aligned to a 32-bit boundary or exceeded
+.IR "sizeof(struct\ seccomp_data)" .
+.TP
+.BR EINVAL
+.\" See kernel/seccomp.c::seccomp_may_assign_mode() in 3.18 sources
+A secure computing mode has already been set, and
+.I operation
+differs from the existing setting.
+.TP
+.BR EINVAL
+.\" See stub kernel/seccomp.c::seccomp_set_mode_filter() in 3.18 sources
+.I operation
+specified
+.BR SECCOMP_SET_MODE_FILTER ,
+but the kernel was not built with
+.B CONFIG_SECCOMP_FILTER
+enabled.
+.TP
+.BR EINVAL
+.I operation
+specified
+.BR SECCOMP_SET_MODE_FILTER ,
+but the filter program pointed to by
+.I args
+was not valid or the length of the filter program was zero or exceeded
+.B BPF_MAXINSNS
+(4096) instructions.
+.BR EINVAL
+.TP
+.BR ENOMEM
+Out of memory.
+.TP
+.BR ENOMEM
+.\" ENOMEM in kernel/seccomp.c::seccomp_attach_filter() in 3.18 sources
+The total length of all filter programs attached
+to the calling thread would exceed
+.B MAX_INSNS_PER_PATH
+(32768) instructions.
+Note that for the purposes of calculating this limit,
+each already existing filter program incurs an
+overhead penalty of 4 instructions.
+.TP
+.BR ESRCH
+Another thread caused a failure during thread sync, but its ID could not
+be determined.
+.SH VERSIONS
+The
+.BR seccomp()
+system call first appeared in Linux 3.17.
+.\" FIXME . Add glibc version
+.SH CONFORMING TO
+The
+.BR seccomp()
+system call is a nonstandard Linux extension.
+.SH NOTES
+The
+.IR Seccomp
+field of the
+.IR /proc/[pid]/status
+file provides a method of viewing the seccomp mode of a process; see
+.BR proc (5).
+
+.BR seccomp ()
+provides a superset of the functionality provided by the
+.BR prctl (2)
+.BR PR_SET_SECCOMP
+operation (which does not support
+.IR flags ).
+.SS Seccomp-specific BPF details
+Note the following BPF details specific to seccomp filters:
+.IP * 3
+The
+.B BPF_H
+and
+.B BPF_B
+size modifiers are not supported: all operations must load and store
+(4-byte) words
+.RB ( BPF_W ).
+.IP *
+To access the contents of the
+.I seccomp_data
+buffer, use the
+.B BPF_ABS
+addressing mode modifier.
+.IP *
+The
+.B BPF_LEN
+addressing mode modifier yields an immediate mode operand
+whose value is the size of the
+.IR seccomp_data
+buffer.
+.SH EXAMPLE
+The program below accepts four or more arguments.
+The first three arguments are a system call number,
+a numeric architecture identifier, and an error number.
+The program uses these values to construct a BPF filter
+that is used at run time to perform the following checks:
+.IP [1] 4
+If the program is not running on the specified architecture,
+the BPF filter causes system calls to fail with the error
+.BR ENOSYS .
+.IP [2]
+If the program attempts to execute the system call with the specified number,
+the BPF filter causes the system call to fail, with
+.I errno
+being set to the specified error number.
+.PP
+The remaining command-line arguments specify
+the pathname and additional arguments of a program
+that the example program should attempt to execute using
+.BR execve (3)
+(a library function that employs the
+.BR execve (2)
+system call).
+Some example runs of the program are shown below.
+
+First, we display the architecture that we are running on (x86-64)
+and then construct a shell function that looks up system call
+numbers on this architecture:
+
+.nf
+.in +4n
+$ \fBuname -m\fP
+x86_64
+$ \fBsyscall_nr() {
+    cat /usr/src/linux/arch/x86/syscalls/syscall_64.tbl | \\
+    awk '$2 != "x32" && $3 == "'$1'" { print $1 }'
+}\fP
+.in
+.fi
+
+When the BPF filter rejects a system call (case [2] above),
+it causes the system call to fail with the error number
+specified on the command line.
+In the experiments shown here, we'll use error number 99:
+
+.nf
+.in +4n
+$ \fBerrno 99\fP
+EADDRNOTAVAIL 99 Cannot assign requested address
+.in
+.fi
+
+In the following example, we attempt to run the command
+.BR whoami (1),
+but the BPF filter rejects the
+.BR execve (2)
+system call, so that the command is not even executed:
+
+.nf
+.in +4n
+$ \fBsyscall_nr execve\fP
+59
+$ \fB./a.out\fP
+Usage: ./a.out <syscall_nr> <arch> <errno> <prog> [<args>]
+Hint for <arch>: AUDIT_ARCH_I386: 0x40000003
+                 AUDIT_ARCH_X86_64: 0xC000003E
+$ \fB./a.out 59 0xC000003E 99 /bin/whoami\fP
+execv: Cannot assign requested address
+.in
+.fi
+
+In the next example, the BPF filter rejects the
+.BR write (2)
+system call, so that, although it is successfully started, the
+.BR whoami (1)
+command is not able to write output:
+
+.nf
+.in +4n
+$ \fBsyscall_nr write\fP
+1
+$ \fB./a.out 1 0xC000003E 99 /bin/whoami\fP
+.in
+.fi
+
+In the final example,
+the BPF filter rejects a system call that is not used by the
+.BR whoami (1)
+command, so it is able to successfully execute and produce output:
+
+.nf
+.in +4n
+$ \fBsyscall_nr preadv\fP
+295
+$ \fB./a.out 295 0xC000003E 99 /bin/whoami\fP
+cecilia
+.in
+.fi
+.SS Program source
+.fi
+.nf
+#include <errno.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <sys/prctl.h>
+
+static int
+install_filter(int syscall_nr, int t_arch, int f_errno)
+{
+    struct sock_filter filter[] = {
+        /* [0] Load architecture from 'seccomp_data' buffer into
+               accumulator */
+        BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+                 (offsetof(struct seccomp_data, arch))),
+
+        /* [1] Jump forward 4 instructions if architecture does not
+               match 't_arch' */
+        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, t_arch, 0, 4),
+
+        /* [2] Load system call number from 'seccomp_data' buffer into
+               accumulator */
+        BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+                 (offsetof(struct seccomp_data, nr))),
+
+        /* [3] Jump forward 1 instruction if system call number
+               does not match 'syscall_nr' */
+        BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, syscall_nr, 0, 1),
+
+        /* [4] Matching architecture and system call: don't execute
+              the system call, and return 'f_errno' in 'errno' */
+        BPF_STMT(BPF_RET | BPF_K,
+                 SECCOMP_RET_ERRNO | (f_errno & SECCOMP_RET_DATA)),
+
+        /* [5] Destination of system call number mismatch: allow other
+               system calls */
+        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+
+        /* [6] Destination of architecture mismatch: kill process */
+        BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL),
+    };
+
+    struct sock_fprog prog = {
+        .len = (unsigned short) (sizeof(filter) / sizeof(filter[0])),
+        .filter = filter,
+    };
+
+    if (seccomp(SECCOMP_SET_MODE_FILTER, 0, &prog)) {
+        perror("seccomp");
+        return 1;
+    }
+
+    return 0;
+}
+
+int
+main(int argc, char **argv)
+{
+    if (argc < 5) {
+        fprintf(stderr, "Usage: "
+                "%s <syscall_nr> <arch> <errno> <prog> [<args>]\\n"
+                "Hint for <arch>: AUDIT_ARCH_I386: 0x%X\\n"
+                "                 AUDIT_ARCH_X86_64: 0x%X\\n"
+                "\\n", argv[0], AUDIT_ARCH_I386, AUDIT_ARCH_X86_64);
+        exit(EXIT_FAILURE);
+    }
+
+    if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+        perror("prctl");
+        exit(EXIT_FAILURE);
+    }
+
+    if (install_filter(strtol(argv[1], NULL, 0),
+                       strtol(argv[2], NULL, 0),
+                       strtol(argv[3], NULL, 0)))
+        exit(EXIT_FAILURE);
+
+    execv(argv[4], &argv[4]);
+    perror("execv");
+    exit(EXIT_FAILURE);
+}
+.fi
+.SH SEE ALSO
+.BR prctl (2),
+.BR ptrace (2),
+.BR signal (7),
+.BR socket (7)
+.sp
+The kernel source files
+.IR Documentation/networking/filter.txt
+and
+.IR Documentation/prctl/seccomp_filter.txt .
+.sp
+McCanne, S. and Jacobson, V. (1992)
+.IR "The BSD Packet Filter: A New Architecture for User-level Packet Capture" ,
+Proceedings of the USENIX Winter 1993 Conference
+.UR http://www.tcpdump.org/papers/bpf-usenix93.pdf
+.UE
+.SH COLOPHON
+This page is part of release 3.77 of the Linux
+.I man-pages
+project.
+A description of the project,
+information about reporting bugs,
+and the latest version of this page,
+can be found at
+\%http://www.kernel.org/doc/man\-pages/.