clone3: allow spawning processes into cgroups

author Christian Brauner <christian.brauner@ubuntu.com>

Wed, 5 Feb 2020 13:26:22 +0000 (14:26 +0100)

committer Tejun Heo <tj@kernel.org>

Wed, 12 Feb 2020 22:57:51 +0000 (17:57 -0500)
author Christian Brauner <christian.brauner@ubuntu.com>
Wed, 5 Feb 2020 13:26:22 +0000 (14:26 +0100)
committer Tejun Heo <tj@kernel.org>
Wed, 12 Feb 2020 22:57:51 +0000 (17:57 -0500)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h

index 63097cb..68c391f 100644 (file)
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -628,8 +628,9 @@ struct cgroup_subsys {
         void (*cancel_attach)(struct cgroup_taskset *tset);
         void (*attach)(struct cgroup_taskset *tset);
         void (*post_attach)(void);
-       int (*can_fork)(struct task_struct *task);
-       void (*cancel_fork)(struct task_struct *task);
+       int (*can_fork)(struct task_struct *task,
+                       struct css_set *cset);
+       void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
         void (*fork)(struct task_struct *task);
         void (*exit)(struct task_struct *task);
         void (*release)(struct task_struct *task);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index f1219b9..4598e4d 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -27,6 +27,8 @@
  
  #include <linux/cgroup-defs.h>
  
+struct kernel_clone_args;
+
  #ifdef CONFIG_CGROUPS
  
  /*
@@ -119,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
                      struct pid *pid, struct task_struct *tsk);
  
  void cgroup_fork(struct task_struct *p);
-extern int cgroup_can_fork(struct task_struct *p);
-extern void cgroup_cancel_fork(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
+extern int cgroup_can_fork(struct task_struct *p,
+                          struct kernel_clone_args *kargs);
+extern void cgroup_cancel_fork(struct task_struct *p,
+                              struct kernel_clone_args *kargs);
+extern void cgroup_post_fork(struct task_struct *p,
+                            struct kernel_clone_args *kargs);
  void cgroup_exit(struct task_struct *p);
  void cgroup_release(struct task_struct *p);
  void cgroup_free(struct task_struct *p);
@@ -705,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
                                     struct dentry *dentry) { return -EINVAL; }
  
  static inline void cgroup_fork(struct task_struct *p) {}
-static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
-static inline void cgroup_cancel_fork(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+/*
+ * Stub fork hooks: @kargs is accepted only to match the new signatures and
+ * is ignored. Nothing is prepared or released, and cgroup_can_fork() always
+ * reports success.
+ */
+static inline int cgroup_can_fork(struct task_struct *p,
+                                 struct kernel_clone_args *kargs) { return 0; }
+static inline void cgroup_cancel_fork(struct task_struct *p,
+                                     struct kernel_clone_args *kargs) {}
+static inline void cgroup_post_fork(struct task_struct *p,
+                                   struct kernel_clone_args *kargs) {}
  static inline void cgroup_exit(struct task_struct *p) {}
  static inline void cgroup_release(struct task_struct *p) {}
  static inline void cgroup_free(struct task_struct *p) {}
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h

index f187988..3835907 100644 (file)
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -13,6 +13,7 @@
  struct task_struct;
  struct rusage;
  union thread_union;
+struct css_set;
  
  /* All the bits taken by the old clone syscall. */
  #define CLONE_LEGACY_FLAGS 0xffffffffULL
@@ -29,6 +30,9 @@ struct kernel_clone_args {
         pid_t *set_tid;
         /* Number of elements in *set_tid */
         size_t set_tid_size;
+       int cgroup;
+       struct cgroup *cgrp;
+       struct css_set *cset;
  };
  
  /*
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h

index 2e3bc22..3bac0a8 100644 (file)
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -35,6 +35,7 @@
  
  /* Flags for the clone3() syscall. */
  #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
+#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
  
  /*
   * cloning flags intersect with CSIGNAL so can be used with unshare and clone3
@@ -81,6 +82,8 @@
   * @set_tid_size: This defines the size of the array referenced
   *                in @set_tid. This cannot be larger than the
   *                kernel's limit of nested PID namespaces.
+ * @cgroup:       If CLONE_INTO_CGROUP is specified set this to
+ *                a file descriptor for the cgroup.
   *
   * The structure is versioned by size and thus extensible.
   * New struct members must go at the end of the struct and
@@ -97,11 +100,13 @@ struct clone_args {
         __aligned_u64 tls;
         __aligned_u64 set_tid;
         __aligned_u64 set_tid_size;
+       __aligned_u64 cgroup;
  };
  #endif
  
  #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
  #define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
+#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
  
  /*
   * Scheduling policies
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c

index 6d8bddd..9a8a5de 100644 (file)
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -5881,8 +5881,7 @@ out:
   * @child: pointer to task_struct of forking parent process.
   *
   * A task is associated with the init_css_set until cgroup_post_fork()
- * attaches it to the parent's css_set.  Empty cg_list indicates that
- * @child isn't holding reference to its css_set.
+ * attaches it to the target css_set.
   */
  void cgroup_fork(struct task_struct *child)
  {
@@ -5909,23 +5908,153 @@ static struct cgroup *cgroup_get_from_file(struct file *f)
  }
  
  /**
+ * cgroup_css_set_fork - find or create a css_set for a child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * This function finds or creates a new css_set which the child
+ * process will be attached to in cgroup_post_fork(). By default,
+ * the child process will be given the same css_set as its parent.
+ *
+ * If CLONE_INTO_CGROUP is specified this function will try to find an
+ * existing css_set which includes the requested cgroup and if not create
+ * a new css_set that the child will be attached to later. If this function
+ * succeeds it will hold cgroup_threadgroup_rwsem on return. If
+ * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
+ * before grabbing cgroup_threadgroup_rwsem and will hold a reference
+ * to the target cgroup.
+ *
+ * On success the css_set is stored in @kargs->cset and, for
+ * CLONE_INTO_CGROUP, the target cgroup in @kargs->cgrp. On failure every
+ * lock and reference taken here is released again before returning.
+ *
+ * Return: 0 on success, a negative errno on failure: -EBADF if
+ * @kargs->cgroup is not an open file descriptor, -ENODEV if the target
+ * cgroup is dead, -ENOMEM if find_css_set() returns NULL, or the error
+ * reported by cgroup_get_from_file(), cgroup_may_write() or
+ * cgroup_attach_permissions().
+ */
+static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
+       __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
+{
+       int ret;
+       struct cgroup *dst_cgrp = NULL;
+       struct css_set *cset;
+       struct super_block *sb;
+       struct file *f;
+
+       /* cgroup_mutex is only needed, and only taken, for CLONE_INTO_CGROUP. */
+       if (kargs->flags & CLONE_INTO_CGROUP)
+               mutex_lock(&cgroup_mutex);
+
+       cgroup_threadgroup_change_begin(current);
+
+       /*
+        * Pin the parent's css_set. The reference is either handed to the
+        * child via kargs->cset (default case) or dropped again below.
+        */
+       spin_lock_irq(&css_set_lock);
+       cset = task_css_set(current);
+       get_css_set(cset);
+       spin_unlock_irq(&css_set_lock);
+
+       /* Default case: the child simply shares the parent's css_set. */
+       if (!(kargs->flags & CLONE_INTO_CGROUP)) {
+               kargs->cset = cset;
+               return 0;
+       }
+
+       f = fget_raw(kargs->cgroup);
+       if (!f) {
+               ret = -EBADF;
+               goto err;
+       }
+       sb = f->f_path.dentry->d_sb;
+
+       dst_cgrp = cgroup_get_from_file(f);
+       if (IS_ERR(dst_cgrp)) {
+               ret = PTR_ERR(dst_cgrp);
+               /* Keep the error path from calling cgroup_put() on an ERR_PTR. */
+               dst_cgrp = NULL;
+               goto err;
+       }
+
+       if (cgroup_is_dead(dst_cgrp)) {
+               ret = -ENODEV;
+               goto err;
+       }
+
+       /*
+        * Verify that the target cgroup is writable for us. This is
+        * usually done by the vfs layer but since we're not going through
+        * the vfs layer here we need to do it "manually".
+        */
+       ret = cgroup_may_write(dst_cgrp, sb);
+       if (ret)
+               goto err;
+
+       /*
+        * The last argument is true unless CLONE_THREAD is set.
+        * NOTE(review): presumably this tells the helper whether a whole new
+        * process rather than a thread is being placed - confirm against
+        * cgroup_attach_permissions().
+        */
+       ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
+                                       !(kargs->flags & CLONE_THREAD));
+       if (ret)
+               goto err;
+
+       kargs->cset = find_css_set(cset, dst_cgrp);
+       if (!kargs->cset) {
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       /*
+        * The child will use kargs->cset, so the reference on the parent's
+        * css_set is no longer needed. The reference on dst_cgrp is kept and
+        * handed to the caller through kargs->cgrp.
+        */
+       put_css_set(cset);
+       fput(f);
+       kargs->cgrp = dst_cgrp;
+       /* ret is 0 here: the last call that set it was checked above. */
+       return ret;
+
+err:
+       /*
+        * Only reached on the CLONE_INTO_CGROUP path (the default path returns
+        * before any goto), so cgroup_mutex is known to be held here.
+        */
+       cgroup_threadgroup_change_end(current);
+       mutex_unlock(&cgroup_mutex);
+       /* f is NULL when fget_raw() itself failed. */
+       if (f)
+               fput(f);
+       if (dst_cgrp)
+               cgroup_put(dst_cgrp);
+       put_css_set(cset);
+       if (kargs->cset)
+               put_css_set(kargs->cset);
+       return ret;
+}
+
+/**
+ * cgroup_css_set_put_fork - drop references we took during fork
+ * @kargs: the arguments passed to create the child process
+ *
+ * Counterpart of cgroup_css_set_fork(): ends the threadgroup change
+ * section and drops the css_set reference still held in @kargs->cset, if
+ * any. cgroup_css_set_fork() takes that reference whether or not
+ * CLONE_INTO_CGROUP was requested, so it has to be dropped unconditionally;
+ * otherwise a fork that fails after cgroup_css_set_fork() succeeded leaks
+ * the css_set. cgroup_post_fork() clears @kargs->cset once the child owns
+ * the reference, so nothing is dropped twice on the success path.
+ *
+ * If CLONE_INTO_CGROUP was requested this additionally releases
+ * cgroup_mutex and drops the reference to the target cgroup.
+ */
+static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
+       __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
+{
+       struct cgroup *cgrp = kargs->cgrp;
+       struct css_set *cset = kargs->cset;
+
+       cgroup_threadgroup_change_end(current);
+
+       /* Held for every successful cgroup_css_set_fork(), flag or not. */
+       if (cset) {
+               put_css_set(cset);
+               kargs->cset = NULL;
+       }
+
+       /* cgroup_mutex and the cgroup reference exist only for this flag. */
+       if (kargs->flags & CLONE_INTO_CGROUP) {
+               mutex_unlock(&cgroup_mutex);
+
+               if (cgrp) {
+                       cgroup_put(cgrp);
+                       kargs->cgrp = NULL;
+               }
+       }
+}
+
+/**
   * cgroup_can_fork - called on a new task before the process is exposed
   * @child: the child process
+ * @kargs: the arguments passed to create the child process
   *
+ * This prepares a new css_set for the child process which the child will
+ * be attached to in cgroup_post_fork().
   * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
   * callback returns an error, the fork aborts with that error code. This
   * allows for a cgroup subsystem to conditionally allow or deny new forks.
   */
-int cgroup_can_fork(struct task_struct *child)
-       __acquires(&cgroup_threadgroup_rwsem) __releases(&cgroup_threadgroup_rwsem)
+int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
  {
         struct cgroup_subsys *ss;
         int i, j, ret;
  
-       cgroup_threadgroup_change_begin(current);
+       ret = cgroup_css_set_fork(kargs);
+       if (ret)
+               return ret;
  
         do_each_subsys_mask(ss, i, have_canfork_callback) {
-               ret = ss->can_fork(child);
+               ret = ss->can_fork(child, kargs->cset);
                 if (ret)
                         goto out_revert;
         } while_each_subsys_mask();
@@ -5937,32 +6066,34 @@ out_revert:
                 if (j >= i)
                         break;
                 if (ss->cancel_fork)
-                       ss->cancel_fork(child);
+                       ss->cancel_fork(child, kargs->cset);
         }
  
-       cgroup_threadgroup_change_end(current);
+       cgroup_css_set_put_fork(kargs);
  
         return ret;
  }
  
  /**
-  * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
-  * @child: the child process
-  *
-  * This calls the cancel_fork() callbacks if a fork failed *after*
-  * cgroup_can_fork() succeded.
-  */
-void cgroup_cancel_fork(struct task_struct *child)
-       __releases(&cgroup_threadgroup_rwsem)
+ * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+ * @child: the child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * This calls the cancel_fork() callbacks if a fork failed *after*
+ * cgroup_can_fork() succeeded and cleans up references we took to
+ * prepare a new css_set for the child process in cgroup_can_fork().
+ */
+void cgroup_cancel_fork(struct task_struct *child,
+                       struct kernel_clone_args *kargs)
  {
         struct cgroup_subsys *ss;
         int i;
  
+       /* Each subsystem with a cancel_fork() hook gets the prepared css_set. */
         for_each_subsys(ss, i)
                 if (ss->cancel_fork)
-                       ss->cancel_fork(child);
+                       ss->cancel_fork(child, kargs->cset);
  
-       cgroup_threadgroup_change_end(current);
+       /* Ends the threadgroup change section and undoes cgroup_can_fork()'s setup. */
+       cgroup_css_set_put_fork(kargs);
  }
  
  /**
@@ -5972,22 +6103,27 @@ void cgroup_cancel_fork(struct task_struct *child)
   * Attach the child process to its css_set calling the subsystem fork()
   * callbacks.
   */
-void cgroup_post_fork(struct task_struct *child)
-       __releases(&cgroup_threadgroup_rwsem)
+void cgroup_post_fork(struct task_struct *child,
+                     struct kernel_clone_args *kargs)
+       __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
  {
         struct cgroup_subsys *ss;
         struct css_set *cset;
         int i;
  
+       cset = kargs->cset;
+       kargs->cset = NULL;
+
         spin_lock_irq(&css_set_lock);
  
         /* init tasks are special, only link regular threads */
         if (likely(child->pid)) {
                 WARN_ON_ONCE(!list_empty(&child->cg_list));
-               cset = task_css_set(current); /* current is @child's parent */
-               get_css_set(cset);
                 cset->nr_tasks++;
                 css_set_move_task(child, NULL, cset, false);
+       } else {
+               put_css_set(cset);
+               cset = NULL;
         }
  
         /*
@@ -6020,7 +6156,16 @@ void cgroup_post_fork(struct task_struct *child)
                 ss->fork(child);
         } while_each_subsys_mask();
  
-       cgroup_threadgroup_change_end(current);
+       /* Make the new cset the root_cset of the new cgroup namespace. */
+       if (kargs->flags & CLONE_NEWCGROUP) {
+               struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
+
+               get_css_set(cset);
+               child->nsproxy->cgroup_ns->root_cset = cset;
+               put_css_set(rcset);
+       }
+
+       cgroup_css_set_put_fork(kargs);
  }
  
  /**
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c

index 138059e..511af87 100644 (file)
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -33,6 +33,7 @@
  #include <linux/atomic.h>
  #include <linux/cgroup.h>
  #include <linux/slab.h>
+#include <linux/sched/task.h>
  
  #define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
  #define PIDS_MAX_STR "max"
@@ -214,13 +215,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
   * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
   * on cgroup_threadgroup_change_begin() held by the copy_process().
   */
-static int pids_can_fork(struct task_struct *task)
+static int pids_can_fork(struct task_struct *task, struct css_set *cset)
  {
         struct cgroup_subsys_state *css;
         struct pids_cgroup *pids;
         int err;
  
-       css = task_css_check(current, pids_cgrp_id, true);
+       if (cset)
+               css = cset->subsys[pids_cgrp_id];
+       else
+               css = task_css_check(current, pids_cgrp_id, true);
         pids = css_pids(css);
         err = pids_try_charge(pids, 1);
         if (err) {
@@ -235,12 +239,15 @@ static int pids_can_fork(struct task_struct *task)
         return err;
  }
  
-static void pids_cancel_fork(struct task_struct *task)
+/*
+ * Give back the single pid charged by pids_can_fork() when the fork is
+ * cancelled. @cset is the css_set prepared for the child; when it is NULL
+ * the pids css of current is used instead.
+ */
+static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
  {
         struct cgroup_subsys_state *css;
         struct pids_cgroup *pids;
  
-       css = task_css_check(current, pids_cgrp_id, true);
+       /* Same css selection as pids_can_fork(), so the charge it took is undone. */
+       if (cset)
+               css = cset->subsys[pids_cgrp_id];
+       else
+               css = task_css_check(current, pids_cgrp_id, true);
         pids = css_pids(css);
         pids_uncharge(pids, 1);
  }
diff --git a/kernel/fork.c b/kernel/fork.c

index 9245b6e..635d636 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2180,7 +2180,7 @@ static __latent_entropy struct task_struct *copy_process(
          * between here and cgroup_post_fork() if an organisation operation is in
          * progress.
          */
-       retval = cgroup_can_fork(p);
+       retval = cgroup_can_fork(p, args);
         if (retval)
                 goto bad_fork_put_pidfd;
  
@@ -2287,7 +2287,7 @@ static __latent_entropy struct task_struct *copy_process(
         write_unlock_irq(&tasklist_lock);
  
         proc_fork_connector(p);
-       cgroup_post_fork(p);
+       cgroup_post_fork(p, args);
         perf_event_fork(p);
  
         trace_task_newtask(p, clone_flags);
@@ -2298,7 +2298,7 @@ static __latent_entropy struct task_struct *copy_process(
  bad_fork_cancel_cgroup:
         spin_unlock(&current->sighand->siglock);
         write_unlock_irq(&tasklist_lock);
-       cgroup_cancel_fork(p);
+       cgroup_cancel_fork(p, args);
  bad_fork_put_pidfd:
         if (clone_flags & CLONE_PIDFD) {
                 fput(pidfile);
@@ -2627,6 +2627,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
                      !valid_signal(args.exit_signal)))
                 return -EINVAL;
  
+       /*
+        * args.cgroup is an unsigned 64-bit field, so a "< 0" test can never
+        * be true. Reject every value that does not fit the int file
+        * descriptor kept in struct kernel_clone_args instead of letting the
+        * assignment below silently truncate it.
+        */
+       if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup > INT_MAX)
+               return -EINVAL;
+
         *kargs = (struct kernel_clone_args){
                 .flags          = args.flags,
                 .pidfd          = u64_to_user_ptr(args.pidfd),
@@ -2637,6 +2640,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
                 .stack_size     = args.stack_size,
                 .tls            = args.tls,
                 .set_tid_size   = args.set_tid_size,
+               .cgroup         = args.cgroup,
         };
  
         if (args.set_tid &&
@@ -2680,7 +2684,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
  static bool clone3_args_valid(struct kernel_clone_args *kargs)
  {
         /* Verify that no unknown flags are passed along. */
-       if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
+       if (kargs->flags &
+           ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
                 return false;
  
         /*
author	Christian Brauner <christian.brauner@ubuntu.com>
	Wed, 5 Feb 2020 13:26:22 +0000 (14:26 +0100)
committer	Tejun Heo <tj@kernel.org>
	Wed, 12 Feb 2020 22:57:51 +0000 (17:57 -0500)
include/linux/cgroup-defs.h		patch \| blob \| history
include/linux/cgroup.h		patch \| blob \| history
include/linux/sched/task.h		patch \| blob \| history
include/uapi/linux/sched.h		patch \| blob \| history
kernel/cgroup/cgroup.c		patch \| blob \| history
kernel/cgroup/pids.c		patch \| blob \| history
kernel/fork.c		patch \| blob \| history