Merge branch 'for-4.2' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

author Linus Torvalds <torvalds@linux-foundation.org>

Sat, 27 Jun 2015 02:50:04 +0000 (19:50 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Sat, 27 Jun 2015 02:50:04 +0000 (19:50 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Sat, 27 Jun 2015 02:50:04 +0000 (19:50 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sat, 27 Jun 2015 02:50:04 +0000 (19:50 -0700)
diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt

index eb102fb..86847a7 100644 (file)
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -17,15 +17,18 @@ CONTENTS
  3. Structural Constraints
    3-1. Top-down
    3-2. No internal tasks
-4. Other Changes
-  4-1. [Un]populated Notification
-  4-2. Other Core Changes
-  4-3. Per-Controller Changes
-    4-3-1. blkio
-    4-3-2. cpuset
-    4-3-3. memory
-5. Planned Changes
-  5-1. CAP for resource control
+4. Delegation
+  4-1. Model of delegation
+  4-2. Common ancestor rule
+5. Other Changes
+  5-1. [Un]populated Notification
+  5-2. Other Core Changes
+  5-3. Per-Controller Changes
+    5-3-1. blkio
+    5-3-2. cpuset
+    5-3-3. memory
+6. Planned Changes
+  6-1. CAP for resource control
  
  
  1. Background
@@ -245,9 +248,72 @@ cgroup must create children and transfer all its tasks to the children
  before enabling controllers in its "cgroup.subtree_control" file.
  
  
-4. Other Changes
+4. Delegation
  
-4-1. [Un]populated Notification
+4-1. Model of delegation
+
+A cgroup can be delegated to a less privileged user by granting write
+access of the directory and its "cgroup.procs" file to the user.  Note
+that the resource control knobs in a given directory concern the
+resources of the parent and thus must not be delegated along with the
+directory.
+
+Once delegated, the user can build a sub-hierarchy under the directory,
+organize processes as it sees fit and further distribute the resources
+it got from the parent.  The limits and other settings of all resource
+controllers are hierarchical and regardless of what happens in the
+delegated sub-hierarchy, nothing can escape the resource restrictions
+imposed by the parent.
+
+Currently, cgroup doesn't impose any restrictions on the number of
+cgroups in or nesting depth of a delegated sub-hierarchy; however,
+this may in the future be limited explicitly.
+
+
+4-2. Common ancestor rule
+
+On the unified hierarchy, to write to a "cgroup.procs" file, in
+addition to the usual write permission to the file and uid match, the
+writer must also have write access to the "cgroup.procs" file of the
+common ancestor of the source and destination cgroups.  This prevents
+delegatees from smuggling processes across disjoint sub-hierarchies.
+
+Let's say cgroups C0 and C1 have been delegated to user U0 who created
+C00, C01 under C0 and C10 under C1 as follows.
+
+ ~~~~~~~~~~~~~ - C0 - C00
+ ~ cgroup    ~      \ C01
+ ~ hierarchy ~
+ ~~~~~~~~~~~~~ - C1 - C10
+
+C0 and C1 are separate entities in terms of resource distribution
+regardless of their relative positions in the hierarchy.  The
+resources the processes under C0 are entitled to are controlled by
+C0's ancestors and may be completely different from C1.  It's clear
+that the intention of delegating C0 to U0 is allowing U0 to organize
+the processes under C0 and further control the distribution of C0's
+resources.
+
+On traditional hierarchies, if a task has write access to "tasks" or
+"cgroup.procs" file of a cgroup and its uid agrees with the target, it
+can move the target to the cgroup.  In the above example, U0 will not
+only be able to move processes in each sub-hierarchy but also across
+the two sub-hierarchies, effectively allowing it to violate the
+organizational and resource restrictions implied by the hierarchical
+structure above C0 and C1.
+
+On the unified hierarchy, let's say U0 wants to write the pid of a
+process which has a matching uid and is currently in C10 into
+"C00/cgroup.procs".  U0 obviously has write access to the file and
+migration permission on the process; however, the common ancestor of
+the source cgroup C10 and the destination cgroup C00 is above the
+points of delegation and U0 would not have write access to its
+"cgroup.procs" and thus be denied with -EACCES.
+
+
+5. Other Changes
+
+5-1. [Un]populated Notification
  
  cgroup users often need a way to determine when a cgroup's
  subhierarchy becomes empty so that it can be cleaned up.  cgroup
@@ -289,7 +355,7 @@ supported and the interface files "release_agent" and
  "notify_on_release" do not exist.
  
  
-4-2. Other Core Changes
+5-2. Other Core Changes
  
  - None of the mount options is allowed.
  
@@ -306,14 +372,14 @@ supported and the interface files "release_agent" and
  - The "cgroup.clone_children" file is removed.
  
  
-4-3. Per-Controller Changes
+5-3. Per-Controller Changes
  
-4-3-1. blkio
+5-3-1. blkio
  
  - blk-throttle becomes properly hierarchical.
  
  
-4-3-2. cpuset
+5-3-2. cpuset
  
  - Tasks are kept in empty cpusets after hotplug and take on the masks
    of the nearest non-empty ancestor, instead of being moved to it.
@@ -322,7 +388,7 @@ supported and the interface files "release_agent" and
    masks of the nearest non-empty ancestor.
  
  
-4-3-3. memory
+5-3-3. memory
  
  - use_hierarchy is on by default and the cgroup file for the flag is
    not created.
@@ -407,9 +473,9 @@ supported and the interface files "release_agent" and
    memory.low, memory.high, and memory.max will use the string "max" to
    indicate and set the highest possible value.
  
-5. Planned Changes
+6. Planned Changes
  
-5-1. CAP for resource control
+6-1. CAP for resource control
  
  Unified hierarchy will require one of the capabilities(7), which is
  yet to be decided, for all resource control related knobs.  Process
diff --git a/MAINTAINERS b/MAINTAINERS

index 68457d8..c54a674 100644 (file)
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2816,6 +2816,7 @@ F:        drivers/connector/
  CONTROL GROUP (CGROUP)
  M:     Tejun Heo <tj@kernel.org>
  M:     Li Zefan <lizefan@huawei.com>
+M:     Johannes Weiner <hannes@cmpxchg.org>
  L:     cgroups@vger.kernel.org
  T:     git git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
  S:     Maintained
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h

index af9fa74..6762bfb 100644 (file)
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -76,7 +76,6 @@ extern struct kmem_cache *kernfs_node_cache;
  /*
   * inode.c
   */
-struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
  void kernfs_evict_inode(struct inode *inode);
  int kernfs_iop_permission(struct inode *inode, int mask);
  int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h

new file mode 100644 (file)

index 0000000..93755a6
--- /dev/null
+++ b/include/linux/cgroup-defs.h
@@ -0,0 +1,501 @@
+/*
+ * linux/cgroup-defs.h - basic definitions for cgroup
+ *
+ * This file provides basic type and interface.  Include this file directly
+ * only if necessary to avoid cyclic dependencies.
+ */
+#ifndef _LINUX_CGROUP_DEFS_H
+#define _LINUX_CGROUP_DEFS_H
+
+#include <linux/limits.h>
+#include <linux/list.h>
+#include <linux/idr.h>
+#include <linux/wait.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/percpu-refcount.h>
+#include <linux/percpu-rwsem.h>
+#include <linux/workqueue.h>
+
+#ifdef CONFIG_CGROUPS
+
+struct cgroup;
+struct cgroup_root;
+struct cgroup_subsys;
+struct cgroup_taskset;
+struct kernfs_node;
+struct kernfs_ops;
+struct kernfs_open_file;
+struct seq_file;
+
+#define MAX_CGROUP_TYPE_NAMELEN 32
+#define MAX_CGROUP_ROOT_NAMELEN 64
+#define MAX_CFTYPE_NAME                64
+
+/* define the enumeration of all cgroup subsystems */
+#define SUBSYS(_x) _x ## _cgrp_id,
+enum cgroup_subsys_id {
+#include <linux/cgroup_subsys.h>
+       CGROUP_SUBSYS_COUNT,
+};
+#undef SUBSYS
+
+/* bits in struct cgroup_subsys_state flags field */
+enum {
+       CSS_NO_REF      = (1 << 0), /* no reference counting for this css */
+       CSS_ONLINE      = (1 << 1), /* between ->css_online() and ->css_offline() */
+       CSS_RELEASED    = (1 << 2), /* refcnt reached zero, released */
+};
+
+/* bits in struct cgroup flags field */
+enum {
+       /* Control Group requires release notifications to userspace */
+       CGRP_NOTIFY_ON_RELEASE,
+       /*
+        * Clone the parent's configuration when creating a new child
+        * cpuset cgroup.  For historical reasons, this option can be
+        * specified at mount time and thus is implemented here.
+        */
+       CGRP_CPUSET_CLONE_CHILDREN,
+};
+
+/* cgroup_root->flags */
+enum {
+       CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */
+       CGRP_ROOT_NOPREFIX      = (1 << 1), /* mounted subsystems have no named prefix */
+       CGRP_ROOT_XATTR         = (1 << 2), /* supports extended attributes */
+};
+
+/* cftype->flags */
+enum {
+       CFTYPE_ONLY_ON_ROOT     = (1 << 0),     /* only create on root cgrp */
+       CFTYPE_NOT_ON_ROOT      = (1 << 1),     /* don't create on root cgrp */
+       CFTYPE_NO_PREFIX        = (1 << 3),     /* (DON'T USE FOR NEW FILES) no subsys prefix */
+
+       /* internal flags, do not use outside cgroup core proper */
+       __CFTYPE_ONLY_ON_DFL    = (1 << 16),    /* only on default hierarchy */
+       __CFTYPE_NOT_ON_DFL     = (1 << 17),    /* not on default hierarchy */
+};
+
+/*
+ * Per-subsystem/per-cgroup state maintained by the system.  This is the
+ * fundamental structural building block that controllers deal with.
+ *
+ * Fields marked with "PI:" are public and immutable and may be accessed
+ * directly without synchronization.
+ */
+struct cgroup_subsys_state {
+       /* PI: the cgroup that this css is attached to */
+       struct cgroup *cgroup;
+
+       /* PI: the cgroup subsystem that this css is attached to */
+       struct cgroup_subsys *ss;
+
+       /* reference count - access via css_[try]get() and css_put() */
+       struct percpu_ref refcnt;
+
+       /* PI: the parent css */
+       struct cgroup_subsys_state *parent;
+
+       /* siblings list anchored at the parent's ->children */
+       struct list_head sibling;
+       struct list_head children;
+
+       /*
+        * PI: Subsys-unique ID.  0 is unused and root is always 1.  The
+        * matching css can be looked up using css_from_id().
+        */
+       int id;
+
+       unsigned int flags;
+
+       /*
+        * Monotonically increasing unique serial number which defines a
+        * uniform order among all csses.  It's guaranteed that all
+        * ->children lists are in the ascending order of ->serial_nr and
+        * used to allow interrupting and resuming iterations.
+        */
+       u64 serial_nr;
+
+       /* percpu_ref killing and RCU release */
+       struct rcu_head rcu_head;
+       struct work_struct destroy_work;
+};
+
+/*
+ * A css_set is a structure holding pointers to a set of
+ * cgroup_subsys_state objects. This saves space in the task struct
+ * object and speeds up fork()/exit(), since a single inc/dec and a
+ * list_add()/del() can bump the reference count on the entire cgroup
+ * set for a task.
+ */
+struct css_set {
+       /* Reference count */
+       atomic_t refcount;
+
+       /*
+        * List running through all cgroup groups in the same hash
+        * slot. Protected by css_set_lock
+        */
+       struct hlist_node hlist;
+
+       /*
+        * Lists running through all tasks using this cgroup group.
+        * mg_tasks lists tasks which belong to this cset but are in the
+        * process of being migrated out or in.  Protected by
+        * css_set_rwsem, but, during migration, once tasks are moved to
+        * mg_tasks, it can be read safely while holding cgroup_mutex.
+        */
+       struct list_head tasks;
+       struct list_head mg_tasks;
+
+       /*
+        * List of cgrp_cset_links pointing at cgroups referenced from this
+        * css_set.  Protected by css_set_lock.
+        */
+       struct list_head cgrp_links;
+
+       /* the default cgroup associated with this css_set */
+       struct cgroup *dfl_cgrp;
+
+       /*
+        * Set of subsystem states, one for each subsystem. This array is
+        * immutable after creation apart from the init_css_set during
+        * subsystem registration (at boot time).
+        */
+       struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
+
+       /*
+        * List of csets participating in the on-going migration either as
+        * source or destination.  Protected by cgroup_mutex.
+        */
+       struct list_head mg_preload_node;
+       struct list_head mg_node;
+
+       /*
+        * If this cset is acting as the source of migration the following
+        * two fields are set.  mg_src_cgrp is the source cgroup of the
+        * on-going migration and mg_dst_cset is the destination cset the
+        * target tasks on this cset should be migrated to.  Protected by
+        * cgroup_mutex.
+        */
+       struct cgroup *mg_src_cgrp;
+       struct css_set *mg_dst_cset;
+
+       /*
+        * On the default hierarchy, ->subsys[ssid] may point to a css
+        * attached to an ancestor instead of the cgroup this css_set is
+        * associated with.  The following node is anchored at
+        * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
+        * iterate through all css's attached to a given cgroup.
+        */
+       struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
+
+       /* For RCU-protected deletion */
+       struct rcu_head rcu_head;
+};
+
+struct cgroup {
+       /* self css with NULL ->ss, points back to this cgroup */
+       struct cgroup_subsys_state self;
+
+       unsigned long flags;            /* "unsigned long" so bitops work */
+
+       /*
+        * idr allocated in-hierarchy ID.
+        *
+        * ID 0 is not used, the ID of the root cgroup is always 1, and a
+        * new cgroup will be assigned the smallest available ID.
+        *
+        * Allocating/Removing ID must be protected by cgroup_mutex.
+        */
+       int id;
+
+       /*
+        * If this cgroup contains any tasks, it contributes one to
+        * populated_cnt.  All children with non-zero populated_cnt of
+        * their own contribute one.  The count is zero iff there's no task
+        * in this cgroup or its subtree.
+        */
+       int populated_cnt;
+
+       struct kernfs_node *kn;         /* cgroup kernfs entry */
+       struct kernfs_node *procs_kn;   /* kn for "cgroup.procs" */
+       struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
+
+       /*
+        * The bitmask of subsystems enabled on the child cgroups.
+        * ->subtree_control is the one configured through
+        * "cgroup.subtree_control" while ->child_subsys_mask is the
+        * effective one which may have more subsystems enabled.
+        * Controller knobs are made available iff it's enabled in
+        * ->subtree_control.
+        */
+       unsigned int subtree_control;
+       unsigned int child_subsys_mask;
+
+       /* Private pointers for each registered subsystem */
+       struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
+
+       struct cgroup_root *root;
+
+       /*
+        * List of cgrp_cset_links pointing at css_sets with tasks in this
+        * cgroup.  Protected by css_set_lock.
+        */
+       struct list_head cset_links;
+
+       /*
+        * On the default hierarchy, a css_set for a cgroup with some
+        * subsys disabled will point to css's which are associated with
+        * the closest ancestor which has the subsys enabled.  The
+        * following lists all css_sets which point to this cgroup's css
+        * for the given subsystem.
+        */
+       struct list_head e_csets[CGROUP_SUBSYS_COUNT];
+
+       /*
+        * list of pidlists, up to two for each namespace (one for procs, one
+        * for tasks); created on demand.
+        */
+       struct list_head pidlists;
+       struct mutex pidlist_mutex;
+
+       /* used to wait for offlining of csses */
+       wait_queue_head_t offline_waitq;
+
+       /* used to schedule release agent */
+       struct work_struct release_agent_work;
+};
+
+/*
+ * A cgroup_root represents the root of a cgroup hierarchy, and may be
+ * associated with a kernfs_root to form an active hierarchy.  This is
+ * internal to cgroup core.  Don't access directly from controllers.
+ */
+struct cgroup_root {
+       struct kernfs_root *kf_root;
+
+       /* The bitmask of subsystems attached to this hierarchy */
+       unsigned int subsys_mask;
+
+       /* Unique id for this hierarchy. */
+       int hierarchy_id;
+
+       /* The root cgroup.  Root is destroyed on its release. */
+       struct cgroup cgrp;
+
+       /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
+       atomic_t nr_cgrps;
+
+       /* A list running through the active hierarchies */
+       struct list_head root_list;
+
+       /* Hierarchy-specific flags */
+       unsigned int flags;
+
+       /* IDs for cgroups in this hierarchy */
+       struct idr cgroup_idr;
+
+       /* The path to use for release notifications. */
+       char release_agent_path[PATH_MAX];
+
+       /* The name for this hierarchy - may be empty */
+       char name[MAX_CGROUP_ROOT_NAMELEN];
+};
+
+/*
+ * struct cftype: handler definitions for cgroup control files
+ *
+ * When reading/writing to a file:
+ *     - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata
+ *     - the 'cftype' of the file is file->f_path.dentry->d_fsdata
+ */
+struct cftype {
+       /*
+        * By convention, the name should begin with the name of the
+        * subsystem, followed by a period.  Zero length string indicates
+        * end of cftype array.
+        */
+       char name[MAX_CFTYPE_NAME];
+       int private;
+       /*
+        * If not 0, file mode is set to this value, otherwise it will
+        * be figured out automatically
+        */
+       umode_t mode;
+
+       /*
+        * The maximum length of string, excluding trailing nul, that can
+        * be passed to write.  If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed.
+        */
+       size_t max_write_len;
+
+       /* CFTYPE_* flags */
+       unsigned int flags;
+
+       /*
+        * Fields used for internal bookkeeping.  Initialized automatically
+        * during registration.
+        */
+       struct cgroup_subsys *ss;       /* NULL for cgroup core files */
+       struct list_head node;          /* anchored at ss->cfts */
+       struct kernfs_ops *kf_ops;
+
+       /*
+        * read_u64() is a shortcut for the common case of returning a
+        * single integer. Use it in place of read()
+        */
+       u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
+       /*
+        * read_s64() is a signed version of read_u64()
+        */
+       s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
+
+       /* generic seq_file read interface */
+       int (*seq_show)(struct seq_file *sf, void *v);
+
+       /* optional ops, implement all or none */
+       void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
+       void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
+       void (*seq_stop)(struct seq_file *sf, void *v);
+
+       /*
+        * write_u64() is a shortcut for the common case of accepting
+        * a single integer (as parsed by simple_strtoull) from
+        * userspace. Use in place of write(); return 0 or error.
+        */
+       int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
+                        u64 val);
+       /*
+        * write_s64() is a signed version of write_u64()
+        */
+       int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
+                        s64 val);
+
+       /*
+        * write() is the generic write callback which maps directly to
+        * kernfs write operation and overrides all other operations.
+        * Maximum write size is determined by ->max_write_len.  Use
+        * of_css/cft() to access the associated css and cft.
+        */
+       ssize_t (*write)(struct kernfs_open_file *of,
+                        char *buf, size_t nbytes, loff_t off);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       struct lock_class_key   lockdep_key;
+#endif
+};
+
+/*
+ * Control Group subsystem type.
+ * See Documentation/cgroups/cgroups.txt for details
+ */
+struct cgroup_subsys {
+       struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
+       int (*css_online)(struct cgroup_subsys_state *css);
+       void (*css_offline)(struct cgroup_subsys_state *css);
+       void (*css_released)(struct cgroup_subsys_state *css);
+       void (*css_free)(struct cgroup_subsys_state *css);
+       void (*css_reset)(struct cgroup_subsys_state *css);
+       void (*css_e_css_changed)(struct cgroup_subsys_state *css);
+
+       int (*can_attach)(struct cgroup_subsys_state *css,
+                         struct cgroup_taskset *tset);
+       void (*cancel_attach)(struct cgroup_subsys_state *css,
+                             struct cgroup_taskset *tset);
+       void (*attach)(struct cgroup_subsys_state *css,
+                      struct cgroup_taskset *tset);
+       void (*fork)(struct task_struct *task);
+       void (*exit)(struct cgroup_subsys_state *css,
+                    struct cgroup_subsys_state *old_css,
+                    struct task_struct *task);
+       void (*bind)(struct cgroup_subsys_state *root_css);
+
+       int disabled;
+       int early_init;
+
+       /*
+        * If %false, this subsystem is properly hierarchical -
+        * configuration, resource accounting and restriction on a parent
+        * cgroup cover those of its children.  If %true, hierarchy support
+        * is broken in some ways - some subsystems ignore hierarchy
+        * completely while others are only implemented half-way.
+        *
+        * It's now disallowed to create nested cgroups if the subsystem is
+        * broken and cgroup core will emit a warning message on such
+        * cases.  Eventually, all subsystems will be made properly
+        * hierarchical and this will go away.
+        */
+       bool broken_hierarchy;
+       bool warned_broken_hierarchy;
+
+       /* the following two fields are initialized automatically during boot */
+       int id;
+       const char *name;
+
+       /* link to parent, protected by cgroup_lock() */
+       struct cgroup_root *root;
+
+       /* idr for css->id */
+       struct idr css_idr;
+
+       /*
+        * List of cftypes.  Each entry is the first entry of an array
+        * terminated by zero length name.
+        */
+       struct list_head cfts;
+
+       /*
+        * Base cftypes which are automatically registered.  The two can
+        * point to the same array.
+        */
+       struct cftype *dfl_cftypes;     /* for the default hierarchy */
+       struct cftype *legacy_cftypes;  /* for the legacy hierarchies */
+
+       /*
+        * A subsystem may depend on other subsystems.  When such subsystem
+        * is enabled on a cgroup, the depended-upon subsystems are enabled
+        * together if available.  Subsystems enabled due to dependency are
+        * not visible to userland until explicitly enabled.  The following
+        * specifies the mask of subsystems that this one depends on.
+        */
+       unsigned int depends_on;
+};
+
+extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
+/**
+ * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
+ * @tsk: target task
+ *
+ * Called from threadgroup_change_begin() and allows cgroup operations to
+ * synchronize against threadgroup changes using a percpu_rw_semaphore.
+ */
+static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+{
+       percpu_down_read(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
+ * @tsk: target task
+ *
+ * Called from threadgroup_change_end().  Counterpart of
+ * cgroup_threadgroup_change_begin().
+ */
+static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
+{
+       percpu_up_read(&cgroup_threadgroup_rwsem);
+}
+
+#else  /* CONFIG_CGROUPS */
+
+#define CGROUP_SUBSYS_COUNT 0
+
+static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
+static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {}
+
+#endif /* CONFIG_CGROUPS */
+
+#endif /* _LINUX_CGROUP_DEFS_H */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index e7da0aa..a593e29 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -11,94 +11,200 @@
  #include <linux/sched.h>
  #include <linux/cpumask.h>
  #include <linux/nodemask.h>
-#include <linux/rcupdate.h>
  #include <linux/rculist.h>
  #include <linux/cgroupstats.h>
  #include <linux/rwsem.h>
-#include <linux/idr.h>
-#include <linux/workqueue.h>
  #include <linux/fs.h>
-#include <linux/percpu-refcount.h>
  #include <linux/seq_file.h>
  #include <linux/kernfs.h>
-#include <linux/wait.h>
+
+#include <linux/cgroup-defs.h>
  
  #ifdef CONFIG_CGROUPS
  
-struct cgroup_root;
-struct cgroup_subsys;
-struct cgroup;
+/* a css_task_iter should be treated as an opaque object */
+struct css_task_iter {
+       struct cgroup_subsys            *ss;
+
+       struct list_head                *cset_pos;
+       struct list_head                *cset_head;
  
-extern int cgroup_init_early(void);
-extern int cgroup_init(void);
-extern void cgroup_fork(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
-extern void cgroup_exit(struct task_struct *p);
-extern int cgroupstats_build(struct cgroupstats *stats,
-                               struct dentry *dentry);
+       struct list_head                *task_pos;
+       struct list_head                *tasks_head;
+       struct list_head                *mg_tasks_head;
+};
  
-extern int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
-                           struct pid *pid, struct task_struct *tsk);
+extern struct cgroup_root cgrp_dfl_root;
+extern struct css_set init_css_set;
  
-/* define the enumeration of all cgroup subsystems */
-#define SUBSYS(_x) _x ## _cgrp_id,
-enum cgroup_subsys_id {
+#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
  #include <linux/cgroup_subsys.h>
-       CGROUP_SUBSYS_COUNT,
-};
  #undef SUBSYS
  
+bool css_has_online_children(struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
+struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
+                                            struct cgroup_subsys *ss);
+struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
+                                                      struct cgroup_subsys *ss);
+
+bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
+int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
+
+int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
+int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
+int cgroup_rm_cftypes(struct cftype *cfts);
+
+char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry);
+int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
+                    struct pid *pid, struct task_struct *tsk);
+
+void cgroup_fork(struct task_struct *p);
+void cgroup_post_fork(struct task_struct *p);
+void cgroup_exit(struct task_struct *p);
+
+int cgroup_init_early(void);
+int cgroup_init(void);
+
  /*
- * Per-subsystem/per-cgroup state maintained by the system.  This is the
- * fundamental structural building block that controllers deal with.
+ * Iteration helpers and macros.
+ */
+
+struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
+                                          struct cgroup_subsys_state *parent);
+struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos,
+                                                   struct cgroup_subsys_state *css);
+struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos);
+struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos,
+                                                    struct cgroup_subsys_state *css);
+
+struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
+struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
+
+void css_task_iter_start(struct cgroup_subsys_state *css,
+                        struct css_task_iter *it);
+struct task_struct *css_task_iter_next(struct css_task_iter *it);
+void css_task_iter_end(struct css_task_iter *it);
+
+/**
+ * css_for_each_child - iterate through children of a css
+ * @pos: the css * to use as the loop cursor
+ * @parent: css whose children to walk
+ *
+ * Walk @parent's children.  Must be called under rcu_read_lock().
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal.  It's each subsystem's
+ * responsibility to synchronize against on/offlining.
   *
- * Fields marked with "PI:" are public and immutable and may be accessed
- * directly without synchronization.
+ * It is allowed to temporarily drop RCU read lock during iteration.  The
+ * caller is responsible for ensuring that @pos remains accessible until
+ * the start of the next iteration by, for example, bumping the css refcnt.
   */
-struct cgroup_subsys_state {
-       /* PI: the cgroup that this css is attached to */
-       struct cgroup *cgroup;
-
-       /* PI: the cgroup subsystem that this css is attached to */
-       struct cgroup_subsys *ss;
-
-       /* reference count - access via css_[try]get() and css_put() */
-       struct percpu_ref refcnt;
-
-       /* PI: the parent css */
-       struct cgroup_subsys_state *parent;
-
-       /* siblings list anchored at the parent's ->children */
-       struct list_head sibling;
-       struct list_head children;
-
-       /*
-        * PI: Subsys-unique ID.  0 is unused and root is always 1.  The
-        * matching css can be looked up using css_from_id().
-        */
-       int id;
-
-       unsigned int flags;
-
-       /*
-        * Monotonically increasing unique serial number which defines a
-        * uniform order among all csses.  It's guaranteed that all
-        * ->children lists are in the ascending order of ->serial_nr and
-        * used to allow interrupting and resuming iterations.
-        */
-       u64 serial_nr;
-
-       /* percpu_ref killing and RCU release */
-       struct rcu_head rcu_head;
-       struct work_struct destroy_work;
-};
+#define css_for_each_child(pos, parent)                                        \
+       for ((pos) = css_next_child(NULL, (parent)); (pos);             \
+            (pos) = css_next_child((pos), (parent)))
  
-/* bits in struct cgroup_subsys_state flags field */
-enum {
-       CSS_NO_REF      = (1 << 0), /* no reference counting for this css */
-       CSS_ONLINE      = (1 << 1), /* between ->css_online() and ->css_offline() */
-       CSS_RELEASED    = (1 << 2), /* refcnt reached zero, released */
-};
+/**
+ * css_for_each_descendant_pre - pre-order walk of a css's descendants
+ * @pos: the css * to use as the loop cursor
+ * @css: css whose descendants to walk
+ *
+ * Walk @css's descendants.  @css is included in the iteration and the
+ * first node to be visited.  Must be called under rcu_read_lock().
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal.  It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ *
+ * For example, the following guarantees that a descendant can't escape
+ * state updates of its ancestors.
+ *
+ * my_online(@css)
+ * {
+ *     Lock @css's parent and @css;
+ *     Inherit state from the parent;
+ *     Unlock both.
+ * }
+ *
+ * my_update_state(@css)
+ * {
+ *     css_for_each_descendant_pre(@pos, @css) {
+ *             Lock @pos;
+ *             if (@pos == @css)
+ *                     Update @css's state;
+ *             else
+ *                     Verify @pos is alive and inherit state from its parent;
+ *             Unlock @pos;
+ *     }
+ * }
+ *
+ * As long as the inheriting step, including checking the parent state, is
+ * enclosed inside @pos locking, double-locking the parent isn't necessary
+ * while inheriting.  The state update to the parent is guaranteed to be
+ * visible by walking order and, as long as inheriting operations to the
+ * same @pos are atomic to each other, multiple updates racing each other
+ * still result in the correct state.  It's guaranteed that at least one
+ * inheritance happens for any css after the latest update to its parent.
+ *
+ * If checking parent's state requires locking the parent, each inheriting
+ * iteration should lock and unlock both @pos->parent and @pos.
+ *
+ * Alternatively, a subsystem may choose to use a single global lock to
+ * synchronize ->css_online() and ->css_offline() against tree-walking
+ * operations.
+ *
+ * It is allowed to temporarily drop RCU read lock during iteration.  The
+ * caller is responsible for ensuring that @pos remains accessible until
+ * the start of the next iteration by, for example, bumping the css refcnt.
+ */
+#define css_for_each_descendant_pre(pos, css)                          \
+       for ((pos) = css_next_descendant_pre(NULL, (css)); (pos);       \
+            (pos) = css_next_descendant_pre((pos), (css)))
+
+/**
+ * css_for_each_descendant_post - post-order walk of a css's descendants
+ * @pos: the css * to use as the loop cursor
+ * @css: css whose descendants to walk
+ *
+ * Similar to css_for_each_descendant_pre() but performs post-order
+ * traversal instead.  @css is included in the iteration and the last
+ * node to be visited.
+ *
+ * If a subsystem synchronizes ->css_online() and the start of iteration, a
+ * css which finished ->css_online() is guaranteed to be visible in the
+ * future iterations and will stay visible until the last reference is put.
+ * A css which hasn't finished ->css_online() or already finished
+ * ->css_offline() may show up during traversal.  It's each subsystem's
+ * responsibility to synchronize against on/offlining.
+ *
+ * Note that the walk visibility guarantee example described in pre-order
+ * walk doesn't apply the same to post-order walks.
+ */
+#define css_for_each_descendant_post(pos, css)                         \
+       for ((pos) = css_next_descendant_post(NULL, (css)); (pos);      \
+            (pos) = css_next_descendant_post((pos), (css)))
+
+/**
+ * cgroup_taskset_for_each - iterate cgroup_taskset
+ * @task: the loop cursor
+ * @tset: taskset to iterate
+ */
+#define cgroup_taskset_for_each(task, tset)                            \
+       for ((task) = cgroup_taskset_first((tset)); (task);             \
+            (task) = cgroup_taskset_next((tset)))
+
+/*
+ * Inline functions.
+ */
  
  /**
   * css_get - obtain a reference on the specified css
@@ -157,559 +263,33 @@ static inline bool css_tryget_online(struct cgroup_subsys_state *css)
  {
         if (!(css->flags & CSS_NO_REF))
                 return percpu_ref_tryget_live(&css->refcnt);
-       return true;
-}
-
-/**
- * css_put - put a css reference
- * @css: target css
- *
- * Put a reference obtained via css_get() and css_tryget_online().
- */
-static inline void css_put(struct cgroup_subsys_state *css)
-{
-       if (!(css->flags & CSS_NO_REF))
-               percpu_ref_put(&css->refcnt);
-}
-
-/**
- * css_put_many - put css references
- * @css: target css
- * @n: number of references to put
- *
- * Put references obtained via css_get() and css_tryget_online().
- */
-static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
-{
-       if (!(css->flags & CSS_NO_REF))
-               percpu_ref_put_many(&css->refcnt, n);
-}
-
-/* bits in struct cgroup flags field */
-enum {
-       /* Control Group requires release notifications to userspace */
-       CGRP_NOTIFY_ON_RELEASE,
-       /*
-        * Clone the parent's configuration when creating a new child
-        * cpuset cgroup.  For historical reasons, this option can be
-        * specified at mount time and thus is implemented here.
-        */
-       CGRP_CPUSET_CLONE_CHILDREN,
-};
-
-struct cgroup {
-       /* self css with NULL ->ss, points back to this cgroup */
-       struct cgroup_subsys_state self;
-
-       unsigned long flags;            /* "unsigned long" so bitops work */
-
-       /*
-        * idr allocated in-hierarchy ID.
-        *
-        * ID 0 is not used, the ID of the root cgroup is always 1, and a
-        * new cgroup will be assigned with a smallest available ID.
-        *
-        * Allocating/Removing ID must be protected by cgroup_mutex.
-        */
-       int id;
-
-       /*
-        * If this cgroup contains any tasks, it contributes one to
-        * populated_cnt.  All children with non-zero popuplated_cnt of
-        * their own contribute one.  The count is zero iff there's no task
-        * in this cgroup or its subtree.
-        */
-       int populated_cnt;
-
-       struct kernfs_node *kn;         /* cgroup kernfs entry */
-       struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
-
-       /*
-        * The bitmask of subsystems enabled on the child cgroups.
-        * ->subtree_control is the one configured through
-        * "cgroup.subtree_control" while ->child_subsys_mask is the
-        * effective one which may have more subsystems enabled.
-        * Controller knobs are made available iff it's enabled in
-        * ->subtree_control.
-        */
-       unsigned int subtree_control;
-       unsigned int child_subsys_mask;
-
-       /* Private pointers for each registered subsystem */
-       struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT];
-
-       struct cgroup_root *root;
-
-       /*
-        * List of cgrp_cset_links pointing at css_sets with tasks in this
-        * cgroup.  Protected by css_set_lock.
-        */
-       struct list_head cset_links;
-
-       /*
-        * On the default hierarchy, a css_set for a cgroup with some
-        * susbsys disabled will point to css's which are associated with
-        * the closest ancestor which has the subsys enabled.  The
-        * following lists all css_sets which point to this cgroup's css
-        * for the given subsystem.
-        */
-       struct list_head e_csets[CGROUP_SUBSYS_COUNT];
-
-       /*
-        * list of pidlists, up to two for each namespace (one for procs, one
-        * for tasks); created on demand.
-        */
-       struct list_head pidlists;
-       struct mutex pidlist_mutex;
-
-       /* used to wait for offlining of csses */
-       wait_queue_head_t offline_waitq;
-
-       /* used to schedule release agent */
-       struct work_struct release_agent_work;
-};
-
-#define MAX_CGROUP_ROOT_NAMELEN 64
-
-/* cgroup_root->flags */
-enum {
-       CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */
-       CGRP_ROOT_NOPREFIX      = (1 << 1), /* mounted subsystems have no named prefix */
-       CGRP_ROOT_XATTR         = (1 << 2), /* supports extended attributes */
-};
-
-/*
- * A cgroup_root represents the root of a cgroup hierarchy, and may be
- * associated with a kernfs_root to form an active hierarchy.  This is
- * internal to cgroup core.  Don't access directly from controllers.
- */
-struct cgroup_root {
-       struct kernfs_root *kf_root;
-
-       /* The bitmask of subsystems attached to this hierarchy */
-       unsigned int subsys_mask;
-
-       /* Unique id for this hierarchy. */
-       int hierarchy_id;
-
-       /* The root cgroup.  Root is destroyed on its release. */
-       struct cgroup cgrp;
-
-       /* Number of cgroups in the hierarchy, used only for /proc/cgroups */
-       atomic_t nr_cgrps;
-
-       /* A list running through the active hierarchies */
-       struct list_head root_list;
-
-       /* Hierarchy-specific flags */
-       unsigned int flags;
-
-       /* IDs for cgroups in this hierarchy */
-       struct idr cgroup_idr;
-
-       /* The path to use for release notifications. */
-       char release_agent_path[PATH_MAX];
-
-       /* The name for this hierarchy - may be empty */
-       char name[MAX_CGROUP_ROOT_NAMELEN];
-};
-
-/*
- * A css_set is a structure holding pointers to a set of
- * cgroup_subsys_state objects. This saves space in the task struct
- * object and speeds up fork()/exit(), since a single inc/dec and a
- * list_add()/del() can bump the reference count on the entire cgroup
- * set for a task.
- */
-
-struct css_set {
-
-       /* Reference count */
-       atomic_t refcount;
-
-       /*
-        * List running through all cgroup groups in the same hash
-        * slot. Protected by css_set_lock
-        */
-       struct hlist_node hlist;
-
-       /*
-        * Lists running through all tasks using this cgroup group.
-        * mg_tasks lists tasks which belong to this cset but are in the
-        * process of being migrated out or in.  Protected by
-        * css_set_rwsem, but, during migration, once tasks are moved to
-        * mg_tasks, it can be read safely while holding cgroup_mutex.
-        */
-       struct list_head tasks;
-       struct list_head mg_tasks;
-
-       /*
-        * List of cgrp_cset_links pointing at cgroups referenced from this
-        * css_set.  Protected by css_set_lock.
-        */
-       struct list_head cgrp_links;
-
-       /* the default cgroup associated with this css_set */
-       struct cgroup *dfl_cgrp;
-
-       /*
-        * Set of subsystem states, one for each subsystem. This array is
-        * immutable after creation apart from the init_css_set during
-        * subsystem registration (at boot time).
-        */
-       struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
-
-       /*
-        * List of csets participating in the on-going migration either as
-        * source or destination.  Protected by cgroup_mutex.
-        */
-       struct list_head mg_preload_node;
-       struct list_head mg_node;
-
-       /*
-        * If this cset is acting as the source of migration the following
-        * two fields are set.  mg_src_cgrp is the source cgroup of the
-        * on-going migration and mg_dst_cset is the destination cset the
-        * target tasks on this cset should be migrated to.  Protected by
-        * cgroup_mutex.
-        */
-       struct cgroup *mg_src_cgrp;
-       struct css_set *mg_dst_cset;
-
-       /*
-        * On the default hierarhcy, ->subsys[ssid] may point to a css
-        * attached to an ancestor instead of the cgroup this css_set is
-        * associated with.  The following node is anchored at
-        * ->subsys[ssid]->cgroup->e_csets[ssid] and provides a way to
-        * iterate through all css's attached to a given cgroup.
-        */
-       struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
-
-       /* For RCU-protected deletion */
-       struct rcu_head rcu_head;
-};
-
-/*
- * struct cftype: handler definitions for cgroup control files
- *
- * When reading/writing to a file:
- *     - the cgroup to use is file->f_path.dentry->d_parent->d_fsdata
- *     - the 'cftype' of the file is file->f_path.dentry->d_fsdata
- */
-
-/* cftype->flags */
-enum {
-       CFTYPE_ONLY_ON_ROOT     = (1 << 0),     /* only create on root cgrp */
-       CFTYPE_NOT_ON_ROOT      = (1 << 1),     /* don't create on root cgrp */
-       CFTYPE_NO_PREFIX        = (1 << 3),     /* (DON'T USE FOR NEW FILES) no subsys prefix */
-
-       /* internal flags, do not use outside cgroup core proper */
-       __CFTYPE_ONLY_ON_DFL    = (1 << 16),    /* only on default hierarchy */
-       __CFTYPE_NOT_ON_DFL     = (1 << 17),    /* not on default hierarchy */
-};
-
-#define MAX_CFTYPE_NAME                64
-
-struct cftype {
-       /*
-        * By convention, the name should begin with the name of the
-        * subsystem, followed by a period.  Zero length string indicates
-        * end of cftype array.
-        */
-       char name[MAX_CFTYPE_NAME];
-       int private;
-       /*
-        * If not 0, file mode is set to this value, otherwise it will
-        * be figured out automatically
-        */
-       umode_t mode;
-
-       /*
-        * The maximum length of string, excluding trailing nul, that can
-        * be passed to write.  If < PAGE_SIZE-1, PAGE_SIZE-1 is assumed.
-        */
-       size_t max_write_len;
-
-       /* CFTYPE_* flags */
-       unsigned int flags;
-
-       /*
-        * Fields used for internal bookkeeping.  Initialized automatically
-        * during registration.
-        */
-       struct cgroup_subsys *ss;       /* NULL for cgroup core files */
-       struct list_head node;          /* anchored at ss->cfts */
-       struct kernfs_ops *kf_ops;
-
-       /*
-        * read_u64() is a shortcut for the common case of returning a
-        * single integer. Use it in place of read()
-        */
-       u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft);
-       /*
-        * read_s64() is a signed version of read_u64()
-        */
-       s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft);
-
-       /* generic seq_file read interface */
-       int (*seq_show)(struct seq_file *sf, void *v);
-
-       /* optional ops, implement all or none */
-       void *(*seq_start)(struct seq_file *sf, loff_t *ppos);
-       void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos);
-       void (*seq_stop)(struct seq_file *sf, void *v);
-
-       /*
-        * write_u64() is a shortcut for the common case of accepting
-        * a single integer (as parsed by simple_strtoull) from
-        * userspace. Use in place of write(); return 0 or error.
-        */
-       int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft,
-                        u64 val);
-       /*
-        * write_s64() is a signed version of write_u64()
-        */
-       int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft,
-                        s64 val);
-
-       /*
-        * write() is the generic write callback which maps directly to
-        * kernfs write operation and overrides all other operations.
-        * Maximum write size is determined by ->max_write_len.  Use
-        * of_css/cft() to access the associated css and cft.
-        */
-       ssize_t (*write)(struct kernfs_open_file *of,
-                        char *buf, size_t nbytes, loff_t off);
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-       struct lock_class_key   lockdep_key;
-#endif
-};
-
-extern struct cgroup_root cgrp_dfl_root;
-extern struct css_set init_css_set;
-
-/**
- * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
- * @cgrp: the cgroup of interest
- *
- * The default hierarchy is the v2 interface of cgroup and this function
- * can be used to test whether a cgroup is on the default hierarchy for
- * cases where a subsystem should behave differnetly depending on the
- * interface version.
- *
- * The set of behaviors which change on the default hierarchy are still
- * being determined and the mount option is prefixed with __DEVEL__.
- *
- * List of changed behaviors:
- *
- * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
- *   and "name" are disallowed.
- *
- * - When mounting an existing superblock, mount options should match.
- *
- * - Remount is disallowed.
- *
- * - rename(2) is disallowed.
- *
- * - "tasks" is removed.  Everything should be at process granularity.  Use
- *   "cgroup.procs" instead.
- *
- * - "cgroup.procs" is not sorted.  pids will be unique unless they got
- *   recycled inbetween reads.
- *
- * - "release_agent" and "notify_on_release" are removed.  Replacement
- *   notification mechanism will be implemented.
- *
- * - "cgroup.clone_children" is removed.
- *
- * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
- *   and its descendants contain no task; otherwise, 1.  The file also
- *   generates kernfs notification which can be monitored through poll and
- *   [di]notify when the value of the file changes.
- *
- * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
- *   take masks of ancestors with non-empty cpus/mems, instead of being
- *   moved to an ancestor.
- *
- * - cpuset: a task can be moved into an empty cpuset, and again it takes
- *   masks of ancestors.
- *
- * - memcg: use_hierarchy is on by default and the cgroup file for the flag
- *   is not created.
- *
- * - blkcg: blk-throttle becomes properly hierarchical.
- *
- * - debug: disallowed on the default hierarchy.
- */
-static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
-{
-       return cgrp->root == &cgrp_dfl_root;
-}
-
-/* no synchronization, the result can only be used as a hint */
-static inline bool cgroup_has_tasks(struct cgroup *cgrp)
-{
-       return !list_empty(&cgrp->cset_links);
-}
-
-/* returns ino associated with a cgroup */
-static inline ino_t cgroup_ino(struct cgroup *cgrp)
-{
-       return cgrp->kn->ino;
-}
-
-/* cft/css accessors for cftype->write() operation */
-static inline struct cftype *of_cft(struct kernfs_open_file *of)
-{
-       return of->kn->priv;
-}
-
-struct cgroup_subsys_state *of_css(struct kernfs_open_file *of);
-
-/* cft/css accessors for cftype->seq_*() operations */
-static inline struct cftype *seq_cft(struct seq_file *seq)
-{
-       return of_cft(seq->private);
-}
-
-static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
-{
-       return of_css(seq->private);
-}
-
-/*
- * Name / path handling functions.  All are thin wrappers around the kernfs
- * counterparts and can be called under any context.
- */
-
-static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
-{
-       return kernfs_name(cgrp->kn, buf, buflen);
-}
-
-static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
-                                             size_t buflen)
-{
-       return kernfs_path(cgrp->kn, buf, buflen);
-}
-
-static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
-{
-       pr_cont_kernfs_name(cgrp->kn);
-}
-
-static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
-{
-       pr_cont_kernfs_path(cgrp->kn);
-}
-
-char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
-
-int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
-int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
-int cgroup_rm_cftypes(struct cftype *cfts);
-
-bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
-
-/*
- * Control Group taskset, used to pass around set of tasks to cgroup_subsys
- * methods.
- */
-struct cgroup_taskset;
-struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset);
-struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset);
+       return true;
+}
  
  /**
- * cgroup_taskset_for_each - iterate cgroup_taskset
- * @task: the loop cursor
- * @tset: taskset to iterate
+ * css_put - put a css reference
+ * @css: target css
+ *
+ * Put a reference obtained via css_get() and css_tryget_online().
   */
-#define cgroup_taskset_for_each(task, tset)                            \
-       for ((task) = cgroup_taskset_first((tset)); (task);             \
-            (task) = cgroup_taskset_next((tset)))
+static inline void css_put(struct cgroup_subsys_state *css)
+{
+       if (!(css->flags & CSS_NO_REF))
+               percpu_ref_put(&css->refcnt);
+}
  
-/*
- * Control Group subsystem type.
- * See Documentation/cgroups/cgroups.txt for details
+/**
+ * css_put_many - put css references
+ * @css: target css
+ * @n: number of references to put
+ *
+ * Put references obtained via css_get() and css_tryget_online().
   */
-
-struct cgroup_subsys {
-       struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
-       int (*css_online)(struct cgroup_subsys_state *css);
-       void (*css_offline)(struct cgroup_subsys_state *css);
-       void (*css_released)(struct cgroup_subsys_state *css);
-       void (*css_free)(struct cgroup_subsys_state *css);
-       void (*css_reset)(struct cgroup_subsys_state *css);
-       void (*css_e_css_changed)(struct cgroup_subsys_state *css);
-
-       int (*can_attach)(struct cgroup_subsys_state *css,
-                         struct cgroup_taskset *tset);
-       void (*cancel_attach)(struct cgroup_subsys_state *css,
-                             struct cgroup_taskset *tset);
-       void (*attach)(struct cgroup_subsys_state *css,
-                      struct cgroup_taskset *tset);
-       void (*fork)(struct task_struct *task);
-       void (*exit)(struct cgroup_subsys_state *css,
-                    struct cgroup_subsys_state *old_css,
-                    struct task_struct *task);
-       void (*bind)(struct cgroup_subsys_state *root_css);
-
-       int disabled;
-       int early_init;
-
-       /*
-        * If %false, this subsystem is properly hierarchical -
-        * configuration, resource accounting and restriction on a parent
-        * cgroup cover those of its children.  If %true, hierarchy support
-        * is broken in some ways - some subsystems ignore hierarchy
-        * completely while others are only implemented half-way.
-        *
-        * It's now disallowed to create nested cgroups if the subsystem is
-        * broken and cgroup core will emit a warning message on such
-        * cases.  Eventually, all subsystems will be made properly
-        * hierarchical and this will go away.
-        */
-       bool broken_hierarchy;
-       bool warned_broken_hierarchy;
-
-       /* the following two fields are initialized automtically during boot */
-       int id;
-#define MAX_CGROUP_TYPE_NAMELEN 32
-       const char *name;
-
-       /* link to parent, protected by cgroup_lock() */
-       struct cgroup_root *root;
-
-       /* idr for css->id */
-       struct idr css_idr;
-
-       /*
-        * List of cftypes.  Each entry is the first entry of an array
-        * terminated by zero length name.
-        */
-       struct list_head cfts;
-
-       /*
-        * Base cftypes which are automatically registered.  The two can
-        * point to the same array.
-        */
-       struct cftype *dfl_cftypes;     /* for the default hierarchy */
-       struct cftype *legacy_cftypes;  /* for the legacy hierarchies */
-
-       /*
-        * A subsystem may depend on other subsystems.  When such subsystem
-        * is enabled on a cgroup, the depended-upon subsystems are enabled
-        * together if available.  Subsystems enabled due to dependency are
-        * not visible to userland until explicitly enabled.  The following
-        * specifies the mask of subsystems that this one depends on.
-        */
-       unsigned int depends_on;
-};
-
-#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
-#include <linux/cgroup_subsys.h>
-#undef SUBSYS
+static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
+{
+       if (!(css->flags & CSS_NO_REF))
+               percpu_ref_put_many(&css->refcnt, n);
+}
  
  /**
   * task_css_set_check - obtain a task's css_set with extra access conditions
@@ -818,178 +398,137 @@ static inline struct cgroup *task_cgroup(struct task_struct *task,
         return task_css(task, subsys_id)->cgroup;
  }
  
-struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
-                                          struct cgroup_subsys_state *parent);
-
-struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
-
  /**
- * css_for_each_child - iterate through children of a css
- * @pos: the css * to use as the loop cursor
- * @parent: css whose children to walk
+ * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
+ * @cgrp: the cgroup of interest
   *
- * Walk @parent's children.  Must be called under rcu_read_lock().
+ * The default hierarchy is the v2 interface of cgroup and this function
+ * can be used to test whether a cgroup is on the default hierarchy for
+ * cases where a subsystem should behave differently depending on the
+ * interface version.
   *
- * If a subsystem synchronizes ->css_online() and the start of iteration, a
- * css which finished ->css_online() is guaranteed to be visible in the
- * future iterations and will stay visible until the last reference is put.
- * A css which hasn't finished ->css_online() or already finished
- * ->css_offline() may show up during traversal.  It's each subsystem's
- * responsibility to synchronize against on/offlining.
+ * The set of behaviors which change on the default hierarchy are still
+ * being determined and the mount option is prefixed with __DEVEL__.
   *
- * It is allowed to temporarily drop RCU read lock during iteration.  The
- * caller is responsible for ensuring that @pos remains accessible until
- * the start of the next iteration by, for example, bumping the css refcnt.
- */
-#define css_for_each_child(pos, parent)                                        \
-       for ((pos) = css_next_child(NULL, (parent)); (pos);             \
-            (pos) = css_next_child((pos), (parent)))
-
-struct cgroup_subsys_state *
-css_next_descendant_pre(struct cgroup_subsys_state *pos,
-                       struct cgroup_subsys_state *css);
-
-struct cgroup_subsys_state *
-css_rightmost_descendant(struct cgroup_subsys_state *pos);
-
-/**
- * css_for_each_descendant_pre - pre-order walk of a css's descendants
- * @pos: the css * to use as the loop cursor
- * @root: css whose descendants to walk
+ * List of changed behaviors:
   *
- * Walk @root's descendants.  @root is included in the iteration and the
- * first node to be visited.  Must be called under rcu_read_lock().
+ * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
+ *   and "name" are disallowed.
   *
- * If a subsystem synchronizes ->css_online() and the start of iteration, a
- * css which finished ->css_online() is guaranteed to be visible in the
- * future iterations and will stay visible until the last reference is put.
- * A css which hasn't finished ->css_online() or already finished
- * ->css_offline() may show up during traversal.  It's each subsystem's
- * responsibility to synchronize against on/offlining.
+ * - When mounting an existing superblock, mount options should match.
   *
- * For example, the following guarantees that a descendant can't escape
- * state updates of its ancestors.
+ * - Remount is disallowed.
   *
- * my_online(@css)
- * {
- *     Lock @css's parent and @css;
- *     Inherit state from the parent;
- *     Unlock both.
- * }
+ * - rename(2) is disallowed.
   *
- * my_update_state(@css)
- * {
- *     css_for_each_descendant_pre(@pos, @css) {
- *             Lock @pos;
- *             if (@pos == @css)
- *                     Update @css's state;
- *             else
- *                     Verify @pos is alive and inherit state from its parent;
- *             Unlock @pos;
- *     }
- * }
+ * - "tasks" is removed.  Everything should be at process granularity.  Use
+ *   "cgroup.procs" instead.
   *
- * As long as the inheriting step, including checking the parent state, is
- * enclosed inside @pos locking, double-locking the parent isn't necessary
- * while inheriting.  The state update to the parent is guaranteed to be
- * visible by walking order and, as long as inheriting operations to the
- * same @pos are atomic to each other, multiple updates racing each other
- * still result in the correct state.  It's guaranateed that at least one
- * inheritance happens for any css after the latest update to its parent.
+ * - "cgroup.procs" is not sorted.  pids will be unique unless they got
+ *   recycled in between reads.
   *
- * If checking parent's state requires locking the parent, each inheriting
- * iteration should lock and unlock both @pos->parent and @pos.
+ * - "release_agent" and "notify_on_release" are removed.  Replacement
+ *   notification mechanism will be implemented.
   *
- * Alternatively, a subsystem may choose to use a single global lock to
- * synchronize ->css_online() and ->css_offline() against tree-walking
- * operations.
+ * - "cgroup.clone_children" is removed.
   *
- * It is allowed to temporarily drop RCU read lock during iteration.  The
- * caller is responsible for ensuring that @pos remains accessible until
- * the start of the next iteration by, for example, bumping the css refcnt.
- */
-#define css_for_each_descendant_pre(pos, css)                          \
-       for ((pos) = css_next_descendant_pre(NULL, (css)); (pos);       \
-            (pos) = css_next_descendant_pre((pos), (css)))
-
-struct cgroup_subsys_state *
-css_next_descendant_post(struct cgroup_subsys_state *pos,
-                        struct cgroup_subsys_state *css);
-
-/**
- * css_for_each_descendant_post - post-order walk of a css's descendants
- * @pos: the css * to use as the loop cursor
- * @css: css whose descendants to walk
+ * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
+ *   and its descendants contain no task; otherwise, 1.  The file also
+ *   generates kernfs notification which can be monitored through poll and
+ *   [di]notify when the value of the file changes.
   *
- * Similar to css_for_each_descendant_pre() but performs post-order
- * traversal instead.  @root is included in the iteration and the last
- * node to be visited.
+ * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
+ *   take masks of ancestors with non-empty cpus/mems, instead of being
+ *   moved to an ancestor.
   *
- * If a subsystem synchronizes ->css_online() and the start of iteration, a
- * css which finished ->css_online() is guaranteed to be visible in the
- * future iterations and will stay visible until the last reference is put.
- * A css which hasn't finished ->css_online() or already finished
- * ->css_offline() may show up during traversal.  It's each subsystem's
- * responsibility to synchronize against on/offlining.
+ * - cpuset: a task can be moved into an empty cpuset, and again it takes
+ *   masks of ancestors.
   *
- * Note that the walk visibility guarantee example described in pre-order
- * walk doesn't apply the same to post-order walks.
+ * - memcg: use_hierarchy is on by default and the cgroup file for the flag
+ *   is not created.
+ *
+ * - blkcg: blk-throttle becomes properly hierarchical.
+ *
+ * - debug: disallowed on the default hierarchy.
   */
-#define css_for_each_descendant_post(pos, css)                         \
-       for ((pos) = css_next_descendant_post(NULL, (css)); (pos);      \
-            (pos) = css_next_descendant_post((pos), (css)))
+static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
+{
+       return cgrp->root == &cgrp_dfl_root;
+}
  
-bool css_has_online_children(struct cgroup_subsys_state *css);
+/* no synchronization, the result can only be used as a hint */
+static inline bool cgroup_has_tasks(struct cgroup *cgrp)
+{
+       return !list_empty(&cgrp->cset_links);
+}
  
-/* A css_task_iter should be treated as an opaque object */
-struct css_task_iter {
-       struct cgroup_subsys            *ss;
+/* returns ino associated with a cgroup */
+static inline ino_t cgroup_ino(struct cgroup *cgrp)
+{
+       return cgrp->kn->ino;
+}
  
-       struct list_head                *cset_pos;
-       struct list_head                *cset_head;
+/* cft/css accessors for cftype->write() operation */
+static inline struct cftype *of_cft(struct kernfs_open_file *of)
+{
+       return of->kn->priv;
+}
  
-       struct list_head                *task_pos;
-       struct list_head                *tasks_head;
-       struct list_head                *mg_tasks_head;
-};
+struct cgroup_subsys_state *of_css(struct kernfs_open_file *of);
  
-void css_task_iter_start(struct cgroup_subsys_state *css,
-                        struct css_task_iter *it);
-struct task_struct *css_task_iter_next(struct css_task_iter *it);
-void css_task_iter_end(struct css_task_iter *it);
+/* cft/css accessors for cftype->seq_*() operations */
+static inline struct cftype *seq_cft(struct seq_file *seq)
+{
+       return of_cft(seq->private);
+}
  
-int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
-int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
+static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq)
+{
+       return of_css(seq->private);
+}
  
-struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
-                                            struct cgroup_subsys *ss);
-struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
-                                                      struct cgroup_subsys *ss);
+/*
+ * Name / path handling functions.  All are thin wrappers around the kernfs
+ * counterparts and can be called under any context.
+ */
  
-#else /* !CONFIG_CGROUPS */
+static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen)
+{
+       return kernfs_name(cgrp->kn, buf, buflen);
+}
  
-struct cgroup_subsys_state;
+static inline char * __must_check cgroup_path(struct cgroup *cgrp, char *buf,
+                                             size_t buflen)
+{
+       return kernfs_path(cgrp->kn, buf, buflen);
+}
  
-static inline int cgroup_init_early(void) { return 0; }
-static inline int cgroup_init(void) { return 0; }
-static inline void cgroup_fork(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
-static inline void cgroup_exit(struct task_struct *p) {}
+static inline void pr_cont_cgroup_name(struct cgroup *cgrp)
+{
+       pr_cont_kernfs_name(cgrp->kn);
+}
  
-static inline int cgroupstats_build(struct cgroupstats *stats,
-                                       struct dentry *dentry)
+static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
  {
-       return -EINVAL;
+       pr_cont_kernfs_path(cgrp->kn);
  }
  
-static inline void css_put(struct cgroup_subsys_state *css) {}
+#else /* !CONFIG_CGROUPS */
+
+struct cgroup_subsys_state;
  
-/* No cgroups - nothing to do */
+static inline void css_put(struct cgroup_subsys_state *css) {}
  static inline int cgroup_attach_task_all(struct task_struct *from,
-                                        struct task_struct *t)
-{
-       return 0;
-}
+                                        struct task_struct *t) { return 0; }
+static inline int cgroupstats_build(struct cgroupstats *stats,
+                                   struct dentry *dentry) { return -EINVAL; }
+
+static inline void cgroup_fork(struct task_struct *p) {}
+static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline void cgroup_exit(struct task_struct *p) {}
+
+static inline int cgroup_init_early(void) { return 0; }
+static inline int cgroup_init(void) { return 0; }
  
  #endif /* !CONFIG_CGROUPS */
  
diff --git a/include/linux/init_task.h b/include/linux/init_task.h

index bb9b075..e8493fe 100644 (file)
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -25,13 +25,6 @@
  extern struct files_struct init_files;
  extern struct fs_struct init_fs;
  
-#ifdef CONFIG_CGROUPS
-#define INIT_GROUP_RWSEM(sig)                                          \
-       .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
-#else
-#define INIT_GROUP_RWSEM(sig)
-#endif
-
  #ifdef CONFIG_CPUSETS
  #define INIT_CPUSET_SEQ(tsk)                                                   \
         .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
@@ -55,7 +48,6 @@ extern struct fs_struct init_fs;
         },                                                              \
         .cred_guard_mutex =                                             \
                  __MUTEX_INITIALIZER(sig.cred_guard_mutex),             \
-       INIT_GROUP_RWSEM(sig)                                           \
  }
  
  extern struct nsproxy init_nsproxy;
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h

index 71ecdab..e6b2f7d 100644 (file)
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -277,6 +277,7 @@ void kernfs_put(struct kernfs_node *kn);
  
  struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry);
  struct kernfs_root *kernfs_root_from_sb(struct super_block *sb);
+struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
  
  struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
                                        unsigned int flags, void *priv);
@@ -352,6 +353,10 @@ static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry)
  static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
  { return NULL; }
  
+static inline struct inode *
+kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
+{ return NULL; }
+
  static inline struct kernfs_root *
  kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags,
                    void *priv)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 93ed0b6..a09ece3 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -58,6 +58,7 @@ struct sched_param {
  #include <linux/uidgid.h>
  #include <linux/gfp.h>
  #include <linux/magic.h>
+#include <linux/cgroup-defs.h>
  
  #include <asm/processor.h>
  
@@ -755,18 +756,6 @@ struct signal_struct {
         unsigned audit_tty_log_passwd;
         struct tty_audit_buf *tty_audit_buf;
  #endif
-#ifdef CONFIG_CGROUPS
-       /*
-        * group_rwsem prevents new tasks from entering the threadgroup and
-        * member tasks from exiting,a more specifically, setting of
-        * PF_EXITING.  fork and exit paths are protected with this rwsem
-        * using threadgroup_change_begin/end().  Users which require
-        * threadgroup to remain stable should use threadgroup_[un]lock()
-        * which also takes care of exec path.  Currently, cgroup is the
-        * only user.
-        */
-       struct rw_semaphore group_rwsem;
-#endif
  
         oom_flags_t oom_flags;
         short oom_score_adj;            /* OOM kill score adjustment */
@@ -2725,53 +2714,33 @@ static inline void unlock_task_sighand(struct task_struct *tsk,
         spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
  }
  
-#ifdef CONFIG_CGROUPS
-static inline void threadgroup_change_begin(struct task_struct *tsk)
-{
-       down_read(&tsk->signal->group_rwsem);
-}
-static inline void threadgroup_change_end(struct task_struct *tsk)
-{
-       up_read(&tsk->signal->group_rwsem);
-}
-
  /**
- * threadgroup_lock - lock threadgroup
- * @tsk: member task of the threadgroup to lock
- *
- * Lock the threadgroup @tsk belongs to.  No new task is allowed to enter
- * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
- * change ->group_leader/pid.  This is useful for cases where the threadgroup
- * needs to stay stable across blockable operations.
- *
- * fork and exit paths explicitly call threadgroup_change_{begin|end}() for
- * synchronization.  While held, no new task will be added to threadgroup
- * and no existing live task will have its PF_EXITING set.
+ * threadgroup_change_begin - mark the beginning of changes to a threadgroup
+ * @tsk: task causing the changes
   *
- * de_thread() does threadgroup_change_{begin|end}() when a non-leader
- * sub-thread becomes a new leader.
+ * All operations which modify a threadgroup - a new thread joining the
+ * group, death of a member thread (the assertion of PF_EXITING) and
+ * exec(2) dethreading the process and replacing the leader - are wrapped
+ * by threadgroup_change_{begin|end}().  This is to provide a place which
+ * subsystems needing threadgroup stability can hook into for
+ * synchronization.
   */
-static inline void threadgroup_lock(struct task_struct *tsk)
+static inline void threadgroup_change_begin(struct task_struct *tsk)
  {
-       down_write(&tsk->signal->group_rwsem);
+       might_sleep();
+       cgroup_threadgroup_change_begin(tsk);
  }
  
  /**
- * threadgroup_unlock - unlock threadgroup
- * @tsk: member task of the threadgroup to unlock
+ * threadgroup_change_end - mark the end of changes to a threadgroup
+ * @tsk: task causing the changes
   *
- * Reverse threadgroup_lock().
+ * See threadgroup_change_begin().
   */
-static inline void threadgroup_unlock(struct task_struct *tsk)
+static inline void threadgroup_change_end(struct task_struct *tsk)
  {
-       up_write(&tsk->signal->group_rwsem);
+       cgroup_threadgroup_change_end(tsk);
  }
-#else
-static inline void threadgroup_change_begin(struct task_struct *tsk) {}
-static inline void threadgroup_change_end(struct task_struct *tsk) {}
-static inline void threadgroup_lock(struct task_struct *tsk) {}
-static inline void threadgroup_unlock(struct task_struct *tsk) {}
-#endif
  
  #ifndef __HAVE_THREAD_FUNCTIONS
  
diff --git a/init/Kconfig b/init/Kconfig

index f0c2e68..7d1ffd2 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -924,6 +924,7 @@ config NUMA_BALANCING_DEFAULT_ENABLED
  menuconfig CGROUPS
         bool "Control Group support"
         select KERNFS
+       select PERCPU_RWSEM
         help
           This option adds support for grouping sets of processes together, for
           use with process control subsystems such as Cpusets, CFS, memory
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index 469dd54..9ef9fc8 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
  #include <linux/slab.h>
  #include <linux/spinlock.h>
  #include <linux/rwsem.h>
+#include <linux/percpu-rwsem.h>
  #include <linux/string.h>
  #include <linux/sort.h>
  #include <linux/kmod.h>
@@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
   */
  static DEFINE_SPINLOCK(release_agent_path_lock);
  
+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
  #define cgroup_assert_mutex_or_rcu_locked()                            \
         rcu_lockdep_assert(rcu_read_lock_held() ||                      \
                            lockdep_is_held(&cgroup_mutex),              \
@@ -156,7 +159,7 @@ static bool cgrp_dfl_root_visible;
  static bool cgroup_legacy_files_on_dfl;
  
  /* some controllers are not supported in the default hierarchy */
-static unsigned int cgrp_dfl_root_inhibit_ss_mask;
+static unsigned long cgrp_dfl_root_inhibit_ss_mask;
  
  /* The list of hierarchy roots */
  
@@ -175,18 +178,19 @@ static DEFINE_IDR(cgroup_hierarchy_idr);
   */
  static u64 css_serial_nr_next = 1;
  
-/* This flag indicates whether tasks in the fork and exit paths should
- * check for fork/exit handlers to call. This avoids us having to do
- * extra work in the fork/exit path if none of the subsystems need to
- * be called.
+/*
+ * These bitmasks record, one bit per subsystem ID, which subsystems have
+ * fork/exit handlers to call. This avoids us having to do extra work in the
+ * fork/exit path to check which subsystems have fork/exit callbacks.
   */
-static int need_forkexit_callback __read_mostly;
+static unsigned long have_fork_callback __read_mostly;
+static unsigned long have_exit_callback __read_mostly;
  
  static struct cftype cgroup_dfl_base_files[];
  static struct cftype cgroup_legacy_base_files[];
  
  static int rebind_subsystems(struct cgroup_root *dst_root,
-                            unsigned int ss_mask);
+                            unsigned long ss_mask);
  static int cgroup_destroy_locked(struct cgroup *cgrp);
  static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
                       bool visible);
@@ -261,7 +265,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
   * @cgrp: the cgroup of interest
   * @ss: the subsystem of interest (%NULL returns @cgrp->self)
   *
- * Similar to cgroup_css() but returns the effctive css, which is defined
+ * Similar to cgroup_css() but returns the effective css, which is defined
   * as the matching css of the nearest ancestor including self which has @ss
   * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
   * function is guaranteed to return non-NULL css.
@@ -409,6 +413,24 @@ static int notify_on_release(const struct cgroup *cgrp)
         for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT &&                \
              (((ss) = cgroup_subsys[ssid]) || true); (ssid)++)
  
+/**
+ * for_each_subsys_which - filter for_each_subsys with a bitmask
+ * @ss: the iteration cursor
+ * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
+ * @ss_maskp: a pointer to the bitmask
+ *
+ * The block will only run for cases where the ssid-th bit (1 << ssid) of
+ * the bitmask pointed to by @ss_maskp is set to 1.
+ */
+#define for_each_subsys_which(ss, ssid, ss_maskp)                      \
+       if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */   \
+               (ssid) = 0;                                             \
+       else                                                            \
+               for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT)   \
+                       if (((ss) = cgroup_subsys[ssid]) && false)      \
+                               break;                                  \
+                       else
+
  /* iterate across the hierarchies */
  #define for_each_root(root)                                            \
         list_for_each_entry((root), &cgroup_roots, root_list)
@@ -882,7 +904,7 @@ static void cgroup_exit_root_id(struct cgroup_root *root)
  static void cgroup_free_root(struct cgroup_root *root)
  {
         if (root) {
-               /* hierarhcy ID shoulid already have been released */
+               /* hierarchy ID should already have been released */
                 WARN_ON_ONCE(root->hierarchy_id);
  
                 idr_destroy(&root->cgroup_idr);
@@ -998,7 +1020,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
   * update of a tasks cgroup pointer by cgroup_attach_task()
   */
  
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask);
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
  static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
  static const struct file_operations proc_cgroupstats_operations;
  
@@ -1068,11 +1090,11 @@ static void cgroup_put(struct cgroup *cgrp)
   * @subtree_control is to be applied to @cgrp.  The returned mask is always
   * a superset of @subtree_control and follows the usual hierarchy rules.
   */
-static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
-                                                 unsigned int subtree_control)
+static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
+                                                 unsigned long subtree_control)
  {
         struct cgroup *parent = cgroup_parent(cgrp);
-       unsigned int cur_ss_mask = subtree_control;
+       unsigned long cur_ss_mask = subtree_control;
         struct cgroup_subsys *ss;
         int ssid;
  
@@ -1082,11 +1104,10 @@ static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
                 return cur_ss_mask;
  
         while (true) {
-               unsigned int new_ss_mask = cur_ss_mask;
+               unsigned long new_ss_mask = cur_ss_mask;
  
-               for_each_subsys(ss, ssid)
-                       if (cur_ss_mask & (1 << ssid))
-                               new_ss_mask |= ss->depends_on;
+               for_each_subsys_which(ss, ssid, &cur_ss_mask)
+                       new_ss_mask |= ss->depends_on;
  
                 /*
                  * Mask out subsystems which aren't available.  This can
@@ -1200,7 +1221,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
   * @cgrp: target cgroup
   * @subsys_mask: mask of the subsystem ids whose files should be removed
   */
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
+static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
  {
         struct cgroup_subsys *ss;
         int i;
@@ -1215,18 +1236,16 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned int subsys_mask)
         }
  }
  
-static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
+static int rebind_subsystems(struct cgroup_root *dst_root,
+                            unsigned long ss_mask)
  {
         struct cgroup_subsys *ss;
-       unsigned int tmp_ss_mask;
+       unsigned long tmp_ss_mask;
         int ssid, i, ret;
  
         lockdep_assert_held(&cgroup_mutex);
  
-       for_each_subsys(ss, ssid) {
-               if (!(ss_mask & (1 << ssid)))
-                       continue;
-
+       for_each_subsys_which(ss, ssid, &ss_mask) {
                 /* if @ss has non-root csses attached to it, can't move */
                 if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)))
                         return -EBUSY;
@@ -1253,7 +1272,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
                  * Just warn about it and continue.
                  */
                 if (cgrp_dfl_root_visible) {
-                       pr_warn("failed to create files (%d) while rebinding 0x%x to default root\n",
+                       pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
                                 ret, ss_mask);
                         pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
                 }
@@ -1263,18 +1282,14 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
          * Nothing can fail from this point on.  Remove files for the
          * removed subsystems and rebind each subsystem.
          */
-       for_each_subsys(ss, ssid)
-               if (ss_mask & (1 << ssid))
-                       cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
+       for_each_subsys_which(ss, ssid, &ss_mask)
+               cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
  
-       for_each_subsys(ss, ssid) {
+       for_each_subsys_which(ss, ssid, &ss_mask) {
                 struct cgroup_root *src_root;
                 struct cgroup_subsys_state *css;
                 struct css_set *cset;
  
-               if (!(ss_mask & (1 << ssid)))
-                       continue;
-
                 src_root = ss->root;
                 css = cgroup_css(&src_root->cgrp, ss);
  
@@ -1338,7 +1353,7 @@ static int cgroup_show_options(struct seq_file *seq,
  }
  
  struct cgroup_sb_opts {
-       unsigned int subsys_mask;
+       unsigned long subsys_mask;
         unsigned int flags;
         char *release_agent;
         bool cpuset_clone_children;
@@ -1351,7 +1366,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
  {
         char *token, *o = data;
         bool all_ss = false, one_ss = false;
-       unsigned int mask = -1U;
+       unsigned long mask = -1UL;
         struct cgroup_subsys *ss;
         int nr_opts = 0;
         int i;
@@ -1495,7 +1510,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
         int ret = 0;
         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
         struct cgroup_sb_opts opts;
-       unsigned int added_mask, removed_mask;
+       unsigned long added_mask, removed_mask;
  
         if (root == &cgrp_dfl_root) {
                 pr_err("remount is not allowed\n");
@@ -1641,7 +1656,7 @@ static void init_cgroup_root(struct cgroup_root *root,
                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
  }
  
-static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
+static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
  {
         LIST_HEAD(tmp_links);
         struct cgroup *root_cgrp = &root->cgrp;
@@ -2052,9 +2067,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
         lockdep_assert_held(&css_set_rwsem);
  
         /*
-        * We are synchronized through threadgroup_lock() against PF_EXITING
-        * setting such that we can't race against cgroup_exit() changing the
-        * css_set to init_css_set and dropping the old one.
+        * We are synchronized through cgroup_threadgroup_rwsem against
+        * PF_EXITING setting such that we can't race against cgroup_exit()
+        * changing the css_set to init_css_set and dropping the old one.
          */
         WARN_ON_ONCE(tsk->flags & PF_EXITING);
         old_cset = task_css_set(tsk);
@@ -2111,10 +2126,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
   * @src_cset and add it to @preloaded_csets, which should later be cleaned
   * up by cgroup_migrate_finish().
   *
- * This function may be called without holding threadgroup_lock even if the
- * target is a process.  Threads may be created and destroyed but as long
- * as cgroup_mutex is not dropped, no new css_set can be put into play and
- * the preloaded css_sets are guaranteed to cover all migrations.
+ * This function may be called without holding cgroup_threadgroup_rwsem
+ * even if the target is a process.  Threads may be created and destroyed
+ * but as long as cgroup_mutex is not dropped, no new css_set can be put
+ * into play and the preloaded css_sets are guaranteed to cover all
+ * migrations.
   */
  static void cgroup_migrate_add_src(struct css_set *src_cset,
                                    struct cgroup *dst_cgrp,
@@ -2217,7 +2233,7 @@ err:
   * @threadgroup: whether @leader points to the whole process or a single task
   *
   * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
- * process, the caller must be holding threadgroup_lock of @leader.  The
+ * process, the caller must be holding cgroup_threadgroup_rwsem.  The
   * caller is also responsible for invoking cgroup_migrate_add_src() and
   * cgroup_migrate_prepare_dst() on the targets before invoking this
   * function and following up with cgroup_migrate_finish().
@@ -2345,7 +2361,7 @@ out_release_tset:
   * @leader: the task or the leader of the threadgroup to be attached
   * @threadgroup: attach the whole threadgroup?
   *
- * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
   */
  static int cgroup_attach_task(struct cgroup *dst_cgrp,
                               struct task_struct *leader, bool threadgroup)
@@ -2376,6 +2392,47 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
         return ret;
  }
  
+static int cgroup_procs_write_permission(struct task_struct *task,
+                                        struct cgroup *dst_cgrp,
+                                        struct kernfs_open_file *of)
+{
+       const struct cred *cred = current_cred();
+       const struct cred *tcred = get_task_cred(task);
+       int ret = 0;
+
+       /*
+        * even if we're attaching all tasks in the thread group, we only
+        * need to check permissions on one of them.
+        */
+       if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
+           !uid_eq(cred->euid, tcred->uid) &&
+           !uid_eq(cred->euid, tcred->suid))
+               ret = -EACCES;
+
+       if (!ret && cgroup_on_dfl(dst_cgrp)) {
+               struct super_block *sb = of->file->f_path.dentry->d_sb;
+               struct cgroup *cgrp;
+               struct inode *inode;
+
+               down_read(&css_set_rwsem);
+               cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+               up_read(&css_set_rwsem);
+
+               while (!cgroup_is_descendant(dst_cgrp, cgrp))
+                       cgrp = cgroup_parent(cgrp);
+
+               ret = -ENOMEM;
+               inode = kernfs_get_inode(sb, cgrp->procs_kn);
+               if (inode) {
+                       ret = inode_permission(inode, MAY_WRITE);
+                       iput(inode);
+               }
+       }
+
+       put_cred(tcred);
+       return ret;
+}
+
  /*
   * Find the task_struct of the task to attach by vpid and pass it along to the
   * function to attach either it or all tasks in its threadgroup. Will lock
@@ -2385,7 +2442,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
                                     size_t nbytes, loff_t off, bool threadgroup)
  {
         struct task_struct *tsk;
-       const struct cred *cred = current_cred(), *tcred;
         struct cgroup *cgrp;
         pid_t pid;
         int ret;
@@ -2397,29 +2453,17 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
         if (!cgrp)
                 return -ENODEV;
  
-retry_find_task:
+       percpu_down_write(&cgroup_threadgroup_rwsem);
         rcu_read_lock();
         if (pid) {
                 tsk = find_task_by_vpid(pid);
                 if (!tsk) {
-                       rcu_read_unlock();
                         ret = -ESRCH;
-                       goto out_unlock_cgroup;
+                       goto out_unlock_rcu;
                 }
-               /*
-                * even if we're attaching all tasks in the thread group, we
-                * only need to check permissions on one of them.
-                */
-               tcred = __task_cred(tsk);
-               if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
-                   !uid_eq(cred->euid, tcred->uid) &&
-                   !uid_eq(cred->euid, tcred->suid)) {
-                       rcu_read_unlock();
-                       ret = -EACCES;
-                       goto out_unlock_cgroup;
-               }
-       } else
+       } else {
                 tsk = current;
+       }
  
         if (threadgroup)
                 tsk = tsk->group_leader;
@@ -2431,35 +2475,23 @@ retry_find_task:
          */
         if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
                 ret = -EINVAL;
-               rcu_read_unlock();
-               goto out_unlock_cgroup;
+               goto out_unlock_rcu;
         }
  
         get_task_struct(tsk);
         rcu_read_unlock();
  
-       threadgroup_lock(tsk);
-       if (threadgroup) {
-               if (!thread_group_leader(tsk)) {
-                       /*
-                        * a race with de_thread from another thread's exec()
-                        * may strip us of our leadership, if this happens,
-                        * there is no choice but to throw this task away and
-                        * try again; this is
-                        * "double-double-toil-and-trouble-check locking".
-                        */
-                       threadgroup_unlock(tsk);
-                       put_task_struct(tsk);
-                       goto retry_find_task;
-               }
-       }
-
-       ret = cgroup_attach_task(cgrp, tsk, threadgroup);
-
-       threadgroup_unlock(tsk);
+       ret = cgroup_procs_write_permission(tsk, cgrp, of);
+       if (!ret)
+               ret = cgroup_attach_task(cgrp, tsk, threadgroup);
  
         put_task_struct(tsk);
-out_unlock_cgroup:
+       goto out_unlock_threadgroup;
+
+out_unlock_rcu:
+       rcu_read_unlock();
+out_unlock_threadgroup:
+       percpu_up_write(&cgroup_threadgroup_rwsem);
         cgroup_kn_unlock(of->kn);
         return ret ?: nbytes;
  }
@@ -2542,19 +2574,17 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
         return 0;
  }
  
-static void cgroup_print_ss_mask(struct seq_file *seq, unsigned int ss_mask)
+static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask)
  {
         struct cgroup_subsys *ss;
         bool printed = false;
         int ssid;
  
-       for_each_subsys(ss, ssid) {
-               if (ss_mask & (1 << ssid)) {
-                       if (printed)
-                               seq_putc(seq, ' ');
-                       seq_printf(seq, "%s", ss->name);
-                       printed = true;
-               }
+       for_each_subsys_which(ss, ssid, &ss_mask) {
+               if (printed)
+                       seq_putc(seq, ' ');
+               seq_printf(seq, "%s", ss->name);
+               printed = true;
         }
         if (printed)
                 seq_putc(seq, '\n');
@@ -2606,6 +2636,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
  
         lockdep_assert_held(&cgroup_mutex);
  
+       percpu_down_write(&cgroup_threadgroup_rwsem);
+
         /* look up all csses currently attached to @cgrp's subtree */
         down_read(&css_set_rwsem);
         css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
@@ -2661,17 +2693,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
                                 goto out_finish;
                         last_task = task;
  
-                       threadgroup_lock(task);
-                       /* raced against de_thread() from another thread? */
-                       if (!thread_group_leader(task)) {
-                               threadgroup_unlock(task);
-                               put_task_struct(task);
-                               continue;
-                       }
-
                         ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
  
-                       threadgroup_unlock(task);
                         put_task_struct(task);
  
                         if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -2681,6 +2704,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
  
  out_finish:
         cgroup_migrate_finish(&preloaded_csets);
+       percpu_up_write(&cgroup_threadgroup_rwsem);
         return ret;
  }
  
@@ -2689,8 +2713,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
                                             char *buf, size_t nbytes,
                                             loff_t off)
  {
-       unsigned int enable = 0, disable = 0;
-       unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
+       unsigned long enable = 0, disable = 0;
+       unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
         struct cgroup *cgrp, *child;
         struct cgroup_subsys *ss;
         char *tok;
@@ -2702,11 +2726,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
          */
         buf = strstrip(buf);
         while ((tok = strsep(&buf, " "))) {
+               unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask;
+
                 if (tok[0] == '\0')
                         continue;
-               for_each_subsys(ss, ssid) {
-                       if (ss->disabled || strcmp(tok + 1, ss->name) ||
-                           ((1 << ss->id) & cgrp_dfl_root_inhibit_ss_mask))
+               for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+                       if (ss->disabled || strcmp(tok + 1, ss->name))
                                 continue;
  
                         if (*tok == '+') {
@@ -2793,10 +2818,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
          * still around.  In such cases, wait till it's gone using
          * offline_waitq.
          */
-       for_each_subsys(ss, ssid) {
-               if (!(css_enable & (1 << ssid)))
-                       continue;
-
+       for_each_subsys_which(ss, ssid, &css_enable) {
                 cgroup_for_each_live_child(child, cgrp) {
                         DEFINE_WAIT(wait);
  
@@ -3087,7 +3109,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
                 return ret;
         }
  
-       if (cft->seq_show == cgroup_populated_show)
+       if (cft->write == cgroup_procs_write)
+               cgrp->procs_kn = kn;
+       else if (cft->seq_show == cgroup_populated_show)
                 cgrp->populated_kn = kn;
         return 0;
  }
@@ -4322,7 +4346,7 @@ static struct cftype cgroup_legacy_base_files[] = {
   *
   * On failure, no file is added.
   */
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned int subsys_mask)
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
  {
         struct cgroup_subsys *ss;
         int i, ret = 0;
@@ -4931,7 +4955,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
          * init_css_set is in the subsystem's root cgroup. */
         init_css_set.subsys[ss->id] = css;
  
-       need_forkexit_callback |= ss->fork || ss->exit;
+       have_fork_callback |= (bool)ss->fork << ss->id;
+       have_exit_callback |= (bool)ss->exit << ss->id;
  
         /* At system boot, before all subsystems have been
          * registered, no tasks have been forked, so we don't
@@ -4989,6 +5014,7 @@ int __init cgroup_init(void)
         unsigned long key;
         int ssid, err;
  
+       BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
         BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
         BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
  
@@ -5241,11 +5267,8 @@ void cgroup_post_fork(struct task_struct *child)
          * css_set; otherwise, @child might change state between ->fork()
          * and addition to css_set.
          */
-       if (need_forkexit_callback) {
-               for_each_subsys(ss, i)
-                       if (ss->fork)
-                               ss->fork(child);
-       }
+       for_each_subsys_which(ss, i, &have_fork_callback)
+               ss->fork(child);
  }
  
  /**
@@ -5289,16 +5312,12 @@ void cgroup_exit(struct task_struct *tsk)
         cset = task_css_set(tsk);
         RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
  
-       if (need_forkexit_callback) {
-               /* see cgroup_post_fork() for details */
-               for_each_subsys(ss, i) {
-                       if (ss->exit) {
-                               struct cgroup_subsys_state *old_css = cset->subsys[i];
-                               struct cgroup_subsys_state *css = task_css(tsk, i);
+       /* see cgroup_post_fork() for details */
+       for_each_subsys_which(ss, i, &have_exit_callback) {
+               struct cgroup_subsys_state *old_css = cset->subsys[i];
+               struct cgroup_subsys_state *css = task_css(tsk, i);
  
-                               ss->exit(css, old_css, tsk);
-                       }
-               }
+               ss->exit(css, old_css, tsk);
         }
  
         if (put_cset)
diff --git a/kernel/fork.c b/kernel/fork.c

index 4c95cb3..1bfefc6 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1141,10 +1141,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
         tty_audit_fork(sig);
         sched_autogroup_fork(sig);
  
-#ifdef CONFIG_CGROUPS
-       init_rwsem(&sig->group_rwsem);
-#endif
-
         sig->oom_score_adj = current->signal->oom_score_adj;
         sig->oom_score_adj_min = current->signal->oom_score_adj_min;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 27 Jun 2015 02:50:04 +0000 (19:50 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Sat, 27 Jun 2015 02:50:04 +0000 (19:50 -0700)
Documentation/cgroups/unified-hierarchy.txt		patch \| blob \| history
MAINTAINERS		patch \| blob \| history
fs/kernfs/kernfs-internal.h		patch \| blob \| history
include/linux/cgroup-defs.h	[new file with mode: 0644]	patch \| blob
include/linux/cgroup.h		patch \| blob \| history
include/linux/init_task.h		patch \| blob \| history
include/linux/kernfs.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
init/Kconfig		patch \| blob \| history
kernel/cgroup.c		patch \| blob \| history
kernel/fork.c		patch \| blob \| history