kernel/cgroup/cgroup.c

   1 /*
   2  *  Generic process-grouping system.
   3  *
   4  *  Based originally on the cpuset system, extracted by Paul Menage
   5  *  Copyright (C) 2006 Google, Inc
   6  *
   7  *  Notifications support
   8  *  Copyright (C) 2009 Nokia Corporation
   9  *  Author: Kirill A. Shutemov
  10  *
  11  *  Copyright notices from the original cpuset code:
  12  *  --------------------------------------------------
  13  *  Copyright (C) 2003 BULL SA.
  14  *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  15  *
  16  *  Portions derived from Patrick Mochel's sysfs code.
  17  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
  18  *
  19  *  2003-10-10 Written by Simon Derr.
  20  *  2003-10-22 Updates by Stephen Hemminger.
  21  *  2004 May-July Rework by Paul Jackson.
  22  *  ---------------------------------------------------
  23  *
  24  *  This file is subject to the terms and conditions of the GNU General Public
  25  *  License.  See the file COPYING in the main directory of the Linux
  26  *  distribution for more details.
  27  */
  28
  29 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  30
  31 #include "cgroup-internal.h"
  32
  33 #include <linux/bpf-cgroup.h>
  34 #include <linux/cred.h>
  35 #include <linux/errno.h>
  36 #include <linux/init_task.h>
  37 #include <linux/kernel.h>
  38 #include <linux/magic.h>
  39 #include <linux/mutex.h>
  40 #include <linux/mount.h>
  41 #include <linux/pagemap.h>
  42 #include <linux/proc_fs.h>
  43 #include <linux/rcupdate.h>
  44 #include <linux/sched.h>
  45 #include <linux/sched/task.h>
  46 #include <linux/slab.h>
  47 #include <linux/spinlock.h>
  48 #include <linux/percpu-rwsem.h>
  49 #include <linux/string.h>
  50 #include <linux/hashtable.h>
  51 #include <linux/idr.h>
  52 #include <linux/kthread.h>
  53 #include <linux/atomic.h>
  54 #include <linux/cpuset.h>
  55 #include <linux/proc_ns.h>
  56 #include <linux/nsproxy.h>
  57 #include <linux/file.h>
  58 #include <linux/fs_parser.h>
  59 #include <linux/sched/cputime.h>
  60 #include <linux/sched/deadline.h>
  61 #include <linux/psi.h>
  62 #include <net/sock.h>
  63
  64 #define CREATE_TRACE_POINTS
  65 #include <trace/events/cgroup.h>
  66
  67 #define CGROUP_FILE_NAME_MAX            (MAX_CGROUP_TYPE_NAMELEN +      \
  68                                          MAX_CFTYPE_NAME + 2)
     /*
      * NOTE(review): CGROUP_FILE_NAME_MAX above is the sum of the two name
      * length limits plus 2; the extra 2 presumably leaves room for a
      * separator and the terminating NUL - confirm against the code that
      * builds interface file names.
      */
  69 /* let's not notify more than 100 times per second (HZ / 100, rounded up) */
  70 #define CGROUP_FILE_NOTIFY_MIN_INTV     DIV_ROUND_UP(HZ, 100)
  71
  72 /*
  73  * To avoid confusing the compiler (and generating warnings) with code
  74  * that attempts to access what would be a 0-element array (i.e. sized
  75  * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
  76  * constant expression can be added.
      *
      * It evaluates to true iff at least one subsystem is configured.
  77  */
  78 #define CGROUP_HAS_SUBSYS_CONFIG        (CGROUP_SUBSYS_COUNT > 0)
  79
  80 /*
  81  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  82  * hierarchy must be performed while holding it.
  83  *
  84  * css_set_lock protects task->cgroups pointer, the list of css_set
  85  * objects, and the chain of tasks off each css_set.
  86  *
  87  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
  88  * cgroup.h can use them for lockdep annotations.
  89  */
  90 DEFINE_MUTEX(cgroup_mutex);
  91 DEFINE_SPINLOCK(css_set_lock);
  92
  93 #ifdef CONFIG_PROVE_RCU
  94 EXPORT_SYMBOL_GPL(cgroup_mutex);
  95 EXPORT_SYMBOL_GPL(css_set_lock);
  96 #endif
  97
     /*
      * Global path buffer of TRACE_CGROUP_PATH_LEN bytes and a spinlock
      * named after it.  NOTE(review): presumably the lock serializes users
      * of the buffer when cgroup tracepoints format a path - confirm at the
      * users, which are not in this part of the file.
      */
  98 DEFINE_SPINLOCK(trace_cgroup_path_lock);
  99 char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
     /* NOTE(review): where this flag is set and read is not visible here */
 100 static bool cgroup_debug __read_mostly;
 101
 102 /*
 103  * Protects cgroup_idr and css_idr so that IDs can be released without
 104  * grabbing cgroup_mutex.
      *
      * Taken with the _bh spinlock variants by the cgroup_idr_*() wrappers
      * in this file.
 105  */
 106 static DEFINE_SPINLOCK(cgroup_idr_lock);
 107
 108 /*
 109  * Protects cgroup_file->kn for !self csses.  It synchronizes notifications
 110  * against file removal/re-creation across css hiding.
 111  */
 112 static DEFINE_SPINLOCK(cgroup_file_kn_lock);
 113
     /*
      * NOTE(review): per-cpu rwsem; judging by the name it guards threadgroup
      * membership changes, but its users are not in this part of the file.
      */
 114 DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem);
 115
     /*
      * Lockdep assertion: warns unless the caller holds either the RCU read
      * lock or cgroup_mutex.  Note that the expansion already ends with a
      * semicolon.
      */
 116 #define cgroup_assert_mutex_or_rcu_locked()                             \
 117         RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
 118                            !lockdep_is_held(&cgroup_mutex),             \
 119                            "cgroup_mutex or RCU read lock required");
 120
 121 /*
 122  * cgroup destruction makes heavy use of work items and there can be a lot
 123  * of concurrent destructions.  Use a separate workqueue so that cgroup
 124  * destruction work items don't end up filling up max_active of system_wq
 125  * which may lead to deadlock.
 126  */
 127 static struct workqueue_struct *cgroup_destroy_wq;
 128
     /*
      * Every table below is produced the same way: SUBSYS() is defined to
      * emit one entry (or one set of definitions) per subsystem,
      * <linux/cgroup_subsys.h> is included to expand it, and SUBSYS is
      * undefined again.  The array entries are indexed by <name>_cgrp_id.
      */
 129 /* generate an array of cgroup subsystem pointers */
 130 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys,
 131 struct cgroup_subsys *cgroup_subsys[] = {
 132 #include <linux/cgroup_subsys.h>
 133 };
 134 #undef SUBSYS
 135
 136 /* array of cgroup subsystem names */
 137 #define SUBSYS(_x) [_x ## _cgrp_id] = #_x,
 138 static const char *cgroup_subsys_name[] = {
 139 #include <linux/cgroup_subsys.h>
 140 };
 141 #undef SUBSYS
 142
 143 /* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
     /* first define and export the two per-subsystem keys, both initially true */
 144 #define SUBSYS(_x)                                                              \
 145         DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);                 \
 146         DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);                  \
 147         EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);                      \
 148         EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
 149 #include <linux/cgroup_subsys.h>
 150 #undef SUBSYS
 151
     /* pointers to each subsystem's "enabled" key, indexed by subsys ID */
 152 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
 153 static struct static_key_true *cgroup_subsys_enabled_key[] = {
 154 #include <linux/cgroup_subsys.h>
 155 };
 156 #undef SUBSYS
 157
     /* pointers to each subsystem's "on default hierarchy" key, by subsys ID */
 158 #define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
 159 static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 160 #include <linux/cgroup_subsys.h>
 161 };
 162 #undef SUBSYS
 163
 164 static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
     /* ^ per-cpu rstat state, wired into cgrp_dfl_root.cgrp.rstat_cpu below */
 165
 166 /* the default hierarchy */
 167 struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
 168 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 169
 170 /*
 171  * The default hierarchy always exists but is hidden until mounted for the
 172  * first time.  This is for backward compatibility.
 173  */
 174 static bool cgrp_dfl_visible;
 175
     /*
      * The three masks below are u16 bitmasks.  The code in this file tests
      * subsystem membership with (1 << ss->id), so bit N stands for the
      * subsystem whose ID is N.
      */
 176 /* some controllers are not supported in the default hierarchy */
 177 static u16 cgrp_dfl_inhibit_ss_mask;
 178
 179 /* some controllers are implicitly enabled on the default hierarchy */
 180 static u16 cgrp_dfl_implicit_ss_mask;
 181
 182 /* some controllers can be threaded on the default hierarchy */
 183 static u16 cgrp_dfl_threaded_ss_mask;
 184
 185 /* The list of hierarchy roots */
 186 LIST_HEAD(cgroup_roots);
     /* NOTE(review): presumably the number of roots on cgroup_roots - confirm */
 187 static int cgroup_root_count;
 188
 189 /* hierarchy ID allocation and mapping, protected by cgroup_mutex */
 190 static DEFINE_IDR(cgroup_hierarchy_idr);
 191
 192 /*
 193  * Assign a monotonically increasing serial number to csses.  It guarantees
 194  * cgroups with bigger numbers are newer than those with smaller numbers.
 195  * Also, as csses are always appended to the parent's ->children list, it
 196  * guarantees that sibling csses are always sorted in the ascending serial
 197  * number order on the list.  Protected by cgroup_mutex.
 198  */
 199 static u64 css_serial_nr_next = 1;
 200
 201 /*
 202  * These bitmasks identify subsystems with specific features to avoid
 203  * having to do iterative checks repeatedly.
      *
      * One u16 mask per callback kind named in the variable (fork, exit,
      * release, canfork).
 204  */
 205 static u16 have_fork_callback __read_mostly;
 206 static u16 have_exit_callback __read_mostly;
 207 static u16 have_release_callback __read_mostly;
 208 static u16 have_canfork_callback __read_mostly;
 209
 210 /* cgroup namespace for init task */
 211 struct cgroup_namespace init_cgroup_ns = {
     /* starts with a reference count of 2 and is rooted at init_css_set */
 212         .ns.count       = REFCOUNT_INIT(2),
 213         .user_ns        = &init_user_ns,
 214         .ns.ops         = &cgroupns_operations,
 215         .ns.inum        = PROC_CGROUP_INIT_INO,
 216         .root_cset      = &init_css_set,
 217 };
 218
     /* forward declarations of objects defined later in this file */
 219 static struct file_system_type cgroup2_fs_type;
 220 static struct cftype cgroup_base_files[];
 221 static struct cftype cgroup_psi_files[];
 222
 223 /* cgroup optional features */
     /*
      * OPT_FEATURE_COUNT is always the last enumerator, so it equals the
      * number of features compiled in (0 when CONFIG_PSI is off).
      */
 224 enum cgroup_opt_features {
 225 #ifdef CONFIG_PSI
 226         OPT_FEATURE_PRESSURE,
 227 #endif
 228         OPT_FEATURE_COUNT
 229 };
 230
     /* feature names, in the same order as enum cgroup_opt_features */
 231 static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
 232 #ifdef CONFIG_PSI
 233         "pressure",
 234 #endif
 235 };
 236
     /*
      * NOTE(review): presumably one bit per enum cgroup_opt_features value,
      * set when that feature is disabled - confirm at the users.
      */
 237 static u16 cgroup_feature_disable_mask __read_mostly;
 238
     /* forward declarations of static helpers defined later in this file */
 239 static int cgroup_apply_control(struct cgroup *cgrp);
 240 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 241 static void css_task_iter_skip(struct css_task_iter *it,
 242                                struct task_struct *task);
 243 static int cgroup_destroy_locked(struct cgroup *cgrp);
 244 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
 245                                               struct cgroup_subsys *ss);
 246 static void css_release(struct percpu_ref *ref);
 247 static void kill_css(struct cgroup_subsys_state *css);
 248 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 249                               struct cgroup *cgrp, struct cftype cfts[],
 250                               bool is_add);
 251
     /*
      * With CONFIG_DEBUG_CGROUP_REF, <linux/cgroup_refcnt.h> is included here
      * with its functions marked noinline and exported via EXPORT_SYMBOL_GPL.
      * NOTE(review): presumably this emits out-of-line copies of the
      * refcount helpers so they can be traced - confirm in that header.
      */
 252 #ifdef CONFIG_DEBUG_CGROUP_REF
 253 #define CGROUP_REF_FN_ATTRS     noinline
 254 #define CGROUP_REF_EXPORT(fn)   EXPORT_SYMBOL_GPL(fn);
 255 #include <linux/cgroup_refcnt.h>
 256 #endif
 257
 258 /**
 259  * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
 260  * @ssid: subsys ID of interest
 261  *
 262  * cgroup_subsys_enabled() can only be used with literal subsys names which
 263  * is fine for individual subsystems but unsuitable for cgroup core.  This
 264  * is slower static_key_enabled() based test indexed by @ssid.
 265  */
 266 bool cgroup_ssid_enabled(int ssid)
 267 {
 268         if (!CGROUP_HAS_SUBSYS_CONFIG)
 269                 return false;
 270
 271         return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 272 }
 273
 274 /**
 275  * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
 276  * @cgrp: the cgroup of interest
 277  *
 278  * The default hierarchy is the v2 interface of cgroup and this function
 279  * can be used to test whether a cgroup is on the default hierarchy for
 280  * cases where a subsystem should behave differently depending on the
 281  * interface version.
 282  *
 283  * List of changed behaviors:
 284  *
 285  * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
 286  *   and "name" are disallowed.
 287  *
 288  * - When mounting an existing superblock, mount options should match.
 289  *
 290  * - rename(2) is disallowed.
 291  *
 292  * - "tasks" is removed.  Everything should be at process granularity.  Use
 293  *   "cgroup.procs" instead.
 294  *
 295  * - "cgroup.procs" is not sorted.  pids will be unique unless they got
 296  *   recycled in-between reads.
 297  *
 298  * - "release_agent" and "notify_on_release" are removed.  Replacement
 299  *   notification mechanism will be implemented.
 300  *
 301  * - "cgroup.clone_children" is removed.
 302  *
 303  * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
 304  *   and its descendants contain no task; otherwise, 1.  The file also
 305  *   generates kernfs notification which can be monitored through poll and
 306  *   [di]notify when the value of the file changes.
 307  *
 308  * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
 309  *   take masks of ancestors with non-empty cpus/mems, instead of being
 310  *   moved to an ancestor.
 311  *
 312  * - cpuset: a task can be moved into an empty cpuset, and again it takes
 313  *   masks of ancestors.
 314  *
 315  * - blkcg: blk-throttle becomes properly hierarchical.
 316  */
 317 bool cgroup_on_dfl(const struct cgroup *cgrp)
 318 {
 319         return cgrp->root == &cgrp_dfl_root;
 320 }
 321
 322 /* IDR wrappers which synchronize using cgroup_idr_lock */
 323 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 324                             gfp_t gfp_mask)
 325 {
 326         int ret;
 327
 328         idr_preload(gfp_mask);
 329         spin_lock_bh(&cgroup_idr_lock);
 330         ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
 331         spin_unlock_bh(&cgroup_idr_lock);
 332         idr_preload_end();
 333         return ret;
 334 }
 335
 336 static void *cgroup_idr_replace(struct idr *idr, void *ptr, int id)
 337 {
 338         void *ret;
 339
 340         spin_lock_bh(&cgroup_idr_lock);
 341         ret = idr_replace(idr, ptr, id);
 342         spin_unlock_bh(&cgroup_idr_lock);
 343         return ret;
 344 }
 345
 346 static void cgroup_idr_remove(struct idr *idr, int id)
 347 {
 348         spin_lock_bh(&cgroup_idr_lock);
 349         idr_remove(idr, id);
 350         spin_unlock_bh(&cgroup_idr_lock);
 351 }
 352
 353 static bool cgroup_has_tasks(struct cgroup *cgrp)
 354 {
 355         return cgrp->nr_populated_csets;
 356 }
 357
 358 static bool cgroup_is_threaded(struct cgroup *cgrp)
 359 {
 360         return cgrp->dom_cgrp != cgrp;
 361 }
 362
 363 /* can @cgrp host both domain and threaded children? */
 364 static bool cgroup_is_mixable(struct cgroup *cgrp)
 365 {
 366         /*
 367          * Root isn't under domain level resource control exempting it from
 368          * the no-internal-process constraint, so it can serve as a thread
 369          * root and a parent of resource domains at the same time.
 370          */
 371         return !cgroup_parent(cgrp);
 372 }
 373
 374 /* can @cgrp become a thread root? Should always be true for a thread root */
 375 static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
 376 {
 377         /* mixables don't care */
 378         if (cgroup_is_mixable(cgrp))
 379                 return true;
 380
 381         /* domain roots can't be nested under threaded */
 382         if (cgroup_is_threaded(cgrp))
 383                 return false;
 384
 385         /* can only have either domain or threaded children */
 386         if (cgrp->nr_populated_domain_children)
 387                 return false;
 388
 389         /* and no domain controllers can be enabled */
 390         if (cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
 391                 return false;
 392
 393         return true;
 394 }
 395
 396 /* is @cgrp root of a threaded subtree? */
 397 static bool cgroup_is_thread_root(struct cgroup *cgrp)
 398 {
 399         /* thread root should be a domain */
 400         if (cgroup_is_threaded(cgrp))
 401                 return false;
 402
 403         /* a domain w/ threaded children is a thread root */
 404         if (cgrp->nr_threaded_children)
 405                 return true;
 406
 407         /*
 408          * A domain which has tasks and explicit threaded controllers
 409          * enabled is a thread root.
 410          */
 411         if (cgroup_has_tasks(cgrp) &&
 412             (cgrp->subtree_control & cgrp_dfl_threaded_ss_mask))
 413                 return true;
 414
 415         return false;
 416 }
 417
 418 /* a domain which isn't connected to the root w/o brekage can't be used */
 419 static bool cgroup_is_valid_domain(struct cgroup *cgrp)
 420 {
 421         /* the cgroup itself can be a thread root */
 422         if (cgroup_is_threaded(cgrp))
 423                 return false;
 424
 425         /* but the ancestors can't be unless mixable */
 426         while ((cgrp = cgroup_parent(cgrp))) {
 427                 if (!cgroup_is_mixable(cgrp) && cgroup_is_thread_root(cgrp))
 428                         return false;
 429                 if (cgroup_is_threaded(cgrp))
 430                         return false;
 431         }
 432
 433         return true;
 434 }
 435
 436 /* subsystems visibly enabled on a cgroup */
 437 static u16 cgroup_control(struct cgroup *cgrp)
 438 {
 439         struct cgroup *parent = cgroup_parent(cgrp);
 440         u16 root_ss_mask = cgrp->root->subsys_mask;
 441
 442         if (parent) {
 443                 u16 ss_mask = parent->subtree_control;
 444
 445                 /* threaded cgroups can only have threaded controllers */
 446                 if (cgroup_is_threaded(cgrp))
 447                         ss_mask &= cgrp_dfl_threaded_ss_mask;
 448                 return ss_mask;
 449         }
 450
 451         if (cgroup_on_dfl(cgrp))
 452                 root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask |
 453                                   cgrp_dfl_implicit_ss_mask);
 454         return root_ss_mask;
 455 }
 456
 457 /* subsystems enabled on a cgroup */
 458 static u16 cgroup_ss_mask(struct cgroup *cgrp)
 459 {
 460         struct cgroup *parent = cgroup_parent(cgrp);
 461
 462         if (parent) {
 463                 u16 ss_mask = parent->subtree_ss_mask;
 464
 465                 /* threaded cgroups can only have threaded controllers */
 466                 if (cgroup_is_threaded(cgrp))
 467                         ss_mask &= cgrp_dfl_threaded_ss_mask;
 468                 return ss_mask;
 469         }
 470
 471         return cgrp->root->subsys_mask;
 472 }
 473
 474 /**
 475  * cgroup_css - obtain a cgroup's css for the specified subsystem
 476  * @cgrp: the cgroup of interest
 477  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 478  *
 479  * Return @cgrp's css (cgroup_subsys_state) associated with @ss.  This
 480  * function must be called either under cgroup_mutex or rcu_read_lock() and
 481  * the caller is responsible for pinning the returned css if it wants to
 482  * keep accessing it outside the said locks.  This function may return
 483  * %NULL if @cgrp doesn't have @subsys_id enabled.
 484  */
 485 static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 486                                               struct cgroup_subsys *ss)
 487 {
 488         if (CGROUP_HAS_SUBSYS_CONFIG && ss)
 489                 return rcu_dereference_check(cgrp->subsys[ss->id],
 490                                         lockdep_is_held(&cgroup_mutex));
 491         else
 492                 return &cgrp->self;
 493 }
 494
 495 /**
 496  * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
 497  * @cgrp: the cgroup of interest
 498  * @ss: the subsystem of interest
 499  *
 500  * Find and get @cgrp's css associated with @ss.  If the css doesn't exist
 501  * or is offline, %NULL is returned.
 502  */
 503 static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
 504                                                      struct cgroup_subsys *ss)
 505 {
 506         struct cgroup_subsys_state *css;
 507
 508         rcu_read_lock();
 509         css = cgroup_css(cgrp, ss);
 510         if (css && !css_tryget_online(css))
 511                 css = NULL;
 512         rcu_read_unlock();
 513
 514         return css;
 515 }
 516
 517 /**
 518  * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss
 519  * @cgrp: the cgroup of interest
 520  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
 521  *
 522  * Similar to cgroup_css() but returns the effective css, which is defined
 523  * as the matching css of the nearest ancestor including self which has @ss
 524  * enabled.  If @ss is associated with the hierarchy @cgrp is on, this
 525  * function is guaranteed to return non-NULL css.
 526  */
 527 static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
 528                                                         struct cgroup_subsys *ss)
 529 {
 530         lockdep_assert_held(&cgroup_mutex);
 531
 532         if (!ss)
 533                 return &cgrp->self;
 534
 535         /*
 536          * This function is used while updating css associations and thus
 537          * can't test the csses directly.  Test ss_mask.
 538          */
 539         while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) {
 540                 cgrp = cgroup_parent(cgrp);
 541                 if (!cgrp)
 542                         return NULL;
 543         }
 544
 545         return cgroup_css(cgrp, ss);
 546 }
 547
 548 /**
 549  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
 550  * @cgrp: the cgroup of interest
 551  * @ss: the subsystem of interest
 552  *
 553  * Find and get the effective css of @cgrp for @ss.  The effective css is
 554  * defined as the matching css of the nearest ancestor including self which
 555  * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 556  * the root css is returned, so this function always returns a valid css.
 557  *
 558  * The returned css is not guaranteed to be online, and therefore it is the
 559  * callers responsibility to try get a reference for it.
 560  */
 561 struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
 562                                          struct cgroup_subsys *ss)
 563 {
 564         struct cgroup_subsys_state *css;
 565
 566         if (!CGROUP_HAS_SUBSYS_CONFIG)
 567                 return NULL;
 568
 569         do {
 570                 css = cgroup_css(cgrp, ss);
 571
 572                 if (css)
 573                         return css;
 574                 cgrp = cgroup_parent(cgrp);
 575         } while (cgrp);
 576
 577         return init_css_set.subsys[ss->id];
 578 }
 579
 580 /**
 581  * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
 582  * @cgrp: the cgroup of interest
 583  * @ss: the subsystem of interest
 584  *
 585  * Find and get the effective css of @cgrp for @ss.  The effective css is
 586  * defined as the matching css of the nearest ancestor including self which
 587  * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
 588  * the root css is returned, so this function always returns a valid css.
 589  * The returned css must be put using css_put().
 590  */
 591 struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
 592                                              struct cgroup_subsys *ss)
 593 {
 594         struct cgroup_subsys_state *css;
 595
 596         if (!CGROUP_HAS_SUBSYS_CONFIG)
 597                 return NULL;
 598
 599         rcu_read_lock();
 600
 601         do {
 602                 css = cgroup_css(cgrp, ss);
 603
 604                 if (css && css_tryget_online(css))
 605                         goto out_unlock;
 606                 cgrp = cgroup_parent(cgrp);
 607         } while (cgrp);
 608
 609         css = init_css_set.subsys[ss->id];
 610         css_get(css);
 611 out_unlock:
 612         rcu_read_unlock();
 613         return css;
 614 }
 615 EXPORT_SYMBOL_GPL(cgroup_get_e_css);
 616
 617 static void cgroup_get_live(struct cgroup *cgrp)
 618 {
 619         WARN_ON_ONCE(cgroup_is_dead(cgrp));
 620         cgroup_get(cgrp);
 621 }
 622
 623 /**
 624  * __cgroup_task_count - count the number of tasks in a cgroup. The caller
 625  * is responsible for taking the css_set_lock.
 626  * @cgrp: the cgroup in question
 627  */
 628 int __cgroup_task_count(const struct cgroup *cgrp)
 629 {
 630         int count = 0;
 631         struct cgrp_cset_link *link;
 632
 633         lockdep_assert_held(&css_set_lock);
 634
 635         list_for_each_entry(link, &cgrp->cset_links, cset_link)
 636                 count += link->cset->nr_tasks;
 637
 638         return count;
 639 }
 640
 641 /**
 642  * cgroup_task_count - count the number of tasks in a cgroup.
 643  * @cgrp: the cgroup in question
 644  */
 645 int cgroup_task_count(const struct cgroup *cgrp)
 646 {
 647         int count;
 648
 649         spin_lock_irq(&css_set_lock);
 650         count = __cgroup_task_count(cgrp);
 651         spin_unlock_irq(&css_set_lock);
 652
 653         return count;
 654 }
 655
 656 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 657 {
 658         struct cgroup *cgrp = of->kn->parent->priv;
 659         struct cftype *cft = of_cft(of);
 660
 661         /*
 662          * This is open and unprotected implementation of cgroup_css().
 663          * seq_css() is only called from a kernfs file operation which has
 664          * an active reference on the file.  Because all the subsystem
 665          * files are drained before a css is disassociated with a cgroup,
 666          * the matching css from the cgroup's subsys table is guaranteed to
 667          * be and stay valid until the enclosing operation is complete.
 668          */
 669         if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss)
 670                 return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
 671         else
 672                 return &cgrp->self;
 673 }
 674 EXPORT_SYMBOL_GPL(of_css);
 675
 676 /**
 677  * for_each_css - iterate all css's of a cgroup
 678  * @css: the iteration cursor
 679  * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end
 680  * @cgrp: the target cgroup to iterate css's of
 681  *
      * Subsystem slots whose css pointer is %NULL are skipped; the statement
      * following the macro only runs for slots which hold a css.
      *
 682  * Should be called under cgroup_mutex; the dereference is lockdep-checked
      * against it through rcu_dereference_check().
 683  */
 684 #define for_each_css(css, ssid, cgrp)                                   \
 685         for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++)        \
 686                 if (!((css) = rcu_dereference_check(                    \
 687                                 (cgrp)->subsys[(ssid)],                 \
 688                                 lockdep_is_held(&cgroup_mutex)))) { }   \
 689                 else
 690
 691 /**
 692  * do_each_subsys_mask - filter for_each_subsys with a bitmask
 693  * @ss: the iteration cursor
 694  * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 695  * @ss_mask: the bitmask
 696  *
 697  * The block will only run for cases where the ssid-th bit (1 << ssid) of
 698  * @ss_mask is set.
      *
      * @ss_mask is evaluated once into a local.  When no subsystem is
      * configured, @ssid is set to 0 and the block never runs.
      *
      * The macro leaves a do { }, a loop and a block open, so every use must
      * be closed with while_each_subsys_mask().
 699  */
 700 #define do_each_subsys_mask(ss, ssid, ss_mask) do {                     \
 701         unsigned long __ss_mask = (ss_mask);                            \
 702         if (!CGROUP_HAS_SUBSYS_CONFIG) {                                \
 703                 (ssid) = 0;                                             \
 704                 break;                                                  \
 705         }                                                               \
 706         for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) {       \
 707                 (ss) = cgroup_subsys[ssid];                             \
 708                 {
 709
     /* closes the block, the loop and the do-while opened by do_each_subsys_mask() */
 710 #define while_each_subsys_mask()                                        \
 711                 }                                                       \
 712         }                                                               \
 713 } while (false)
 714
 715 /*
      * iterate over child cgrps, lock should be held throughout iteration
      *
      * Walks @cgrp's ->self.children list and skips children for which
      * cgroup_is_dead() is true.  Lockdep asserts that cgroup_mutex is held
      * on every step.
      */
 716 #define cgroup_for_each_live_child(child, cgrp)                         \
 717         list_for_each_entry((child), &(cgrp)->self.children, self.sibling) \
 718                 if (({ lockdep_assert_held(&cgroup_mutex);              \
 719                        cgroup_is_dead(child); }))                       \
 720                         ;                                               \
 721                 else
 722
 723 /*
      * walk live descendants in pre order
      *
      * Iterates with css_for_each_descendant_pre() starting from @cgrp's
      * ->self css.  @d_css is the css cursor and @dsct is set to its cgroup
      * on every step; cgroups for which cgroup_is_dead() is true are
      * skipped.  Lockdep asserts that cgroup_mutex is held on every step.
      */
 724 #define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)          \
 725         css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL))  \
 726                 if (({ lockdep_assert_held(&cgroup_mutex);              \
 727                        (dsct) = (d_css)->cgroup;                        \
 728                        cgroup_is_dead(dsct); }))                        \
 729                         ;                                               \
 730                 else
 731
 732 /*
      * walk live descendants in postorder
      *
      * Same as cgroup_for_each_live_descendant_pre() but iterates with
      * css_for_each_descendant_post().  @dsct is set to the cgroup of the
      * @d_css cursor on every step, dead cgroups are skipped and lockdep
      * asserts that cgroup_mutex is held.
      */
 733 #define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp)         \
 734         css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \
 735                 if (({ lockdep_assert_held(&cgroup_mutex);              \
 736                        (dsct) = (d_css)->cgroup;                        \
 737                        cgroup_is_dead(dsct); }))                        \
 738                         ;                                               \
 739                 else
 740
 741 /*
 742  * The default css_set - used by init and its children prior to any
 743  * hierarchies being mounted. It contains a pointer to the root state
 744  * for each subsystem. Also used to anchor the list of css_sets. Not
 745  * reference-counted, to improve performance when child cgroups
 746  * haven't been created.
 747  */
 748 struct css_set init_css_set = {
 749         .refcount               = REFCOUNT_INIT(1),
     /* points at itself, so css_set_threaded() is false for init_css_set */
 750         .dom_cset               = &init_css_set,
     /* every list head below starts out empty */
 751         .tasks                  = LIST_HEAD_INIT(init_css_set.tasks),
 752         .mg_tasks               = LIST_HEAD_INIT(init_css_set.mg_tasks),
 753         .dying_tasks            = LIST_HEAD_INIT(init_css_set.dying_tasks),
 754         .task_iters             = LIST_HEAD_INIT(init_css_set.task_iters),
 755         .threaded_csets         = LIST_HEAD_INIT(init_css_set.threaded_csets),
 756         .cgrp_links             = LIST_HEAD_INIT(init_css_set.cgrp_links),
 757         .mg_src_preload_node    = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
 758         .mg_dst_preload_node    = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
 759         .mg_node                = LIST_HEAD_INIT(init_css_set.mg_node),
 760
 761         /*
 762          * The following field is re-initialized when this cset gets linked
 763          * in cgroup_init().  However, let's initialize the field
 764          * statically too so that the default cgroup can be accessed safely
 765          * early during boot.
 766          */
 767         .dfl_cgrp               = &cgrp_dfl_root.cgrp,
 768 };
 769
     /* number of css_sets; starts at 1 because init_css_set always exists */
 770 static int css_set_count        = 1;    /* 1 for init_css_set */
 771
 772 static bool css_set_threaded(struct css_set *cset)
 773 {
 774         return cset->dom_cset != cset;
 775 }
 776
 777 /**
 778  * css_set_populated - does a css_set contain any tasks?
 779  * @cset: target css_set
 780  *
 781  * css_set_populated() should be the same as !!cset->nr_tasks at steady
 782  * state. However, css_set_populated() can be called while a task is being
 783  * added to or removed from the linked list before the nr_tasks is
 784  * properly updated. Hence, we can't just look at ->nr_tasks here.
 785  */
 786 static bool css_set_populated(struct css_set *cset)
 787 {
 788         lockdep_assert_held(&css_set_lock);
 789
 790         return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
 791 }
 792
 793 /**
 794  * cgroup_update_populated - update the populated count of a cgroup
 795  * @cgrp: the target cgroup
 796  * @populated: inc or dec populated count
 797  *
 798  * One of the css_sets associated with @cgrp is either getting its first
 799  * task or losing the last.  Update @cgrp->nr_populated_* accordingly.  The
 800  * count is propagated towards root so that a given cgroup's
 801  * nr_populated_children is zero iff none of its descendants contain any
 802  * tasks.
 803  *
 804  * @cgrp's interface file "cgroup.populated" is zero if both
 805  * @cgrp->nr_populated_csets and @cgrp->nr_populated_children are zero and
 806  * 1 otherwise.  When the sum changes from or to zero, userland is notified
 807  * that the content of the interface file has changed.  This can be used to
 808  * detect when @cgrp and its descendants become populated or empty.
 809  */
 810 static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 811 {
 812         struct cgroup *child = NULL;
 813         int adj = populated ? 1 : -1;
 814
 815         lockdep_assert_held(&css_set_lock);
 816
 817         do {
 818                 bool was_populated = cgroup_is_populated(cgrp);
 819
 820                 if (!child) {
 821                         cgrp->nr_populated_csets += adj;
 822                 } else {
 823                         if (cgroup_is_threaded(child))
 824                                 cgrp->nr_populated_threaded_children += adj;
 825                         else
 826                                 cgrp->nr_populated_domain_children += adj;
 827                 }
 828
 829                 if (was_populated == cgroup_is_populated(cgrp))
 830                         break;
 831
 832                 cgroup1_check_for_release(cgrp);
 833                 TRACE_CGROUP_PATH(notify_populated, cgrp,
 834                                   cgroup_is_populated(cgrp));
 835                 cgroup_file_notify(&cgrp->events_file);
 836
 837                 child = cgrp;
 838                 cgrp = cgroup_parent(cgrp);
 839         } while (cgrp);
 840 }
 841
 842 /**
 843  * css_set_update_populated - update populated state of a css_set
 844  * @cset: target css_set
 845  * @populated: whether @cset is populated or depopulated
 846  *
 847  * @cset is either getting the first task or losing the last.  Update the
 848  * populated counters of all associated cgroups accordingly.
 849  */
 850 static void css_set_update_populated(struct css_set *cset, bool populated)
 851 {
 852         struct cgrp_cset_link *link;
 853
 854         lockdep_assert_held(&css_set_lock);
 855
 856         list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
 857                 cgroup_update_populated(link->cgrp, populated);
 858 }
 859
 860 /*
 861  * @task is leaving, advance task iterators which are pointing to it so
 862  * that they can resume at the next position.  Advancing an iterator might
 863  * remove it from the list, use safe walk.  See css_task_iter_skip() for
 864  * details.
 865  */
 866 static void css_set_skip_task_iters(struct css_set *cset,
 867                                     struct task_struct *task)
 868 {
 869         struct css_task_iter *it, *pos;
 870
 871         list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node)
 872                 css_task_iter_skip(it, task);
 873 }
 874
 875 /**
 876  * css_set_move_task - move a task from one css_set to another
 877  * @task: task being moved
 878  * @from_cset: css_set @task currently belongs to (may be NULL)
 879  * @to_cset: new css_set @task is being moved to (may be NULL)
 880  * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
 881  *
 882  * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
 883  * css_set, @from_cset can be NULL.  If @task is being disassociated
 884  * instead of moved, @to_cset can be NULL.
 885  *
 886  * This function automatically handles populated counter updates and
 887  * css_task_iter adjustments but the caller is responsible for managing
 888  * @from_cset and @to_cset's reference counts.
 889  */
 890 static void css_set_move_task(struct task_struct *task,
 891                               struct css_set *from_cset, struct css_set *to_cset,
 892                               bool use_mg_tasks)
 893 {
 894         lockdep_assert_held(&css_set_lock);
 895
 896         if (to_cset && !css_set_populated(to_cset))
 897                 css_set_update_populated(to_cset, true);
 898
 899         if (from_cset) {
 900                 WARN_ON_ONCE(list_empty(&task->cg_list));
 901
 902                 css_set_skip_task_iters(from_cset, task);
 903                 list_del_init(&task->cg_list);
 904                 if (!css_set_populated(from_cset))
 905                         css_set_update_populated(from_cset, false);
 906         } else {
 907                 WARN_ON_ONCE(!list_empty(&task->cg_list));
 908         }
 909
 910         if (to_cset) {
 911                 /*
 912                  * We are synchronized through cgroup_threadgroup_rwsem
 913                  * against PF_EXITING setting such that we can't race
 914                  * against cgroup_exit()/cgroup_free() dropping the css_set.
 915                  */
 916                 WARN_ON_ONCE(task->flags & PF_EXITING);
 917
 918                 cgroup_move_task(task, to_cset);
 919                 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
 920                                                              &to_cset->tasks);
 921         }
 922 }
 923
/*
 * Hash table of css_sets.  Entries are keyed by css_set_hash() of the
 * css_set's subsystem state pointers, which speeds up finding an existing
 * css_set.  This hash doesn't (currently) take into account cgroups in
 * empty hierarchies.
 */
#define CSS_SET_HASH_BITS	7
static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS);
 931
 932 static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
 933 {
 934         unsigned long key = 0UL;
 935         struct cgroup_subsys *ss;
 936         int i;
 937
 938         for_each_subsys(ss, i)
 939                 key += (unsigned long)css[i];
 940         key = (key >> 16) ^ key;
 941
 942         return key;
 943 }
 944
 945 void put_css_set_locked(struct css_set *cset)
 946 {
 947         struct cgrp_cset_link *link, *tmp_link;
 948         struct cgroup_subsys *ss;
 949         int ssid;
 950
 951         lockdep_assert_held(&css_set_lock);
 952
 953         if (!refcount_dec_and_test(&cset->refcount))
 954                 return;
 955
 956         WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
 957
 958         /* This css_set is dead. Unlink it and release cgroup and css refs */
 959         for_each_subsys(ss, ssid) {
 960                 list_del(&cset->e_cset_node[ssid]);
 961                 css_put(cset->subsys[ssid]);
 962         }
 963         hash_del(&cset->hlist);
 964         css_set_count--;
 965
 966         list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 967                 list_del(&link->cset_link);
 968                 list_del(&link->cgrp_link);
 969                 if (cgroup_parent(link->cgrp))
 970                         cgroup_put(link->cgrp);
 971                 kfree(link);
 972         }
 973
 974         if (css_set_threaded(cset)) {
 975                 list_del(&cset->threaded_csets_node);
 976                 put_css_set_locked(cset->dom_cset);
 977         }
 978
 979         kfree_rcu(cset, rcu_head);
 980 }
 981
 982 /**
 983  * compare_css_sets - helper function for find_existing_css_set().
 984  * @cset: candidate css_set being tested
 985  * @old_cset: existing css_set for a task
 986  * @new_cgrp: cgroup that's being entered by the task
 987  * @template: desired set of css pointers in css_set (pre-calculated)
 988  *
 989  * Returns true if "cset" matches "old_cset" except for the hierarchy
 990  * which "new_cgrp" belongs to, for which it should match "new_cgrp".
 991  */
 992 static bool compare_css_sets(struct css_set *cset,
 993                              struct css_set *old_cset,
 994                              struct cgroup *new_cgrp,
 995                              struct cgroup_subsys_state *template[])
 996 {
 997         struct cgroup *new_dfl_cgrp;
 998         struct list_head *l1, *l2;
 999
1000         /*
1001          * On the default hierarchy, there can be csets which are
1002          * associated with the same set of cgroups but different csses.
1003          * Let's first ensure that csses match.
1004          */
1005         if (memcmp(template, cset->subsys, sizeof(cset->subsys)))
1006                 return false;
1007
1008
1009         /* @cset's domain should match the default cgroup's */
1010         if (cgroup_on_dfl(new_cgrp))
1011                 new_dfl_cgrp = new_cgrp;
1012         else
1013                 new_dfl_cgrp = old_cset->dfl_cgrp;
1014
1015         if (new_dfl_cgrp->dom_cgrp != cset->dom_cset->dfl_cgrp)
1016                 return false;
1017
1018         /*
1019          * Compare cgroup pointers in order to distinguish between
1020          * different cgroups in hierarchies.  As different cgroups may
1021          * share the same effective css, this comparison is always
1022          * necessary.
1023          */
1024         l1 = &cset->cgrp_links;
1025         l2 = &old_cset->cgrp_links;
1026         while (1) {
1027                 struct cgrp_cset_link *link1, *link2;
1028                 struct cgroup *cgrp1, *cgrp2;
1029
1030                 l1 = l1->next;
1031                 l2 = l2->next;
1032                 /* See if we reached the end - both lists are equal length. */
1033                 if (l1 == &cset->cgrp_links) {
1034                         BUG_ON(l2 != &old_cset->cgrp_links);
1035                         break;
1036                 } else {
1037                         BUG_ON(l2 == &old_cset->cgrp_links);
1038                 }
1039                 /* Locate the cgroups associated with these links. */
1040                 link1 = list_entry(l1, struct cgrp_cset_link, cgrp_link);
1041                 link2 = list_entry(l2, struct cgrp_cset_link, cgrp_link);
1042                 cgrp1 = link1->cgrp;
1043                 cgrp2 = link2->cgrp;
1044                 /* Hierarchies should be linked in the same order. */
1045                 BUG_ON(cgrp1->root != cgrp2->root);
1046
1047                 /*
1048                  * If this hierarchy is the hierarchy of the cgroup
1049                  * that's changing, then we need to check that this
1050                  * css_set points to the new cgroup; if it's any other
1051                  * hierarchy, then this css_set should point to the
1052                  * same cgroup as the old css_set.
1053                  */
1054                 if (cgrp1->root == new_cgrp->root) {
1055                         if (cgrp1 != new_cgrp)
1056                                 return false;
1057                 } else {
1058                         if (cgrp1 != cgrp2)
1059                                 return false;
1060                 }
1061         }
1062         return true;
1063 }
1064
1065 /**
1066  * find_existing_css_set - init css array and find the matching css_set
1067  * @old_cset: the css_set that we're using before the cgroup transition
1068  * @cgrp: the cgroup that we're moving into
1069  * @template: out param for the new set of csses, should be clear on entry
1070  */
1071 static struct css_set *find_existing_css_set(struct css_set *old_cset,
1072                                         struct cgroup *cgrp,
1073                                         struct cgroup_subsys_state *template[])
1074 {
1075         struct cgroup_root *root = cgrp->root;
1076         struct cgroup_subsys *ss;
1077         struct css_set *cset;
1078         unsigned long key;
1079         int i;
1080
1081         /*
1082          * Build the set of subsystem state objects that we want to see in the
1083          * new css_set. While subsystems can change globally, the entries here
1084          * won't change, so no need for locking.
1085          */
1086         for_each_subsys(ss, i) {
1087                 if (root->subsys_mask & (1UL << i)) {
1088                         /*
1089                          * @ss is in this hierarchy, so we want the
1090                          * effective css from @cgrp.
1091                          */
1092                         template[i] = cgroup_e_css_by_mask(cgrp, ss);
1093                 } else {
1094                         /*
1095                          * @ss is not in this hierarchy, so we don't want
1096                          * to change the css.
1097                          */
1098                         template[i] = old_cset->subsys[i];
1099                 }
1100         }
1101
1102         key = css_set_hash(template);
1103         hash_for_each_possible(css_set_table, cset, hlist, key) {
1104                 if (!compare_css_sets(cset, old_cset, cgrp, template))
1105                         continue;
1106
1107                 /* This css_set matches what we need */
1108                 return cset;
1109         }
1110
1111         /* No existing cgroup group matched */
1112         return NULL;
1113 }
1114
1115 static void free_cgrp_cset_links(struct list_head *links_to_free)
1116 {
1117         struct cgrp_cset_link *link, *tmp_link;
1118
1119         list_for_each_entry_safe(link, tmp_link, links_to_free, cset_link) {
1120                 list_del(&link->cset_link);
1121                 kfree(link);
1122         }
1123 }
1124
1125 /**
1126  * allocate_cgrp_cset_links - allocate cgrp_cset_links
1127  * @count: the number of links to allocate
1128  * @tmp_links: list_head the allocated links are put on
1129  *
1130  * Allocate @count cgrp_cset_link structures and chain them on @tmp_links
1131  * through ->cset_link.  Returns 0 on success or -errno.
1132  */
1133 static int allocate_cgrp_cset_links(int count, struct list_head *tmp_links)
1134 {
1135         struct cgrp_cset_link *link;
1136         int i;
1137
1138         INIT_LIST_HEAD(tmp_links);
1139
1140         for (i = 0; i < count; i++) {
1141                 link = kzalloc(sizeof(*link), GFP_KERNEL);
1142                 if (!link) {
1143                         free_cgrp_cset_links(tmp_links);
1144                         return -ENOMEM;
1145                 }
1146                 list_add(&link->cset_link, tmp_links);
1147         }
1148         return 0;
1149 }
1150
1151 /**
1152  * link_css_set - a helper function to link a css_set to a cgroup
1153  * @tmp_links: cgrp_cset_link objects allocated by allocate_cgrp_cset_links()
1154  * @cset: the css_set to be linked
1155  * @cgrp: the destination cgroup
1156  */
1157 static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
1158                          struct cgroup *cgrp)
1159 {
1160         struct cgrp_cset_link *link;
1161
1162         BUG_ON(list_empty(tmp_links));
1163
1164         if (cgroup_on_dfl(cgrp))
1165                 cset->dfl_cgrp = cgrp;
1166
1167         link = list_first_entry(tmp_links, struct cgrp_cset_link, cset_link);
1168         link->cset = cset;
1169         link->cgrp = cgrp;
1170
1171         /*
1172          * Always add links to the tail of the lists so that the lists are
1173          * in chronological order.
1174          */
1175         list_move_tail(&link->cset_link, &cgrp->cset_links);
1176         list_add_tail(&link->cgrp_link, &cset->cgrp_links);
1177
1178         if (cgroup_parent(cgrp))
1179                 cgroup_get_live(cgrp);
1180 }
1181
1182 /**
1183  * find_css_set - return a new css_set with one cgroup updated
1184  * @old_cset: the baseline css_set
1185  * @cgrp: the cgroup to be updated
1186  *
1187  * Return a new css_set that's equivalent to @old_cset, but with @cgrp
1188  * substituted into the appropriate hierarchy.
1189  */
1190 static struct css_set *find_css_set(struct css_set *old_cset,
1191                                     struct cgroup *cgrp)
1192 {
1193         struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT] = { };
1194         struct css_set *cset;
1195         struct list_head tmp_links;
1196         struct cgrp_cset_link *link;
1197         struct cgroup_subsys *ss;
1198         unsigned long key;
1199         int ssid;
1200
1201         lockdep_assert_held(&cgroup_mutex);
1202
1203         /* First see if we already have a cgroup group that matches
1204          * the desired set */
1205         spin_lock_irq(&css_set_lock);
1206         cset = find_existing_css_set(old_cset, cgrp, template);
1207         if (cset)
1208                 get_css_set(cset);
1209         spin_unlock_irq(&css_set_lock);
1210
1211         if (cset)
1212                 return cset;
1213
1214         cset = kzalloc(sizeof(*cset), GFP_KERNEL);
1215         if (!cset)
1216                 return NULL;
1217
1218         /* Allocate all the cgrp_cset_link objects that we'll need */
1219         if (allocate_cgrp_cset_links(cgroup_root_count, &tmp_links) < 0) {
1220                 kfree(cset);
1221                 return NULL;
1222         }
1223
1224         refcount_set(&cset->refcount, 1);
1225         cset->dom_cset = cset;
1226         INIT_LIST_HEAD(&cset->tasks);
1227         INIT_LIST_HEAD(&cset->mg_tasks);
1228         INIT_LIST_HEAD(&cset->dying_tasks);
1229         INIT_LIST_HEAD(&cset->task_iters);
1230         INIT_LIST_HEAD(&cset->threaded_csets);
1231         INIT_HLIST_NODE(&cset->hlist);
1232         INIT_LIST_HEAD(&cset->cgrp_links);
1233         INIT_LIST_HEAD(&cset->mg_src_preload_node);
1234         INIT_LIST_HEAD(&cset->mg_dst_preload_node);
1235         INIT_LIST_HEAD(&cset->mg_node);
1236
1237         /* Copy the set of subsystem state objects generated in
1238          * find_existing_css_set() */
1239         memcpy(cset->subsys, template, sizeof(cset->subsys));
1240
1241         spin_lock_irq(&css_set_lock);
1242         /* Add reference counts and links from the new css_set. */
1243         list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
1244                 struct cgroup *c = link->cgrp;
1245
1246                 if (c->root == cgrp->root)
1247                         c = cgrp;
1248                 link_css_set(&tmp_links, cset, c);
1249         }
1250
1251         BUG_ON(!list_empty(&tmp_links));
1252
1253         css_set_count++;
1254
1255         /* Add @cset to the hash table */
1256         key = css_set_hash(cset->subsys);
1257         hash_add(css_set_table, &cset->hlist, key);
1258
1259         for_each_subsys(ss, ssid) {
1260                 struct cgroup_subsys_state *css = cset->subsys[ssid];
1261
1262                 list_add_tail(&cset->e_cset_node[ssid],
1263                               &css->cgroup->e_csets[ssid]);
1264                 css_get(css);
1265         }
1266
1267         spin_unlock_irq(&css_set_lock);
1268
1269         /*
1270          * If @cset should be threaded, look up the matching dom_cset and
1271          * link them up.  We first fully initialize @cset then look for the
1272          * dom_cset.  It's simpler this way and safe as @cset is guaranteed
1273          * to stay empty until we return.
1274          */
1275         if (cgroup_is_threaded(cset->dfl_cgrp)) {
1276                 struct css_set *dcset;
1277
1278                 dcset = find_css_set(cset, cset->dfl_cgrp->dom_cgrp);
1279                 if (!dcset) {
1280                         put_css_set(cset);
1281                         return NULL;
1282                 }
1283
1284                 spin_lock_irq(&css_set_lock);
1285                 cset->dom_cset = dcset;
1286                 list_add_tail(&cset->threaded_csets_node,
1287                               &dcset->threaded_csets);
1288                 spin_unlock_irq(&css_set_lock);
1289         }
1290
1291         return cset;
1292 }
1293
1294 struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
1295 {
1296         struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;
1297
1298         return root_cgrp->root;
1299 }
1300
1301 void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
1302 {
1303         bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
1304
1305         /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
1306         if (favor && !favoring) {
1307                 rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
1308                 root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
1309         } else if (!favor && favoring) {
1310                 rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
1311                 root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
1312         }
1313 }
1314
1315 static int cgroup_init_root_id(struct cgroup_root *root)
1316 {
1317         int id;
1318
1319         lockdep_assert_held(&cgroup_mutex);
1320
1321         id = idr_alloc_cyclic(&cgroup_hierarchy_idr, root, 0, 0, GFP_KERNEL);
1322         if (id < 0)
1323                 return id;
1324
1325         root->hierarchy_id = id;
1326         return 0;
1327 }
1328
1329 static void cgroup_exit_root_id(struct cgroup_root *root)
1330 {
1331         lockdep_assert_held(&cgroup_mutex);
1332
1333         idr_remove(&cgroup_hierarchy_idr, root->hierarchy_id);
1334 }
1335
/*
 * Free the memory of @root with kfree().  Nothing else is released or
 * unlinked here.
 */
void cgroup_free_root(struct cgroup_root *root)
{
	kfree(root);
}
1340
1341 static void cgroup_destroy_root(struct cgroup_root *root)
1342 {
1343         struct cgroup *cgrp = &root->cgrp;
1344         struct cgrp_cset_link *link, *tmp_link;
1345
1346         trace_cgroup_destroy_root(root);
1347
1348         cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp);
1349
1350         BUG_ON(atomic_read(&root->nr_cgrps));
1351         BUG_ON(!list_empty(&cgrp->self.children));
1352
1353         /* Rebind all subsystems back to the default hierarchy */
1354         WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask));
1355
1356         /*
1357          * Release all the links from cset_links to this hierarchy's
1358          * root cgroup
1359          */
1360         spin_lock_irq(&css_set_lock);
1361
1362         list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
1363                 list_del(&link->cset_link);
1364                 list_del(&link->cgrp_link);
1365                 kfree(link);
1366         }
1367
1368         spin_unlock_irq(&css_set_lock);
1369
1370         if (!list_empty(&root->root_list)) {
1371                 list_del(&root->root_list);
1372                 cgroup_root_count--;
1373         }
1374
1375         cgroup_favor_dynmods(root, false);
1376         cgroup_exit_root_id(root);
1377
1378         cgroup_unlock();
1379
1380         cgroup_rstat_exit(cgrp);
1381         kernfs_destroy_root(root->kf_root);
1382         cgroup_free_root(root);
1383 }
1384
1385 /*
1386  * Returned cgroup is without refcount but it's valid as long as cset pins it.
1387  */
1388 static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
1389                                             struct cgroup_root *root)
1390 {
1391         struct cgroup *res_cgroup = NULL;
1392
1393         if (cset == &init_css_set) {
1394                 res_cgroup = &root->cgrp;
1395         } else if (root == &cgrp_dfl_root) {
1396                 res_cgroup = cset->dfl_cgrp;
1397         } else {
1398                 struct cgrp_cset_link *link;
1399                 lockdep_assert_held(&css_set_lock);
1400
1401                 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
1402                         struct cgroup *c = link->cgrp;
1403
1404                         if (c->root == root) {
1405                                 res_cgroup = c;
1406                                 break;
1407                         }
1408                 }
1409         }
1410
1411         BUG_ON(!res_cgroup);
1412         return res_cgroup;
1413 }
1414
1415 /*
1416  * look up cgroup associated with current task's cgroup namespace on the
1417  * specified hierarchy
1418  */
1419 static struct cgroup *
1420 current_cgns_cgroup_from_root(struct cgroup_root *root)
1421 {
1422         struct cgroup *res = NULL;
1423         struct css_set *cset;
1424
1425         lockdep_assert_held(&css_set_lock);
1426
1427         rcu_read_lock();
1428
1429         cset = current->nsproxy->cgroup_ns->root_cset;
1430         res = __cset_cgroup_from_root(cset, root);
1431
1432         rcu_read_unlock();
1433
1434         return res;
1435 }
1436
1437 /*
1438  * Look up cgroup associated with current task's cgroup namespace on the default
1439  * hierarchy.
1440  *
1441  * Unlike current_cgns_cgroup_from_root(), this doesn't need locks:
1442  * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu
1443  *   pointers.
1444  * - css_set_lock is not needed because we just read cset->dfl_cgrp.
1445  * - As a bonus returned cgrp is pinned with the current because it cannot
1446  *   switch cgroup_ns asynchronously.
1447  */
1448 static struct cgroup *current_cgns_cgroup_dfl(void)
1449 {
1450         struct css_set *cset;
1451
1452         if (current->nsproxy) {
1453                 cset = current->nsproxy->cgroup_ns->root_cset;
1454                 return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
1455         } else {
1456                 /*
1457                  * NOTE: This function may be called from bpf_cgroup_from_id()
1458                  * on a task which has already passed exit_task_namespaces() and
1459                  * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all
1460                  * cgroups visible for lookups.
1461                  */
1462                 return &cgrp_dfl_root.cgrp;
1463         }
1464 }
1465
1466 /* look up cgroup associated with given css_set on the specified hierarchy */
1467 static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
1468                                             struct cgroup_root *root)
1469 {
1470         lockdep_assert_held(&cgroup_mutex);
1471         lockdep_assert_held(&css_set_lock);
1472
1473         return __cset_cgroup_from_root(cset, root);
1474 }
1475
/*
 * Return the cgroup @task belongs to in hierarchy @root.  The caller must
 * hold both cgroup_mutex and css_set_lock.
 */
struct cgroup *task_cgroup_from_root(struct task_struct *task,
				     struct cgroup_root *root)
{
	/*
	 * @task doesn't need to be locked: it can't change groups while
	 * css_set_lock is held.
	 */
	struct css_set *cset = task_css_set(task);

	return cset_cgroup_from_root(cset, root);
}
1489
1490 /*
1491  * A task must hold cgroup_mutex to modify cgroups.
1492  *
1493  * Any task can increment and decrement the count field without lock.
1494  * So in general, code holding cgroup_mutex can't rely on the count
1495  * field not changing.  However, if the count goes to zero, then only
1496  * cgroup_attach_task() can increment it again.  Because a count of zero
1497  * means that no tasks are currently attached, therefore there is no
1498  * way a task attached to that cgroup can fork (the other way to
1499  * increment the count).  So code holding cgroup_mutex can safely
1500  * assume that if the count is zero, it will stay zero. Similarly, if
1501  * a task holds cgroup_mutex on a cgroup with zero count, it
1502  * knows that the cgroup won't be removed, as cgroup_rmdir()
1503  * needs that mutex.
1504  *
1505  * A cgroup can only be deleted if both its 'count' of using tasks
1506  * is zero, and its list of 'children' cgroups is empty.  Since all
1507  * tasks in the system use _some_ cgroup, and since there is always at
1508  * least one task in the system (init, pid == 1), therefore, root cgroup
1509  * always has either children cgroups and/or using tasks.  So we don't
1510  * need a special hack to ensure that root cgroup cannot be deleted.
1511  *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cgroup pointer by cgroup_attach_task().
1514  */
1515
1516 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
1517
1518 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1519                               char *buf)
1520 {
1521         struct cgroup_subsys *ss = cft->ss;
1522
1523         if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1524             !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) {
1525                 const char *dbg = (cft->flags & CFTYPE_DEBUG) ? ".__DEBUG__." : "";
1526
1527                 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s%s.%s",
1528                          dbg, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1529                          cft->name);
1530         } else {
1531                 strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1532         }
1533         return buf;
1534 }
1535
1536 /**
1537  * cgroup_file_mode - deduce file mode of a control file
1538  * @cft: the control file in question
1539  *
1540  * S_IRUGO for read, S_IWUSR for write.
1541  */
1542 static umode_t cgroup_file_mode(const struct cftype *cft)
1543 {
1544         umode_t mode = 0;
1545
1546         if (cft->read_u64 || cft->read_s64 || cft->seq_show)
1547                 mode |= S_IRUGO;
1548
1549         if (cft->write_u64 || cft->write_s64 || cft->write) {
1550                 if (cft->flags & CFTYPE_WORLD_WRITABLE)
1551                         mode |= S_IWUGO;
1552                 else
1553                         mode |= S_IWUSR;
1554         }
1555
1556         return mode;
1557 }
1558
1559 /**
1560  * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask
1561  * @subtree_control: the new subtree_control mask to consider
1562  * @this_ss_mask: available subsystems
1563  *
1564  * On the default hierarchy, a subsystem may request other subsystems to be
1565  * enabled together through its ->depends_on mask.  In such cases, more
1566  * subsystems than specified in "cgroup.subtree_control" may be enabled.
1567  *
1568  * This function calculates which subsystems need to be enabled if
1569  * @subtree_control is to be applied while restricted to @this_ss_mask.
1570  */
1571 static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask)
1572 {
1573         u16 cur_ss_mask = subtree_control;
1574         struct cgroup_subsys *ss;
1575         int ssid;
1576
1577         lockdep_assert_held(&cgroup_mutex);
1578
1579         cur_ss_mask |= cgrp_dfl_implicit_ss_mask;
1580
1581         while (true) {
1582                 u16 new_ss_mask = cur_ss_mask;
1583
1584                 do_each_subsys_mask(ss, ssid, cur_ss_mask) {
1585                         new_ss_mask |= ss->depends_on;
1586                 } while_each_subsys_mask();
1587
1588                 /*
1589                  * Mask out subsystems which aren't available.  This can
1590                  * happen only if some depended-upon subsystems were bound
1591                  * to non-default hierarchies.
1592                  */
1593                 new_ss_mask &= this_ss_mask;
1594
1595                 if (new_ss_mask == cur_ss_mask)
1596                         break;
1597                 cur_ss_mask = new_ss_mask;
1598         }
1599
1600         return cur_ss_mask;
1601 }
1602
1603 /**
1604  * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
1605  * @kn: the kernfs_node being serviced
1606  *
1607  * This helper undoes cgroup_kn_lock_live() and should be invoked before
1608  * the method finishes if locking succeeded.  Note that once this function
1609  * returns the cgroup returned by cgroup_kn_lock_live() may become
1610  * inaccessible any time.  If the caller intends to continue to access the
1611  * cgroup, it should pin it before invoking this function.
1612  */
1613 void cgroup_kn_unlock(struct kernfs_node *kn)
1614 {
1615         struct cgroup *cgrp;
1616
1617         if (kernfs_type(kn) == KERNFS_DIR)
1618                 cgrp = kn->priv;
1619         else
1620                 cgrp = kn->parent->priv;
1621
1622         cgroup_unlock();
1623
1624         kernfs_unbreak_active_protection(kn);
1625         cgroup_put(cgrp);
1626 }
1627
1628 /**
1629  * cgroup_kn_lock_live - locking helper for cgroup kernfs methods
1630  * @kn: the kernfs_node being serviced
1631  * @drain_offline: perform offline draining on the cgroup
1632  *
1633  * This helper is to be used by a cgroup kernfs method currently servicing
1634  * @kn.  It breaks the active protection, performs cgroup locking and
1635  * verifies that the associated cgroup is alive.  Returns the cgroup if
1636  * alive; otherwise, %NULL.  A successful return should be undone by a
1637  * matching cgroup_kn_unlock() invocation.  If @drain_offline is %true, the
1638  * cgroup is drained of offlining csses before return.
1639  *
1640  * Any cgroup kernfs method implementation which requires locking the
1641  * associated cgroup should use this helper.  It avoids nesting cgroup
1642  * locking under kernfs active protection and allows all kernfs operations
1643  * including self-removal.
1644  */
1645 struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline)
1646 {
1647         struct cgroup *cgrp;
1648
1649         if (kernfs_type(kn) == KERNFS_DIR)
1650                 cgrp = kn->priv;
1651         else
1652                 cgrp = kn->parent->priv;
1653
1654         /*
1655          * We're gonna grab cgroup_mutex which nests outside kernfs
1656          * active_ref.  cgroup liveliness check alone provides enough
1657          * protection against removal.  Ensure @cgrp stays accessible and
1658          * break the active_ref protection.
1659          */
1660         if (!cgroup_tryget(cgrp))
1661                 return NULL;
1662         kernfs_break_active_protection(kn);
1663
1664         if (drain_offline)
1665                 cgroup_lock_and_drain_offline(cgrp);
1666         else
1667                 cgroup_lock();
1668
1669         if (!cgroup_is_dead(cgrp))
1670                 return cgrp;
1671
1672         cgroup_kn_unlock(kn);
1673         return NULL;
1674 }
1675
1676 static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
1677 {
1678         char name[CGROUP_FILE_NAME_MAX];
1679
1680         lockdep_assert_held(&cgroup_mutex);
1681
1682         if (cft->file_offset) {
1683                 struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss);
1684                 struct cgroup_file *cfile = (void *)css + cft->file_offset;
1685
1686                 spin_lock_irq(&cgroup_file_kn_lock);
1687                 cfile->kn = NULL;
1688                 spin_unlock_irq(&cgroup_file_kn_lock);
1689
1690                 del_timer_sync(&cfile->notify_timer);
1691         }
1692
1693         kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name));
1694 }
1695
1696 /**
1697  * css_clear_dir - remove subsys files in a cgroup directory
1698  * @css: target css
1699  */
1700 static void css_clear_dir(struct cgroup_subsys_state *css)
1701 {
1702         struct cgroup *cgrp = css->cgroup;
1703         struct cftype *cfts;
1704
1705         if (!(css->flags & CSS_VISIBLE))
1706                 return;
1707
1708         css->flags &= ~CSS_VISIBLE;
1709
1710         if (!css->ss) {
1711                 if (cgroup_on_dfl(cgrp)) {
1712                         cgroup_addrm_files(css, cgrp,
1713                                            cgroup_base_files, false);
1714                         if (cgroup_psi_enabled())
1715                                 cgroup_addrm_files(css, cgrp,
1716                                                    cgroup_psi_files, false);
1717                 } else {
1718                         cgroup_addrm_files(css, cgrp,
1719                                            cgroup1_base_files, false);
1720                 }
1721         } else {
1722                 list_for_each_entry(cfts, &css->ss->cfts, node)
1723                         cgroup_addrm_files(css, cgrp, cfts, false);
1724         }
1725 }
1726
1727 /**
1728  * css_populate_dir - create subsys files in a cgroup directory
1729  * @css: target css
1730  *
1731  * On failure, no file is added.
1732  */
1733 static int css_populate_dir(struct cgroup_subsys_state *css)
1734 {
1735         struct cgroup *cgrp = css->cgroup;
1736         struct cftype *cfts, *failed_cfts;
1737         int ret;
1738
1739         if ((css->flags & CSS_VISIBLE) || !cgrp->kn)
1740                 return 0;
1741
1742         if (!css->ss) {
1743                 if (cgroup_on_dfl(cgrp)) {
1744                         ret = cgroup_addrm_files(&cgrp->self, cgrp,
1745                                                  cgroup_base_files, true);
1746                         if (ret < 0)
1747                                 return ret;
1748
1749                         if (cgroup_psi_enabled()) {
1750                                 ret = cgroup_addrm_files(&cgrp->self, cgrp,
1751                                                          cgroup_psi_files, true);
1752                                 if (ret < 0)
1753                                         return ret;
1754                         }
1755                 } else {
1756                         cgroup_addrm_files(css, cgrp,
1757                                            cgroup1_base_files, true);
1758                 }
1759         } else {
1760                 list_for_each_entry(cfts, &css->ss->cfts, node) {
1761                         ret = cgroup_addrm_files(css, cgrp, cfts, true);
1762                         if (ret < 0) {
1763                                 failed_cfts = cfts;
1764                                 goto err;
1765                         }
1766                 }
1767         }
1768
1769         css->flags |= CSS_VISIBLE;
1770
1771         return 0;
1772 err:
1773         list_for_each_entry(cfts, &css->ss->cfts, node) {
1774                 if (cfts == failed_cfts)
1775                         break;
1776                 cgroup_addrm_files(css, cgrp, cfts, false);
1777         }
1778         return ret;
1779 }
1780
/**
 * rebind_subsystems - move subsystems over to another hierarchy root
 * @dst_root: root the subsystems are rebound to
 * @ss_mask: bitmask of the subsystem IDs to move
 *
 * Must be called with cgroup_mutex held.  All eligibility checks are done
 * up front, so when -EBUSY is returned nothing has been modified.  Once the
 * checks have passed the function always returns 0; a failure to apply
 * control on the destination is only reported through pr_warn().
 *
 * Return: 0 on success, -EBUSY if a subsystem in @ss_mask can't be moved.
 */
int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
{
        struct cgroup *dcgrp = &dst_root->cgrp;
        struct cgroup_subsys *ss;
        int ssid, ret;
        u16 dfl_disable_ss_mask = 0;

        lockdep_assert_held(&cgroup_mutex);

        /* pass 1: validate every requested subsystem, change nothing yet */
        do_each_subsys_mask(ss, ssid, ss_mask) {
                /*
                 * If @ss has non-root csses attached to it, can't move.
                 * If @ss is an implicit controller, it is exempt from this
                 * rule and can be stolen.
                 */
                if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) &&
                    !ss->implicit_on_dfl)
                        return -EBUSY;

                /* can't move between two non-dummy roots either */
                if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
                        return -EBUSY;

                /*
                 * Collect ssid's that need to be disabled from default
                 * hierarchy.
                 */
                if (ss->root == &cgrp_dfl_root)
                        dfl_disable_ss_mask |= 1 << ssid;

        } while_each_subsys_mask();

        if (dfl_disable_ss_mask) {
                struct cgroup *scgrp = &cgrp_dfl_root.cgrp;

                /*
                 * Controllers from default hierarchy that need to be rebound
                 * are all disabled together in one go.
                 */
                cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
                WARN_ON(cgroup_apply_control(scgrp));
                cgroup_finalize_control(scgrp, 0);
        }

        /* pass 2: actually move each subsystem from its source root */
        do_each_subsys_mask(ss, ssid, ss_mask) {
                struct cgroup_root *src_root = ss->root;
                struct cgroup *scgrp = &src_root->cgrp;
                struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
                struct css_set *cset, *cset_pos;
                struct css_task_iter *it;

                /* the source must have the css and the destination must not */
                WARN_ON(!css || cgroup_css(dcgrp, ss));

                /* subsystems on the default root were disabled above */
                if (src_root != &cgrp_dfl_root) {
                        /* disable from the source */
                        src_root->subsys_mask &= ~(1 << ssid);
                        WARN_ON(cgroup_apply_control(scgrp));
                        cgroup_finalize_control(scgrp, 0);
                }

                /* rebind */
                RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
                rcu_assign_pointer(dcgrp->subsys[ssid], css);
                ss->root = dst_root;
                css->cgroup = dcgrp;

                spin_lock_irq(&css_set_lock);
                WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
                list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
                                         e_cset_node[ss->id]) {
                        list_move_tail(&cset->e_cset_node[ss->id],
                                       &dcgrp->e_csets[ss->id]);
                        /*
                         * All css_sets of scgrp are moved to dcgrp in the
                         * same order, so in-flight iterators must be patched
                         * to keep iterating correctly.  Since an iterator is
                         * always advanced right away and finishes when
                         * it->cset_pos meets it->cset_head, updating
                         * it->cset_head alone is enough here.
                         */
                        list_for_each_entry(it, &cset->task_iters, iters_node)
                                if (it->cset_head == &scgrp->e_csets[ss->id])
                                        it->cset_head = &dcgrp->e_csets[ss->id];
                }
                spin_unlock_irq(&css_set_lock);

                /*
                 * Subsystems with an rstat flush callback are moved onto the
                 * destination's rstat_css_list, with an RCU grace period
                 * between the removal and the re-insertion.
                 */
                if (ss->css_rstat_flush) {
                        list_del_rcu(&css->rstat_css_node);
                        synchronize_rcu();
                        list_add_rcu(&css->rstat_css_node,
                                     &dcgrp->rstat_css_list);
                }

                /* default hierarchy doesn't enable controllers by default */
                dst_root->subsys_mask |= 1 << ssid;
                if (dst_root == &cgrp_dfl_root) {
                        static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
                } else {
                        dcgrp->subtree_control |= 1 << ssid;
                        static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
                }

                /* a failure here is logged but does not abort the rebind */
                ret = cgroup_apply_control(dcgrp);
                if (ret)
                        pr_warn("partial failure to rebind %s controller (err=%d)\n",
                                ss->name, ret);

                /* optional per-subsystem notification of the new binding */
                if (ss->bind)
                        ss->bind(css);
        } while_each_subsys_mask();

        kernfs_activate(dcgrp->kn);
        return 0;
}
1894
1895 int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
1896                      struct kernfs_root *kf_root)
1897 {
1898         int len = 0;
1899         char *buf = NULL;
1900         struct cgroup_root *kf_cgroot = cgroup_root_from_kf(kf_root);
1901         struct cgroup *ns_cgroup;
1902
1903         buf = kmalloc(PATH_MAX, GFP_KERNEL);
1904         if (!buf)
1905                 return -ENOMEM;
1906
1907         spin_lock_irq(&css_set_lock);
1908         ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);
1909         len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX);
1910         spin_unlock_irq(&css_set_lock);
1911
1912         if (len >= PATH_MAX)
1913                 len = -ERANGE;
1914         else if (len > 0) {
1915                 seq_escape(sf, buf, " \t\n\\");
1916                 len = 0;
1917         }
1918         kfree(buf);
1919         return len;
1920 }
1921
/*
 * Identifiers of the cgroup2 mount options.  cgroup2_fs_parameters maps the
 * option names onto these values and cgroup2_parse_param() switches on
 * them.
 */
enum cgroup2_param {
        Opt_nsdelegate,                 /* "nsdelegate" */
        Opt_favordynmods,               /* "favordynmods" */
        Opt_memory_localevents,         /* "memory_localevents" */
        Opt_memory_recursiveprot,       /* "memory_recursiveprot" */
        nr__cgroup2_params              /* number of options, keep last */
};
1929
/*
 * Mount option table for cgroup2.  Every option is a plain flag (it takes
 * no value); the table is terminated by an empty entry.  It is handed to
 * fs_parse() by cgroup2_parse_param() and referenced from cgroup2_fs_type.
 */
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
        fsparam_flag("nsdelegate",              Opt_nsdelegate),
        fsparam_flag("favordynmods",            Opt_favordynmods),
        fsparam_flag("memory_localevents",      Opt_memory_localevents),
        fsparam_flag("memory_recursiveprot",    Opt_memory_recursiveprot),
        {}
};
1937
1938 static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param)
1939 {
1940         struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
1941         struct fs_parse_result result;
1942         int opt;
1943
1944         opt = fs_parse(fc, cgroup2_fs_parameters, param, &result);
1945         if (opt < 0)
1946                 return opt;
1947
1948         switch (opt) {
1949         case Opt_nsdelegate:
1950                 ctx->flags |= CGRP_ROOT_NS_DELEGATE;
1951                 return 0;
1952         case Opt_favordynmods:
1953                 ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
1954                 return 0;
1955         case Opt_memory_localevents:
1956                 ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1957                 return 0;
1958         case Opt_memory_recursiveprot:
1959                 ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1960                 return 0;
1961         }
1962         return -EINVAL;
1963 }
1964
1965 static void apply_cgroup_root_flags(unsigned int root_flags)
1966 {
1967         if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
1968                 if (root_flags & CGRP_ROOT_NS_DELEGATE)
1969                         cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
1970                 else
1971                         cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
1972
1973                 cgroup_favor_dynmods(&cgrp_dfl_root,
1974                                      root_flags & CGRP_ROOT_FAVOR_DYNMODS);
1975
1976                 if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1977                         cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1978                 else
1979                         cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
1980
1981                 if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1982                         cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1983                 else
1984                         cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
1985         }
1986 }
1987
1988 static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
1989 {
1990         if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
1991                 seq_puts(seq, ",nsdelegate");
1992         if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
1993                 seq_puts(seq, ",favordynmods");
1994         if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
1995                 seq_puts(seq, ",memory_localevents");
1996         if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
1997                 seq_puts(seq, ",memory_recursiveprot");
1998         return 0;
1999 }
2000
2001 static int cgroup_reconfigure(struct fs_context *fc)
2002 {
2003         struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2004
2005         apply_cgroup_root_flags(ctx->flags);
2006         return 0;
2007 }
2008
2009 static void init_cgroup_housekeeping(struct cgroup *cgrp)
2010 {
2011         struct cgroup_subsys *ss;
2012         int ssid;
2013
2014         INIT_LIST_HEAD(&cgrp->self.sibling);
2015         INIT_LIST_HEAD(&cgrp->self.children);
2016         INIT_LIST_HEAD(&cgrp->cset_links);
2017         INIT_LIST_HEAD(&cgrp->pidlists);
2018         mutex_init(&cgrp->pidlist_mutex);
2019         cgrp->self.cgroup = cgrp;
2020         cgrp->self.flags |= CSS_ONLINE;
2021         cgrp->dom_cgrp = cgrp;
2022         cgrp->max_descendants = INT_MAX;
2023         cgrp->max_depth = INT_MAX;
2024         INIT_LIST_HEAD(&cgrp->rstat_css_list);
2025         prev_cputime_init(&cgrp->prev_cputime);
2026
2027         for_each_subsys(ss, ssid)
2028                 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
2029
2030         init_waitqueue_head(&cgrp->offline_waitq);
2031         INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
2032 }
2033
2034 void init_cgroup_root(struct cgroup_fs_context *ctx)
2035 {
2036         struct cgroup_root *root = ctx->root;
2037         struct cgroup *cgrp = &root->cgrp;
2038
2039         INIT_LIST_HEAD(&root->root_list);
2040         atomic_set(&root->nr_cgrps, 1);
2041         cgrp->root = root;
2042         init_cgroup_housekeeping(cgrp);
2043
2044         /* DYNMODS must be modified through cgroup_favor_dynmods() */
2045         root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
2046         if (ctx->release_agent)
2047                 strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
2048         if (ctx->name)
2049                 strscpy(root->name, ctx->name, MAX_CGROUP_ROOT_NAMELEN);
2050         if (ctx->cpuset_clone_children)
2051                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags);
2052 }
2053
/**
 * cgroup_setup_root - bring up a new hierarchy root
 * @root: the root to set up
 * @ss_mask: bitmask of the subsystems to bind to @root
 *
 * Must be called with cgroup_mutex held.  Sets up the root cgroup's
 * reference count, allocates a hierarchy ID and the kernfs root, populates
 * the root directory, initializes rstat, binds the subsystems in @ss_mask
 * and finally links the root cgroup into every existing css_set.  If a step
 * fails, the steps done before it are undone in reverse order.
 *
 * Return: 0 on success, a negative errno from the failing step otherwise.
 */
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
{
        LIST_HEAD(tmp_links);
        struct cgroup *root_cgrp = &root->cgrp;
        struct kernfs_syscall_ops *kf_sops;
        struct css_set *cset;
        int i, ret;

        lockdep_assert_held(&cgroup_mutex);

        ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release,
                              0, GFP_KERNEL);
        if (ret)
                goto out;

        /*
         * We're accessing css_set_count without locking css_set_lock here,
         * but that's OK - it can only be increased by someone holding
         * cgroup_lock, and that's us.  Later rebinding may disable
         * controllers on the default hierarchy and thus create new csets,
         * which can't be more than the existing ones.  Allocate 2x.
         */
        ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links);
        if (ret)
                goto cancel_ref;

        ret = cgroup_init_root_id(root);
        if (ret)
                goto cancel_ref;

        /* the default hierarchy and the other roots use different ops */
        kf_sops = root == &cgrp_dfl_root ?
                &cgroup_kf_syscall_ops : &cgroup1_kf_syscall_ops;

        /* created deactivated; rebind_subsystems() activates the root node */
        root->kf_root = kernfs_create_root(kf_sops,
                                           KERNFS_ROOT_CREATE_DEACTIVATED |
                                           KERNFS_ROOT_SUPPORT_EXPORTOP |
                                           KERNFS_ROOT_SUPPORT_USER_XATTR,
                                           root_cgrp);
        if (IS_ERR(root->kf_root)) {
                ret = PTR_ERR(root->kf_root);
                goto exit_root_id;
        }
        root_cgrp->kn = kernfs_root_to_node(root->kf_root);
        WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
        root_cgrp->ancestors[0] = root_cgrp;

        ret = css_populate_dir(&root_cgrp->self);
        if (ret)
                goto destroy_root;

        ret = cgroup_rstat_init(root_cgrp);
        if (ret)
                goto destroy_root;

        ret = rebind_subsystems(root, ss_mask);
        if (ret)
                goto exit_stats;

        /* a failure here only triggers a warning, setup carries on */
        ret = cgroup_bpf_inherit(root_cgrp);
        WARN_ON_ONCE(ret);

        trace_cgroup_setup_root(root);

        /*
         * There must be no failure case after here, since rebinding takes
         * care of subsystems' refcounts, which are explicitly dropped in
         * the failure exit path.
         */
        list_add(&root->root_list, &cgroup_roots);
        cgroup_root_count++;

        /*
         * Link the root cgroup in this hierarchy into all the css_set
         * objects.
         */
        spin_lock_irq(&css_set_lock);
        hash_for_each(css_set_table, i, cset, hlist) {
                link_css_set(&tmp_links, cset, root_cgrp);
                if (css_set_populated(cset))
                        cgroup_update_populated(root_cgrp, true);
        }
        spin_unlock_irq(&css_set_lock);

        BUG_ON(!list_empty(&root_cgrp->self.children));
        BUG_ON(atomic_read(&root->nr_cgrps) != 1);

        ret = 0;
        goto out;

        /* error unwinding, in reverse order of the setup steps above */
exit_stats:
        cgroup_rstat_exit(root_cgrp);
destroy_root:
        kernfs_destroy_root(root->kf_root);
        root->kf_root = NULL;
exit_root_id:
        cgroup_exit_root_id(root);
cancel_ref:
        percpu_ref_exit(&root_cgrp->self.refcnt);
out:
        /* shared with the success path: release whatever is left on tmp_links */
        free_cgrp_cset_links(&tmp_links);
        return ret;
}
2156
/*
 * Common ->get_tree() helper: obtain the superblock for @ctx->root through
 * kernfs and, for a mount made from a non-init cgroup namespace, replace
 * the root dentry with the one of the namespace's root cgroup.
 *
 * The caller is expected to hold a reference on @ctx->root->cgrp
 * (cgroup_get_tree() takes one with cgroup_get_live()).  That reference is
 * dropped here unless a new superblock was created; cgroup_kill_sb() puts
 * the root cgroup when a superblock goes away.
 *
 * Returns 0 on success or a negative errno.
 */
int cgroup_do_get_tree(struct fs_context *fc)
{
        struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
        int ret;

        ctx->kfc.root = ctx->root->kf_root;
        /* cgroup2 and the other cgroup filesystem types use distinct magics */
        if (fc->fs_type == &cgroup2_fs_type)
                ctx->kfc.magic = CGROUP2_SUPER_MAGIC;
        else
                ctx->kfc.magic = CGROUP_SUPER_MAGIC;
        ret = kernfs_get_tree(fc);

        /*
         * In non-init cgroup namespace, instead of root cgroup's dentry,
         * we return the dentry corresponding to the cgroupns->root_cgrp.
         */
        if (!ret && ctx->ns != &init_cgroup_ns) {
                struct dentry *nsdentry;
                struct super_block *sb = fc->root->d_sb;
                struct cgroup *cgrp;

                /* look up the namespace's root cgroup on this hierarchy */
                cgroup_lock();
                spin_lock_irq(&css_set_lock);

                cgrp = cset_cgroup_from_root(ctx->ns->root_cset, ctx->root);

                spin_unlock_irq(&css_set_lock);
                cgroup_unlock();

                nsdentry = kernfs_node_dentry(cgrp->kn, sb);
                /* the hierarchy root dentry is no longer needed either way */
                dput(fc->root);
                if (IS_ERR(nsdentry)) {
                        /* no usable dentry: give the superblock up as well */
                        deactivate_locked_super(sb);
                        ret = PTR_ERR(nsdentry);
                        nsdentry = NULL;
                }
                fc->root = nsdentry;
        }

        /* no new superblock was created, so drop the root cgroup reference */
        if (!ctx->kfc.new_sb_created)
                cgroup_put(&ctx->root->cgrp);

        return ret;
}
2201
2202 /*
2203  * Destroy a cgroup filesystem context.
2204  */
2205 static void cgroup_fs_context_free(struct fs_context *fc)
2206 {
2207         struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2208
2209         kfree(ctx->name);
2210         kfree(ctx->release_agent);
2211         put_cgroup_ns(ctx->ns);
2212         kernfs_free_fs_context(fc);
2213         kfree(ctx);
2214 }
2215
2216 static int cgroup_get_tree(struct fs_context *fc)
2217 {
2218         struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
2219         int ret;
2220
2221         WRITE_ONCE(cgrp_dfl_visible, true);
2222         cgroup_get_live(&cgrp_dfl_root.cgrp);
2223         ctx->root = &cgrp_dfl_root;
2224
2225         ret = cgroup_do_get_tree(fc);
2226         if (!ret)
2227                 apply_cgroup_root_flags(ctx->flags);
2228         return ret;
2229 }
2230
/*
 * fs_context operations for cgroup2 mounts; cgroup_init_fs_context()
 * installs them when the filesystem type is cgroup2_fs_type.
 */
static const struct fs_context_operations cgroup_fs_context_ops = {
        .free           = cgroup_fs_context_free,
        .parse_param    = cgroup2_parse_param,
        .get_tree       = cgroup_get_tree,
        .reconfigure    = cgroup_reconfigure,
};
2237
/*
 * fs_context operations installed by cgroup_init_fs_context() for every
 * filesystem type other than cgroup2_fs_type.  ->free is shared with
 * cgroup2; the other callbacks are the cgroup1_* implementations.
 */
static const struct fs_context_operations cgroup1_fs_context_ops = {
        .free           = cgroup_fs_context_free,
        .parse_param    = cgroup1_parse_param,
        .get_tree       = cgroup1_get_tree,
        .reconfigure    = cgroup1_reconfigure,
};
2244
2245 /*
2246  * Initialise the cgroup filesystem creation/reconfiguration context.  Notably,
2247  * we select the namespace we're going to use.
2248  */
2249 static int cgroup_init_fs_context(struct fs_context *fc)
2250 {
2251         struct cgroup_fs_context *ctx;
2252
2253         ctx = kzalloc(sizeof(struct cgroup_fs_context), GFP_KERNEL);
2254         if (!ctx)
2255                 return -ENOMEM;
2256
2257         ctx->ns = current->nsproxy->cgroup_ns;
2258         get_cgroup_ns(ctx->ns);
2259         fc->fs_private = &ctx->kfc;
2260         if (fc->fs_type == &cgroup2_fs_type)
2261                 fc->ops = &cgroup_fs_context_ops;
2262         else
2263                 fc->ops = &cgroup1_fs_context_ops;
2264         put_user_ns(fc->user_ns);
2265         fc->user_ns = get_user_ns(ctx->ns->user_ns);
2266         fc->global = true;
2267
2268 #ifdef CONFIG_CGROUP_FAVOR_DYNMODS
2269         ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
2270 #endif
2271         return 0;
2272 }
2273
2274 static void cgroup_kill_sb(struct super_block *sb)
2275 {
2276         struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
2277         struct cgroup_root *root = cgroup_root_from_kf(kf_root);
2278
2279         /*
2280          * If @root doesn't have any children, start killing it.
2281          * This prevents new mounts by disabling percpu_ref_tryget_live().
2282          *
2283          * And don't kill the default root.
2284          */
2285         if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
2286             !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
2287                 cgroup_bpf_offline(&root->cgrp);
2288                 percpu_ref_kill(&root->cgrp.self.refcnt);
2289         }
2290         cgroup_put(&root->cgrp);
2291         kernfs_kill_sb(sb);
2292 }
2293
/*
 * The "cgroup" filesystem type.  It is not static: cpuset_init_fs_context()
 * in this file redirects "cpuset" mounts to it.  Mount options are
 * described by cgroup1_fs_parameters.
 */
struct file_system_type cgroup_fs_type = {
        .name                   = "cgroup",
        .init_fs_context        = cgroup_init_fs_context,
        .parameters             = cgroup1_fs_parameters,
        .kill_sb                = cgroup_kill_sb,
        .fs_flags               = FS_USERNS_MOUNT,
};
2301
/*
 * The "cgroup2" filesystem type.  It shares the context initialization and
 * ->kill_sb with the "cgroup" type; cgroup_init_fs_context() and
 * cgroup_do_get_tree() compare against this object to tell the two apart.
 * Mount options are described by cgroup2_fs_parameters.
 */
static struct file_system_type cgroup2_fs_type = {
        .name                   = "cgroup2",
        .init_fs_context        = cgroup_init_fs_context,
        .parameters             = cgroup2_fs_parameters,
        .kill_sb                = cgroup_kill_sb,
        .fs_flags               = FS_USERNS_MOUNT,
};
2309
2310 #ifdef CONFIG_CPUSETS
/*
 * fs_context operations for the legacy "cpuset" filesystem.  Only
 * ->get_tree and ->free are provided; no ->parse_param or ->reconfigure
 * callback is set.
 */
static const struct fs_context_operations cpuset_fs_context_ops = {
        .get_tree       = cgroup1_get_tree,
        .free           = cgroup_fs_context_free,
};
2315
2316 /*
2317  * This is ugly, but preserves the userspace API for existing cpuset
2318  * users. If someone tries to mount the "cpuset" filesystem, we
2319  * silently switch it to mount "cgroup" instead
2320  */
2321 static int cpuset_init_fs_context(struct fs_context *fc)
2322 {
2323         char *agent = kstrdup("/sbin/cpuset_release_agent", GFP_USER);
2324         struct cgroup_fs_context *ctx;
2325         int err;
2326
2327         err = cgroup_init_fs_context(fc);
2328         if (err) {
2329                 kfree(agent);
2330                 return err;
2331         }
2332
2333         fc->ops = &cpuset_fs_context_ops;
2334
2335         ctx = cgroup_fc2context(fc);
2336         ctx->subsys_mask = 1 << cpuset_cgrp_id;
2337         ctx->flags |= CGRP_ROOT_NOPREFIX;
2338         ctx->release_agent = agent;
2339
2340         get_filesystem(&cgroup_fs_type);
2341         put_filesystem(fc->fs_type);
2342         fc->fs_type = &cgroup_fs_type;
2343
2344         return 0;
2345 }
2346
/*
 * The legacy "cpuset" filesystem type.  It has no ->kill_sb or parameter
 * table of its own because cpuset_init_fs_context() switches the context
 * over to cgroup_fs_type.
 */
static struct file_system_type cpuset_fs_type = {
        .name                   = "cpuset",
        .init_fs_context        = cpuset_init_fs_context,
        .fs_flags               = FS_USERNS_MOUNT,
};
2352 #endif
2353
2354 int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
2355                           struct cgroup_namespace *ns)
2356 {
2357         struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root);
2358
2359         return kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen);
2360 }
2361
2362 int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
2363                    struct cgroup_namespace *ns)
2364 {
2365         int ret;
2366
2367         cgroup_lock();
2368         spin_lock_irq(&css_set_lock);
2369
2370         ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
2371
2372         spin_unlock_irq(&css_set_lock);
2373         cgroup_unlock();
2374
2375         return ret;
2376 }
2377 EXPORT_SYMBOL_GPL(cgroup_path_ns);
2378
2379 /**
2380  * cgroup_attach_lock - Lock for ->attach()
2381  * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
2382  *
2383  * cgroup migration sometimes needs to stabilize threadgroups against forks and
2384  * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
2385  * implementations (e.g. cpuset), also need to disable CPU hotplug.
2386  * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
2387  * lead to deadlocks.
2388  *
2389  * Bringing up a CPU may involve creating and destroying tasks which requires
2390  * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
2391  * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
2392  * write-locking threadgroup_rwsem, the locking order is reversed and we end up
2393  * waiting for an on-going CPU hotplug operation which in turn is waiting for
2394  * the threadgroup_rwsem to be released to create new tasks. For more details:
2395  *
2396  *   http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
2397  *
2398  * Resolve the situation by always acquiring cpus_read_lock() before optionally
2399  * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
2400  * CPU hotplug is disabled on entry.
2401  */
2402 void cgroup_attach_lock(bool lock_threadgroup)
2403 {
2404         cpus_read_lock();
2405         if (lock_threadgroup)
2406                 percpu_down_write(&cgroup_threadgroup_rwsem);
2407 }
2408
/**
 * cgroup_attach_unlock - Undo cgroup_attach_lock()
 * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
 *
 * Releases the locks in the reverse order of cgroup_attach_lock(): first
 * cgroup_threadgroup_rwsem, if @lock_threadgroup is set, then the CPU
 * hotplug read lock.  @lock_threadgroup should match the value that was
 * passed to the corresponding cgroup_attach_lock() call.
 */
void cgroup_attach_unlock(bool lock_threadgroup)
{
        if (lock_threadgroup)
                percpu_up_write(&cgroup_threadgroup_rwsem);
        cpus_read_unlock();
}
2419
2420 /**
2421  * cgroup_migrate_add_task - add a migration target task to a migration context
2422  * @task: target task
2423  * @mgctx: target migration context
2424  *
2425  * Add @task, which is a migration target, to @mgctx->tset.  This function
2426  * becomes noop if @task doesn't need to be migrated.  @task's css_set
2427  * should have been added as a migration source and @task->cg_list will be
2428  * moved from the css_set's tasks list to mg_tasks one.
2429  */
2430 static void cgroup_migrate_add_task(struct task_struct *task,
2431                                     struct cgroup_mgctx *mgctx)
2432 {
2433         struct css_set *cset;
2434
2435         lockdep_assert_held(&css_set_lock);
2436
2437         /* @task either already exited or can't exit until the end */
2438         if (task->flags & PF_EXITING)
2439                 return;
2440
2441         /* cgroup_threadgroup_rwsem protects racing against forks */
2442         WARN_ON_ONCE(list_empty(&task->cg_list));
2443
2444         cset = task_css_set(task);
2445         if (!cset->mg_src_cgrp)
2446                 return;
2447
2448         mgctx->tset.nr_tasks++;
2449
2450         list_move_tail(&task->cg_list, &cset->mg_tasks);
2451         if (list_empty(&cset->mg_node))
2452                 list_add_tail(&cset->mg_node,
2453                               &mgctx->tset.src_csets);
2454         if (list_empty(&cset->mg_dst_cset->mg_node))
2455                 list_add_tail(&cset->mg_dst_cset->mg_node,
2456                               &mgctx->tset.dst_csets);
2457 }
2458
2459 /**
2460  * cgroup_taskset_first - reset taskset and return the first task
2461  * @tset: taskset of interest
2462  * @dst_cssp: output variable for the destination css
2463  *
2464  * @tset iteration is initialized and the first task is returned.
2465  */
2466 struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
2467                                          struct cgroup_subsys_state **dst_cssp)
2468 {
2469         tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node);
2470         tset->cur_task = NULL;
2471
2472         return cgroup_taskset_next(tset, dst_cssp);
2473 }
2474
2475 /**
2476  * cgroup_taskset_next - iterate to the next task in taskset
2477  * @tset: taskset of interest
2478  * @dst_cssp: output variable for the destination css
2479  *
2480  * Return the next task in @tset.  Iteration must have been initialized
2481  * with cgroup_taskset_first().
2482  */
2483 struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
2484                                         struct cgroup_subsys_state **dst_cssp)
2485 {
2486         struct css_set *cset = tset->cur_cset;
2487         struct task_struct *task = tset->cur_task;
2488
2489         while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {
2490                 if (!task)
2491                         task = list_first_entry(&cset->mg_tasks,
2492                                                 struct task_struct, cg_list);
2493                 else
2494                         task = list_next_entry(task, cg_list);
2495
2496                 if (&task->cg_list != &cset->mg_tasks) {
2497                         tset->cur_cset = cset;
2498                         tset->cur_task = task;
2499
2500                         /*
2501                          * This function may be called both before and
2502                          * after cgroup_taskset_migrate().  The two cases
2503                          * can be distinguished by looking at whether @cset
2504                          * has its ->mg_dst_cset set.
2505                          */
2506                         if (cset->mg_dst_cset)
2507                                 *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid];
2508                         else
2509                                 *dst_cssp = cset->subsys[tset->ssid];
2510
2511                         return task;
2512                 }
2513
2514                 cset = list_next_entry(cset, mg_node);
2515                 task = NULL;
2516         }
2517
2518         return NULL;
2519 }
2520
/**
 * cgroup_migrate_execute - migrate a taskset
 * @mgctx: migration context
 *
 * Migrate tasks in @mgctx as setup by migration preparation functions.
 * This function fails iff one of the ->can_attach callbacks fails and
 * guarantees that either all or none of the tasks in @mgctx are migrated.
 * @mgctx is consumed regardless of success.
 *
 * The sequence is: ask every subsystem in @mgctx->ss_mask via
 * ->can_attach(), move the tasks under css_set_lock, then notify the
 * subsystems via ->attach().  The subsystem callbacks are skipped
 * altogether when the taskset holds no tasks.
 *
 * Returns 0 on success or the non-zero value returned by the failing
 * ->can_attach().
 */
static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
{
        struct cgroup_taskset *tset = &mgctx->tset;
        struct cgroup_subsys *ss;
        struct task_struct *task, *tmp_task;
        struct css_set *cset, *tmp_cset;
        int ssid, failed_ssid, ret;

        /* check that we can legitimately attach to the cgroup */
        if (tset->nr_tasks) {
                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                        if (ss->can_attach) {
                                tset->ssid = ssid;
                                ret = ss->can_attach(tset);
                                if (ret) {
                                        /* remember where to stop the rollback */
                                        failed_ssid = ssid;
                                        goto out_cancel_attach;
                                }
                        }
                } while_each_subsys_mask();
        }

        /*
         * Now that we're guaranteed success, proceed to move all tasks to
         * the new cgroup.  There are no failure cases after here, so this
         * is the commit point.
         */
        spin_lock_irq(&css_set_lock);
        list_for_each_entry(cset, &tset->src_csets, mg_node) {
                list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
                        struct css_set *from_cset = task_css_set(task);
                        struct css_set *to_cset = cset->mg_dst_cset;

                        /* the task's reference moves from from_cset to to_cset */
                        get_css_set(to_cset);
                        to_cset->nr_tasks++;
                        css_set_move_task(task, from_cset, to_cset, true);
                        from_cset->nr_tasks--;
                        /*
                         * If the source or destination cgroup is frozen,
                         * the task might require to change its state.
                         */
                        cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp,
                                                    to_cset->dfl_cgrp);
                        put_css_set_locked(from_cset);

                }
        }
        spin_unlock_irq(&css_set_lock);

        /*
         * Migration is committed, all target tasks are now on dst_csets.
         * Nothing is sensitive to fork() after this point.  Notify
         * controllers that migration is complete.
         */
        tset->csets = &tset->dst_csets;

        if (tset->nr_tasks) {
                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                        if (ss->attach) {
                                tset->ssid = ssid;
                                ss->attach(tset);
                        }
                } while_each_subsys_mask();
        }

        ret = 0;
        goto out_release_tset;

out_cancel_attach:
        /*
         * Roll back: call ->cancel_attach() for the subsystems that were
         * visited before the one whose ->can_attach() failed.  The failing
         * subsystem itself and the ones after it are not called.
         */
        if (tset->nr_tasks) {
                do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
                        if (ssid == failed_ssid)
                                break;
                        if (ss->cancel_attach) {
                                tset->ssid = ssid;
                                ss->cancel_attach(tset);
                        }
                } while_each_subsys_mask();
        }
out_release_tset:
        /*
         * Common teardown for both outcomes: gather src and dst csets on
         * one list, return whatever is on each ->mg_tasks to ->tasks and
         * unlink every cset from the taskset.
         */
        spin_lock_irq(&css_set_lock);
        list_splice_init(&tset->dst_csets, &tset->src_csets);
        list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
                list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
                list_del_init(&cset->mg_node);
        }
        spin_unlock_irq(&css_set_lock);

        /*
         * Re-initialize the cgroup_taskset structure in case it is reused
         * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
         * iteration.
         */
        tset->nr_tasks = 0;
        tset->csets    = &tset->src_csets;
        return ret;
}
2627
2628 /**
2629  * cgroup_migrate_vet_dst - verify whether a cgroup can be migration destination
2630  * @dst_cgrp: destination cgroup to test
2631  *
2632  * On the default hierarchy, except for the mixable, (possible) thread root
2633  * and threaded cgroups, subtree_control must be zero for migration
2634  * destination cgroups with tasks so that child cgroups don't compete
2635  * against tasks.
2636  */
2637 int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
2638 {
2639         /* v1 doesn't have any restriction */
2640         if (!cgroup_on_dfl(dst_cgrp))
2641                 return 0;
2642
2643         /* verify @dst_cgrp can host resources */
2644         if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
2645                 return -EOPNOTSUPP;
2646
2647         /*
2648          * If @dst_cgrp is already or can become a thread root or is
2649          * threaded, it doesn't matter.
2650          */
2651         if (cgroup_can_be_thread_root(dst_cgrp) || cgroup_is_threaded(dst_cgrp))
2652                 return 0;
2653
2654         /* apply no-internal-process constraint */
2655         if (dst_cgrp->subtree_control)
2656                 return -EBUSY;
2657
2658         return 0;
2659 }
2660
2661 /**
2662  * cgroup_migrate_finish - cleanup after attach
2663  * @mgctx: migration context
2664  *
2665  * Undo cgroup_migrate_add_src() and cgroup_migrate_prepare_dst().  See
2666  * those functions for details.
2667  */
2668 void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
2669 {
2670         struct css_set *cset, *tmp_cset;
2671
2672         lockdep_assert_held(&cgroup_mutex);
2673
2674         spin_lock_irq(&css_set_lock);
2675
2676         list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
2677                                  mg_src_preload_node) {
2678                 cset->mg_src_cgrp = NULL;
2679                 cset->mg_dst_cgrp = NULL;
2680                 cset->mg_dst_cset = NULL;
2681                 list_del_init(&cset->mg_src_preload_node);
2682                 put_css_set_locked(cset);
2683         }
2684
2685         list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
2686                                  mg_dst_preload_node) {
2687                 cset->mg_src_cgrp = NULL;
2688                 cset->mg_dst_cgrp = NULL;
2689                 cset->mg_dst_cset = NULL;
2690                 list_del_init(&cset->mg_dst_preload_node);
2691                 put_css_set_locked(cset);
2692         }
2693
2694         spin_unlock_irq(&css_set_lock);
2695 }
2696
2697 /**
2698  * cgroup_migrate_add_src - add a migration source css_set
2699  * @src_cset: the source css_set to add
2700  * @dst_cgrp: the destination cgroup
2701  * @mgctx: migration context
2702  *
2703  * Tasks belonging to @src_cset are about to be migrated to @dst_cgrp.  Pin
2704  * @src_cset and add it to @mgctx->src_csets, which should later be cleaned
2705  * up by cgroup_migrate_finish().
2706  *
2707  * This function may be called without holding cgroup_threadgroup_rwsem
2708  * even if the target is a process.  Threads may be created and destroyed
2709  * but as long as cgroup_mutex is not dropped, no new css_set can be put
2710  * into play and the preloaded css_sets are guaranteed to cover all
2711  * migrations.
2712  */
2713 void cgroup_migrate_add_src(struct css_set *src_cset,
2714                             struct cgroup *dst_cgrp,
2715                             struct cgroup_mgctx *mgctx)
2716 {
2717         struct cgroup *src_cgrp;
2718
2719         lockdep_assert_held(&cgroup_mutex);
2720         lockdep_assert_held(&css_set_lock);
2721
2722         /*
2723          * If ->dead, @src_set is associated with one or more dead cgroups
2724          * and doesn't contain any migratable tasks.  Ignore it early so
2725          * that the rest of migration path doesn't get confused by it.
2726          */
2727         if (src_cset->dead)
2728                 return;
2729
2730         if (!list_empty(&src_cset->mg_src_preload_node))
2731                 return;
2732
2733         src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
2734
2735         WARN_ON(src_cset->mg_src_cgrp);
2736         WARN_ON(src_cset->mg_dst_cgrp);
2737         WARN_ON(!list_empty(&src_cset->mg_tasks));
2738         WARN_ON(!list_empty(&src_cset->mg_node));
2739
2740         src_cset->mg_src_cgrp = src_cgrp;
2741         src_cset->mg_dst_cgrp = dst_cgrp;
2742         get_css_set(src_cset);
2743         list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
2744 }
2745
2746 /**
2747  * cgroup_migrate_prepare_dst - prepare destination css_sets for migration
2748  * @mgctx: migration context
2749  *
2750  * Tasks are about to be moved and all the source css_sets have been
2751  * preloaded to @mgctx->preloaded_src_csets.  This function looks up and
2752  * pins all destination css_sets, links each to its source, and append them
2753  * to @mgctx->preloaded_dst_csets.
2754  *
2755  * This function must be called after cgroup_migrate_add_src() has been
2756  * called on each migration source css_set.  After migration is performed
2757  * using cgroup_migrate(), cgroup_migrate_finish() must be called on
2758  * @mgctx.
2759  */
2760 int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
2761 {
2762         struct css_set *src_cset, *tmp_cset;
2763
2764         lockdep_assert_held(&cgroup_mutex);
2765
2766         /* look up the dst cset for each src cset and link it to src */
2767         list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
2768                                  mg_src_preload_node) {
2769                 struct css_set *dst_cset;
2770                 struct cgroup_subsys *ss;
2771                 int ssid;
2772
2773                 dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
2774                 if (!dst_cset)
2775                         return -ENOMEM;
2776
2777                 WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
2778
2779                 /*
2780                  * If src cset equals dst, it's noop.  Drop the src.
2781                  * cgroup_migrate() will skip the cset too.  Note that we
2782                  * can't handle src == dst as some nodes are used by both.
2783                  */
2784                 if (src_cset == dst_cset) {
2785                         src_cset->mg_src_cgrp = NULL;
2786                         src_cset->mg_dst_cgrp = NULL;
2787                         list_del_init(&src_cset->mg_src_preload_node);
2788                         put_css_set(src_cset);
2789                         put_css_set(dst_cset);
2790                         continue;
2791                 }
2792
2793                 src_cset->mg_dst_cset = dst_cset;
2794
2795                 if (list_empty(&dst_cset->mg_dst_preload_node))
2796                         list_add_tail(&dst_cset->mg_dst_preload_node,
2797                                       &mgctx->preloaded_dst_csets);
2798                 else
2799                         put_css_set(dst_cset);
2800
2801                 for_each_subsys(ss, ssid)
2802                         if (src_cset->subsys[ssid] != dst_cset->subsys[ssid])
2803                                 mgctx->ss_mask |= 1 << ssid;
2804         }
2805
2806         return 0;
2807 }
2808
/**
 * cgroup_migrate - migrate a process or task to a cgroup
 * @leader: the leader of the process or the task to migrate
 * @threadgroup: whether @leader points to the whole process or a single task
 * @mgctx: migration context
 *
 * Migrate a process or task denoted by @leader.  If migrating a process,
 * the caller must be holding cgroup_threadgroup_rwsem.  The caller is also
 * responsible for invoking cgroup_migrate_add_src() and
 * cgroup_migrate_prepare_dst() on the targets before invoking this
 * function and following up with cgroup_migrate_finish().
 *
 * As long as a controller's ->can_attach() doesn't fail, this function is
 * guaranteed to succeed.  This means that, excluding ->can_attach()
 * failure, when migrating multiple targets, the success or failure can be
 * decided for all targets by invoking cgroup_migrate_prepare_dst() before
 * actually starting migrating.
 *
 * Returns the result of cgroup_migrate_execute().
 */
int cgroup_migrate(struct task_struct *leader, bool threadgroup,
                   struct cgroup_mgctx *mgctx)
{
        struct task_struct *task;

        /*
         * The following thread iteration should be inside an RCU critical
         * section to prevent tasks from being freed while taking the snapshot.
         * spin_lock_irq() implies RCU critical section here.
         */
        spin_lock_irq(&css_set_lock);
        task = leader;
        do {
                cgroup_migrate_add_task(task, mgctx);
                /* single-task migration: only @leader itself is queued */
                if (!threadgroup)
                        break;
        } while_each_thread(leader, task);
        spin_unlock_irq(&css_set_lock);

        return cgroup_migrate_execute(mgctx);
}
2848
2849 /**
2850  * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup
2851  * @dst_cgrp: the cgroup to attach to
2852  * @leader: the task or the leader of the threadgroup to be attached
2853  * @threadgroup: attach the whole threadgroup?
2854  *
2855  * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
2856  */
2857 int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
2858                        bool threadgroup)
2859 {
2860         DEFINE_CGROUP_MGCTX(mgctx);
2861         struct task_struct *task;
2862         int ret = 0;
2863
2864         /* look up all src csets */
2865         spin_lock_irq(&css_set_lock);
2866         rcu_read_lock();
2867         task = leader;
2868         do {
2869                 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx);
2870                 if (!threadgroup)
2871                         break;
2872         } while_each_thread(leader, task);
2873         rcu_read_unlock();
2874         spin_unlock_irq(&css_set_lock);
2875
2876         /* prepare dst csets and commit */
2877         ret = cgroup_migrate_prepare_dst(&mgctx);
2878         if (!ret)
2879                 ret = cgroup_migrate(leader, threadgroup, &mgctx);
2880
2881         cgroup_migrate_finish(&mgctx);
2882
2883         if (!ret)
2884                 TRACE_CGROUP_PATH(attach_task, dst_cgrp, leader, threadgroup);
2885
2886         return ret;
2887 }
2888
/**
 * cgroup_procs_write_start - resolve and pin the target of a procs/threads write
 * @buf: user supplied string holding the PID; surrounding whitespace is
 *       stripped before parsing
 * @threadgroup: whether the whole thread group is targeted, in which case
 *               the group leader is returned
 * @threadgroup_locked: output; set to the value passed to
 *                      cgroup_attach_lock(), or to false if a failure
 *                      happens after the lock was taken
 *
 * Parses @buf as a PID (0 selects %current), takes the attach lock and
 * looks up the task.  On success a reference on the returned task is held
 * and the attach lock is still held; both are released by
 * cgroup_procs_write_finish().
 *
 * Must be called with cgroup_mutex held.
 *
 * Returns the target task, or an ERR_PTR(): -EINVAL if @buf does not parse
 * to a non-negative integer (in that case no lock is taken and
 * *@threadgroup_locked is left untouched) or if the task may not be
 * migrated, and -ESRCH if no task with that PID is found.
 */
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
                                             bool *threadgroup_locked)
{
        struct task_struct *tsk;
        pid_t pid;

        if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
                return ERR_PTR(-EINVAL);

        /*
         * If we migrate a single thread, we don't care about threadgroup
         * stability. If the thread is `current`, it won't exit(2) under our
         * hands or change PID through exec(2). We exclude
         * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
         * callers by cgroup_mutex.
         * Therefore, we can skip the global lock.
         */
        lockdep_assert_held(&cgroup_mutex);
        *threadgroup_locked = pid || threadgroup;
        cgroup_attach_lock(*threadgroup_locked);

        /* the task lookup below is done inside an RCU read-side section */
        rcu_read_lock();
        if (pid) {
                tsk = find_task_by_vpid(pid);
                if (!tsk) {
                        tsk = ERR_PTR(-ESRCH);
                        goto out_unlock_threadgroup;
                }
        } else {
                tsk = current;
        }

        if (threadgroup)
                tsk = tsk->group_leader;

        /*
         * kthreads may acquire PF_NO_SETAFFINITY during initialization.
         * If userland migrates such a kthread to a non-root cgroup, it can
         * become trapped in a cpuset, or RT kthread may be born in a
         * cgroup with no rt_runtime allocated.  Just say no.
         */
        if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
                tsk = ERR_PTR(-EINVAL);
                goto out_unlock_threadgroup;
        }

        /* success: pin the task and return with the attach lock held */
        get_task_struct(tsk);
        goto out_unlock_rcu;

out_unlock_threadgroup:
        /* failure: undo cgroup_attach_lock() and tell the caller so */
        cgroup_attach_unlock(*threadgroup_locked);
        *threadgroup_locked = false;
out_unlock_rcu:
        rcu_read_unlock();
        return tsk;
}
2945
2946 void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
2947 {
2948         struct cgroup_subsys *ss;
2949         int ssid;
2950
2951         /* release reference from cgroup_procs_write_start() */
2952         put_task_struct(task);
2953
2954         cgroup_attach_unlock(threadgroup_locked);
2955
2956         for_each_subsys(ss, ssid)
2957                 if (ss->post_attach)
2958                         ss->post_attach();
2959 }
2960
2961 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask)
2962 {
2963         struct cgroup_subsys *ss;
2964         bool printed = false;
2965         int ssid;
2966
2967         do_each_subsys_mask(ss, ssid, ss_mask) {
2968                 if (printed)
2969                         seq_putc(seq, ' ');
2970                 seq_puts(seq, ss->name);
2971                 printed = true;
2972         } while_each_subsys_mask();
2973         if (printed)
2974                 seq_putc(seq, '\n');
2975 }
2976
2977 /* show controllers which are enabled from the parent */
2978 static int cgroup_controllers_show(struct seq_file *seq, void *v)
2979 {
2980         struct cgroup *cgrp = seq_css(seq)->cgroup;
2981
2982         cgroup_print_ss_mask(seq, cgroup_control(cgrp));
2983         return 0;
2984 }
2985
2986 /* show controllers which are enabled for a given cgroup's children */
2987 static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
2988 {
2989         struct cgroup *cgrp = seq_css(seq)->cgroup;
2990
2991         cgroup_print_ss_mask(seq, cgrp->subtree_control);
2992         return 0;
2993 }
2994
/**
 * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy
 * @cgrp: root of the subtree to update csses for
 *
 * @cgrp's control masks have changed and its subtree's css associations
 * need to be updated accordingly.  This function looks up all css_sets
 * which are attached to the subtree, creates the matching updated css_sets
 * and migrates the tasks to the new ones.
 *
 * Must be called with cgroup_mutex held.
 *
 * Returns 0 on success or the error from cgroup_migrate_prepare_dst() or
 * cgroup_migrate_execute().
 */
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
        DEFINE_CGROUP_MGCTX(mgctx);
        struct cgroup_subsys_state *d_css;
        struct cgroup *dsct;
        struct css_set *src_cset;
        bool has_tasks;
        int ret;

        lockdep_assert_held(&cgroup_mutex);

        /* look up all csses currently attached to @cgrp's subtree */
        spin_lock_irq(&css_set_lock);
        cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
                struct cgrp_cset_link *link;

                /*
                 * As cgroup_update_dfl_csses() is only called by
                 * cgroup_apply_control(). The csses associated with the
                 * given cgrp will not be affected by changes made to
                 * its subtree_control file. We can skip them.
                 */
                if (dsct == cgrp)
                        continue;

                /* each descendant is both the source and the destination */
                list_for_each_entry(link, &dsct->cset_links, cset_link)
                        cgroup_migrate_add_src(link->cset, dsct, &mgctx);
        }
        spin_unlock_irq(&css_set_lock);

        /*
         * We need to write-lock threadgroup_rwsem while migrating tasks.
         * However, if there are no source csets for @cgrp, changing its
         * controllers isn't gonna produce any task migrations and the
         * write-locking can be skipped safely.
         */
        has_tasks = !list_empty(&mgctx.preloaded_src_csets);
        cgroup_attach_lock(has_tasks);

        /* look up and pin the destination css_set of every preloaded source */
        ret = cgroup_migrate_prepare_dst(&mgctx);
        if (ret)
                goto out_finish;

        spin_lock_irq(&css_set_lock);
        list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
                            mg_src_preload_node) {
                struct task_struct *task, *ntask;

                /* all tasks in src_csets need to be migrated */
                list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
                        cgroup_migrate_add_task(task, &mgctx);
        }
        spin_unlock_irq(&css_set_lock);

        ret = cgroup_migrate_execute(&mgctx);
out_finish:
        /* reached on both success and failure; release pins, then the locks */
        cgroup_migrate_finish(&mgctx);
        cgroup_attach_unlock(has_tasks);
        return ret;
}
3065
/**
 * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses
 * @cgrp: root of the target subtree
 *
 * Because css offlining is asynchronous, userland may try to re-enable a
 * controller while the previous css is still around.  This function grabs
 * cgroup_mutex and drains the previous css instances of @cgrp's subtree.
 *
 * Whenever a css with a dying refcnt is found, cgroup_mutex is dropped,
 * the caller sleeps on the owning cgroup's ->offline_waitq and the whole
 * scan is restarted from scratch.  The function returns, with cgroup_mutex
 * held, only after a complete pass finds no dying css.
 */
void cgroup_lock_and_drain_offline(struct cgroup *cgrp)
        __acquires(&cgroup_mutex)
{
        struct cgroup *dsct;
        struct cgroup_subsys_state *d_css;
        struct cgroup_subsys *ss;
        int ssid;

restart:
        cgroup_lock();

        cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
                for_each_subsys(ss, ssid) {
                        struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
                        DEFINE_WAIT(wait);

                        /* nothing to wait for unless the css is on its way out */
                        if (!css || !percpu_ref_is_dying(&css->refcnt))
                                continue;

                        /*
                         * Take a reference on @dsct so that it can still
                         * be used after cgroup_mutex is dropped, and queue
                         * on the waitqueue before unlocking so that a
                         * wakeup issued in between isn't missed.
                         */
                        cgroup_get_live(dsct);
                        prepare_to_wait(&dsct->offline_waitq, &wait,
                                        TASK_UNINTERRUPTIBLE);

                        cgroup_unlock();
                        schedule();
                        finish_wait(&dsct->offline_waitq, &wait);

                        cgroup_put(dsct);
                        /* the subtree may have changed while unlocked; rescan */
                        goto restart;
                }
        }
}
3106
3107 /**
3108  * cgroup_save_control - save control masks and dom_cgrp of a subtree
3109  * @cgrp: root of the target subtree
3110  *
3111  * Save ->subtree_control, ->subtree_ss_mask and ->dom_cgrp to the
3112  * respective old_ prefixed fields for @cgrp's subtree including @cgrp
3113  * itself.
3114  */
3115 static void cgroup_save_control(struct cgroup *cgrp)
3116 {
3117         struct cgroup *dsct;
3118         struct cgroup_subsys_state *d_css;
3119
3120         cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3121                 dsct->old_subtree_control = dsct->subtree_control;
3122                 dsct->old_subtree_ss_mask = dsct->subtree_ss_mask;
3123                 dsct->old_dom_cgrp = dsct->dom_cgrp;
3124         }
3125 }
3126
3127 /**
3128  * cgroup_propagate_control - refresh control masks of a subtree
3129  * @cgrp: root of the target subtree
3130  *
3131  * For @cgrp and its subtree, ensure ->subtree_ss_mask matches
3132  * ->subtree_control and propagate controller availability through the
3133  * subtree so that descendants don't have unavailable controllers enabled.
3134  */
3135 static void cgroup_propagate_control(struct cgroup *cgrp)
3136 {
3137         struct cgroup *dsct;
3138         struct cgroup_subsys_state *d_css;
3139
3140         cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3141                 dsct->subtree_control &= cgroup_control(dsct);
3142                 dsct->subtree_ss_mask =
3143                         cgroup_calc_subtree_ss_mask(dsct->subtree_control,
3144                                                     cgroup_ss_mask(dsct));
3145         }
3146 }
3147
3148 /**
3149  * cgroup_restore_control - restore control masks and dom_cgrp of a subtree
3150  * @cgrp: root of the target subtree
3151  *
3152  * Restore ->subtree_control, ->subtree_ss_mask and ->dom_cgrp from the
3153  * respective old_ prefixed fields for @cgrp's subtree including @cgrp
3154  * itself.
3155  */
3156 static void cgroup_restore_control(struct cgroup *cgrp)
3157 {
3158         struct cgroup *dsct;
3159         struct cgroup_subsys_state *d_css;
3160
3161         cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3162                 dsct->subtree_control = dsct->old_subtree_control;
3163                 dsct->subtree_ss_mask = dsct->old_subtree_ss_mask;
3164                 dsct->dom_cgrp = dsct->old_dom_cgrp;
3165         }
3166 }
3167
3168 static bool css_visible(struct cgroup_subsys_state *css)
3169 {
3170         struct cgroup_subsys *ss = css->ss;
3171         struct cgroup *cgrp = css->cgroup;
3172
3173         if (cgroup_control(cgrp) & (1 << ss->id))
3174                 return true;
3175         if (!(cgroup_ss_mask(cgrp) & (1 << ss->id)))
3176                 return false;
3177         return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl;
3178 }
3179
3180 /**
3181  * cgroup_apply_control_enable - enable or show csses according to control
3182  * @cgrp: root of the target subtree
3183  *
3184  * Walk @cgrp's subtree and create new csses or make the existing ones
3185  * visible.  A css is created invisible if it's being implicitly enabled
3186  * through dependency.  An invisible css is made visible when the userland
3187  * explicitly enables it.
3188  *
3189  * Returns 0 on success, -errno on failure.  On failure, csses which have
3190  * been processed already aren't cleaned up.  The caller is responsible for
3191  * cleaning up with cgroup_apply_control_disable().
3192  */
3193 static int cgroup_apply_control_enable(struct cgroup *cgrp)
3194 {
3195         struct cgroup *dsct;
3196         struct cgroup_subsys_state *d_css;
3197         struct cgroup_subsys *ss;
3198         int ssid, ret;
3199
3200         cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
3201                 for_each_subsys(ss, ssid) {
3202                         struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3203
3204                         if (!(cgroup_ss_mask(dsct) & (1 << ss->id)))
3205                                 continue;
3206
3207                         if (!css) {
3208                                 css = css_create(dsct, ss);
3209                                 if (IS_ERR(css))
3210                                         return PTR_ERR(css);
3211                         }
3212
3213                         WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
3214
3215                         if (css_visible(css)) {
3216                                 ret = css_populate_dir(css);
3217                                 if (ret)
3218                                         return ret;
3219                         }
3220                 }
3221         }
3222
3223         return 0;
3224 }
3225
3226 /**
3227  * cgroup_apply_control_disable - kill or hide csses according to control
3228  * @cgrp: root of the target subtree
3229  *
3230  * Walk @cgrp's subtree and kill and hide csses so that they match
3231  * cgroup_ss_mask() and cgroup_visible_mask().
3232  *
3233  * A css is hidden when the userland requests it to be disabled while other
3234  * subsystems are still depending on it.  The css must not actively control
3235  * resources and be in the vanilla state if it's made visible again later.
3236  * Controllers which may be depended upon should provide ->css_reset() for
3237  * this purpose.
3238  */
3239 static void cgroup_apply_control_disable(struct cgroup *cgrp)
3240 {
3241         struct cgroup *dsct;
3242         struct cgroup_subsys_state *d_css;
3243         struct cgroup_subsys *ss;
3244         int ssid;
3245
3246         cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) {
3247                 for_each_subsys(ss, ssid) {
3248                         struct cgroup_subsys_state *css = cgroup_css(dsct, ss);
3249
3250                         if (!css)
3251                                 continue;
3252
3253                         WARN_ON_ONCE(percpu_ref_is_dying(&css->refcnt));
3254
3255                         if (css->parent &&
3256                             !(cgroup_ss_mask(dsct) & (1 << ss->id))) {
3257                                 kill_css(css);
3258                         } else if (!css_visible(css)) {
3259                                 css_clear_dir(css);
3260                                 if (ss->css_reset)
3261                                         ss->css_reset(css);
3262                         }
3263                 }
3264         }
3265 }
3266
/**
 * cgroup_apply_control - apply control mask updates to the subtree
 * @cgrp: root of the target subtree
 *
 * Enabling and disabling subsystems in a subtree is done in these steps:
 *
 * 1. Call cgroup_save_control() to stash the current state.
 * 2. Update ->subtree_control masks in the subtree as desired.
 * 3. Call cgroup_apply_control() to apply the changes.
 * 4. Optionally perform other related operations.
 * 5. Call cgroup_finalize_control() to finish up.
 *
 * This function is step 3: it propagates the mask changes throughout
 * @cgrp's subtree, updates csses accordingly and performs process
 * migrations.
 *
 * Return: 0 on success, or the error reported by
 * cgroup_apply_control_enable() or cgroup_update_dfl_csses().
 */
static int cgroup_apply_control(struct cgroup *cgrp)
{
	int err;

	cgroup_propagate_control(cgrp);

	err = cgroup_apply_control_enable(cgrp);

	/*
	 * Once enabling succeeded, cgroup_e_css_by_mask() results reflect
	 * the new csses, so cgroup_update_dfl_csses() can properly update
	 * the css associations of all tasks in the subtree.
	 */
	if (!err)
		err = cgroup_update_dfl_csses(cgrp);

	return err;
}
3301
/**
 * cgroup_finalize_control - finalize control mask update
 * @cgrp: root of the target subtree
 * @ret: the result of the update
 *
 * Last step of a control mask update; see cgroup_apply_control() for the
 * whole sequence.  When @ret reports a failure the saved masks are
 * restored and propagated again first.  The disable pass runs in either
 * case.
 */
static void cgroup_finalize_control(struct cgroup *cgrp, int ret)
{
	/* the update failed: roll the masks back to the saved state */
	if (ret != 0) {
		cgroup_restore_control(cgrp);
		cgroup_propagate_control(cgrp);
	}

	/* kill or hide whatever no longer matches the (possibly restored) masks */
	cgroup_apply_control_disable(cgrp);
}
3318
3319 static int cgroup_vet_subtree_control_enable(struct cgroup *cgrp, u16 enable)
3320 {
3321         u16 domain_enable = enable & ~cgrp_dfl_threaded_ss_mask;
3322
3323         /* if nothing is getting enabled, nothing to worry about */
3324         if (!enable)
3325                 return 0;
3326
3327         /* can @cgrp host any resources? */
3328         if (!cgroup_is_valid_domain(cgrp->dom_cgrp))
3329                 return -EOPNOTSUPP;
3330
3331         /* mixables don't care */
3332         if (cgroup_is_mixable(cgrp))
3333                 return 0;
3334
3335         if (domain_enable) {
3336                 /* can't enable domain controllers inside a thread subtree */
3337                 if (cgroup_is_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3338                         return -EOPNOTSUPP;
3339         } else {
3340                 /*
3341                  * Threaded controllers can handle internal competitions
3342                  * and are always allowed inside a (prospective) thread
3343                  * subtree.
3344                  */
3345                 if (cgroup_can_be_thread_root(cgrp) || cgroup_is_threaded(cgrp))
3346                         return 0;
3347         }
3348
3349         /*
3350          * Controllers can't be enabled for a cgroup with tasks to avoid
3351          * child cgroups competing against tasks.
3352          */
3353         if (cgroup_has_tasks(cgrp))
3354                 return -EBUSY;
3355
3356         return 0;
3357 }
3358
3359 /* change the enabled child controllers for a cgroup in the default hierarchy */
3360 static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
3361                                             char *buf, size_t nbytes,
3362                                             loff_t off)
3363 {
3364         u16 enable = 0, disable = 0;
3365         struct cgroup *cgrp, *child;
3366         struct cgroup_subsys *ss;
3367         char *tok;
3368         int ssid, ret;
3369
3370         /*
3371          * Parse input - space separated list of subsystem names prefixed
3372          * with either + or -.
3373          */
3374         buf = strstrip(buf);
3375         while ((tok = strsep(&buf, " "))) {
3376                 if (tok[0] == '\0')
3377                         continue;
3378                 do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) {
3379                         if (!cgroup_ssid_enabled(ssid) ||
3380                             strcmp(tok + 1, ss->name))
3381                                 continue;
3382
3383                         if (*tok == '+') {
3384                                 enable |= 1 << ssid;
3385                                 disable &= ~(1 << ssid);
3386                         } else if (*tok == '-') {
3387                                 disable |= 1 << ssid;
3388                                 enable &= ~(1 << ssid);
3389                         } else {
3390                                 return -EINVAL;
3391                         }
3392                         break;
3393                 } while_each_subsys_mask();
3394                 if (ssid == CGROUP_SUBSYS_COUNT)
3395                         return -EINVAL;
3396         }
3397
3398         cgrp = cgroup_kn_lock_live(of->kn, true);
3399         if (!cgrp)
3400                 return -ENODEV;
3401
3402         for_each_subsys(ss, ssid) {
3403                 if (enable & (1 << ssid)) {
3404                         if (cgrp->subtree_control & (1 << ssid)) {
3405                                 enable &= ~(1 << ssid);
3406                                 continue;
3407                         }
3408
3409                         if (!(cgroup_control(cgrp) & (1 << ssid))) {
3410                                 ret = -ENOENT;
3411                                 goto out_unlock;
3412                         }
3413                 } else if (disable & (1 << ssid)) {
3414                         if (!(cgrp->subtree_control & (1 << ssid))) {
3415                                 disable &= ~(1 << ssid);
3416                                 continue;
3417                         }
3418
3419                         /* a child has it enabled? */
3420                         cgroup_for_each_live_child(child, cgrp) {
3421                                 if (child->subtree_control & (1 << ssid)) {
3422                                         ret = -EBUSY;
3423                                         goto out_unlock;
3424                                 }
3425                         }
3426                 }
3427         }
3428
3429         if (!enable && !disable) {
3430                 ret = 0;
3431                 goto out_unlock;
3432         }
3433
3434         ret = cgroup_vet_subtree_control_enable(cgrp, enable);
3435         if (ret)
3436                 goto out_unlock;
3437
3438         /* save and update control masks and prepare csses */
3439         cgroup_save_control(cgrp);
3440
3441         cgrp->subtree_control |= enable;
3442         cgrp->subtree_control &= ~disable;
3443
3444         ret = cgroup_apply_control(cgrp);
3445         cgroup_finalize_control(cgrp, ret);
3446         if (ret)
3447                 goto out_unlock;
3448
3449         kernfs_activate(cgrp->kn);
3450 out_unlock:
3451         cgroup_kn_unlock(of->kn);
3452         return ret ?: nbytes;
3453 }
3454
3455 /**
3456  * cgroup_enable_threaded - make @cgrp threaded
3457  * @cgrp: the target cgroup
3458  *
3459  * Called when "threaded" is written to the cgroup.type interface file and
3460  * tries to make @cgrp threaded and join the parent's resource domain.
3461  * This function is never called on the root cgroup as cgroup.type doesn't
3462  * exist on it.
3463  */
3464 static int cgroup_enable_threaded(struct cgroup *cgrp)
3465 {
3466         struct cgroup *parent = cgroup_parent(cgrp);
3467         struct cgroup *dom_cgrp = parent->dom_cgrp;
3468         struct cgroup *dsct;
3469         struct cgroup_subsys_state *d_css;
3470         int ret;
3471
3472         lockdep_assert_held(&cgroup_mutex);
3473
3474         /* noop if already threaded */
3475         if (cgroup_is_threaded(cgrp))
3476                 return 0;
3477
3478         /*
3479          * If @cgroup is populated or has domain controllers enabled, it
3480          * can't be switched.  While the below cgroup_can_be_thread_root()
3481          * test can catch the same conditions, that's only when @parent is
3482          * not mixable, so let's check it explicitly.
3483          */
3484         if (cgroup_is_populated(cgrp) ||
3485             cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
3486                 return -EOPNOTSUPP;
3487
3488         /* we're joining the parent's domain, ensure its validity */
3489         if (!cgroup_is_valid_domain(dom_cgrp) ||
3490             !cgroup_can_be_thread_root(dom_cgrp))
3491                 return -EOPNOTSUPP;
3492
3493         /*
3494          * The following shouldn't cause actual migrations and should
3495          * always succeed.
3496          */
3497         cgroup_save_control(cgrp);
3498
3499         cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp)
3500                 if (dsct == cgrp || cgroup_is_threaded(dsct))
3501                         dsct->dom_cgrp = dom_cgrp;
3502
3503         ret = cgroup_apply_control(cgrp);
3504         if (!ret)
3505                 parent->nr_threaded_children++;
3506
3507         cgroup_finalize_control(cgrp, ret);
3508         return ret;
3509 }
3510
3511 static int cgroup_type_show(struct seq_file *seq, void *v)
3512 {
3513         struct cgroup *cgrp = seq_css(seq)->cgroup;
3514
3515         if (cgroup_is_threaded(cgrp))
3516                 seq_puts(seq, "threaded\n");
3517         else if (!cgroup_is_valid_domain(cgrp))
3518                 seq_puts(seq, "domain invalid\n");
3519         else if (cgroup_is_thread_root(cgrp))
3520                 seq_puts(seq, "domain threaded\n");
3521         else
3522                 seq_puts(seq, "domain\n");
3523
3524         return 0;
3525 }
3526
3527 static ssize_t cgroup_type_write(struct kernfs_open_file *of, char *buf,
3528                                  size_t nbytes, loff_t off)
3529 {
3530         struct cgroup *cgrp;
3531         int ret;
3532
3533         /* only switching to threaded mode is supported */
3534         if (strcmp(strstrip(buf), "threaded"))
3535                 return -EINVAL;
3536
3537         /* drain dying csses before we re-apply (threaded) subtree control */
3538         cgrp = cgroup_kn_lock_live(of->kn, true);
3539         if (!cgrp)
3540                 return -ENOENT;
3541
3542         /* threaded can only be enabled */
3543         ret = cgroup_enable_threaded(cgrp);
3544
3545         cgroup_kn_unlock(of->kn);
3546         return ret ?: nbytes;
3547 }
3548
3549 static int cgroup_max_descendants_show(struct seq_file *seq, void *v)
3550 {
3551         struct cgroup *cgrp = seq_css(seq)->cgroup;
3552         int descendants = READ_ONCE(cgrp->max_descendants);
3553
3554         if (descendants == INT_MAX)
3555                 seq_puts(seq, "max\n");
3556         else
3557                 seq_printf(seq, "%d\n", descendants);
3558
3559         return 0;
3560 }
3561
3562 static ssize_t cgroup_max_descendants_write(struct kernfs_open_file *of,
3563                                            char *buf, size_t nbytes, loff_t off)
3564 {
3565         struct cgroup *cgrp;
3566         int descendants;
3567         ssize_t ret;
3568
3569         buf = strstrip(buf);
3570         if (!strcmp(buf, "max")) {
3571                 descendants = INT_MAX;
3572         } else {
3573                 ret = kstrtoint(buf, 0, &descendants);
3574                 if (ret)
3575                         return ret;
3576         }
3577
3578         if (descendants < 0)
3579                 return -ERANGE;
3580
3581         cgrp = cgroup_kn_lock_live(of->kn, false);
3582         if (!cgrp)
3583                 return -ENOENT;
3584
3585         cgrp->max_descendants = descendants;
3586
3587         cgroup_kn_unlock(of->kn);
3588
3589         return nbytes;
3590 }
3591
3592 static int cgroup_max_depth_show(struct seq_file *seq, void *v)
3593 {
3594         struct cgroup *cgrp = seq_css(seq)->cgroup;
3595         int depth = READ_ONCE(cgrp->max_depth);
3596
3597         if (depth == INT_MAX)
3598                 seq_puts(seq, "max\n");
3599         else
3600                 seq_printf(seq, "%d\n", depth);
3601
3602         return 0;
3603 }
3604
3605 static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of,
3606                                       char *buf, size_t nbytes, loff_t off)
3607 {
3608         struct cgroup *cgrp;
3609         ssize_t ret;
3610         int depth;
3611
3612         buf = strstrip(buf);
3613         if (!strcmp(buf, "max")) {
3614                 depth = INT_MAX;
3615         } else {
3616                 ret = kstrtoint(buf, 0, &depth);
3617                 if (ret)
3618                         return ret;
3619         }
3620
3621         if (depth < 0)
3622                 return -ERANGE;
3623
3624         cgrp = cgroup_kn_lock_live(of->kn, false);
3625         if (!cgrp)
3626                 return -ENOENT;
3627
3628         cgrp->max_depth = depth;
3629
3630         cgroup_kn_unlock(of->kn);
3631
3632         return nbytes;
3633 }
3634
3635 static int cgroup_events_show(struct seq_file *seq, void *v)
3636 {
3637         struct cgroup *cgrp = seq_css(seq)->cgroup;
3638
3639         seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp));
3640         seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags));
3641
3642         return 0;
3643 }
3644
3645 static int cgroup_stat_show(struct seq_file *seq, void *v)
3646 {
3647         struct cgroup *cgroup = seq_css(seq)->cgroup;
3648
3649         seq_printf(seq, "nr_descendants %d\n",
3650                    cgroup->nr_descendants);
3651         seq_printf(seq, "nr_dying_descendants %d\n",
3652                    cgroup->nr_dying_descendants);
3653
3654         return 0;
3655 }
3656
3657 static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
3658                                                  struct cgroup *cgrp, int ssid)
3659 {
3660         struct cgroup_subsys *ss = cgroup_subsys[ssid];
3661         struct cgroup_subsys_state *css;
3662         int ret;
3663
3664         if (!ss->css_extra_stat_show)
3665                 return 0;
3666
3667         css = cgroup_tryget_css(cgrp, ss);
3668         if (!css)
3669                 return 0;
3670
3671         ret = ss->css_extra_stat_show(seq, css);
3672         css_put(css);
3673         return ret;
3674 }
3675
/*
 * Show the base cputime statistics and, when CONFIG_CGROUP_SCHED is set,
 * whatever the cpu controller adds through cgroup_extra_stat_show().
 * Without CONFIG_CGROUP_SCHED this always returns 0.
 */
static int cpu_stat_show(struct seq_file *seq, void *v)
{
	cgroup_base_stat_cputime_show(seq);
#ifdef CONFIG_CGROUP_SCHED
	return cgroup_extra_stat_show(seq, seq_css(seq)->cgroup, cpu_cgrp_id);
#else
	return 0;
#endif
}
3687
3688 #ifdef CONFIG_PSI
3689 static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
3690 {
3691         struct cgroup *cgrp = seq_css(seq)->cgroup;
3692         struct psi_group *psi = cgroup_psi(cgrp);
3693
3694         return psi_show(seq, psi, PSI_IO);
3695 }
3696 static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
3697 {
3698         struct cgroup *cgrp = seq_css(seq)->cgroup;
3699         struct psi_group *psi = cgroup_psi(cgrp);
3700
3701         return psi_show(seq, psi, PSI_MEM);
3702 }
3703 static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
3704 {
3705         struct cgroup *cgrp = seq_css(seq)->cgroup;
3706         struct psi_group *psi = cgroup_psi(cgrp);
3707
3708         return psi_show(seq, psi, PSI_CPU);
3709 }
3710
3711 static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
3712                               size_t nbytes, enum psi_res res)
3713 {
3714         struct cgroup_file_ctx *ctx = of->priv;
3715         struct psi_trigger *new;
3716         struct cgroup *cgrp;
3717         struct psi_group *psi;
3718
3719         cgrp = cgroup_kn_lock_live(of->kn, false);
3720         if (!cgrp)
3721                 return -ENODEV;
3722
3723         cgroup_get(cgrp);
3724         cgroup_kn_unlock(of->kn);
3725
3726         /* Allow only one trigger per file descriptor */
3727         if (ctx->psi.trigger) {
3728                 cgroup_put(cgrp);
3729                 return -EBUSY;
3730         }
3731
3732         psi = cgroup_psi(cgrp);
3733         new = psi_trigger_create(psi, buf, res, of->file, of);
3734         if (IS_ERR(new)) {
3735                 cgroup_put(cgrp);
3736                 return PTR_ERR(new);
3737         }
3738
3739         smp_store_release(&ctx->psi.trigger, new);
3740         cgroup_put(cgrp);
3741
3742         return nbytes;
3743 }
3744
3745 static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
3746                                           char *buf, size_t nbytes,
3747                                           loff_t off)
3748 {
3749         return pressure_write(of, buf, nbytes, PSI_IO);
3750 }
3751
3752 static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
3753                                           char *buf, size_t nbytes,
3754                                           loff_t off)
3755 {
3756         return pressure_write(of, buf, nbytes, PSI_MEM);
3757 }
3758
3759 static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
3760                                           char *buf, size_t nbytes,
3761                                           loff_t off)
3762 {
3763         return pressure_write(of, buf, nbytes, PSI_CPU);
3764 }
3765
3766 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
3767 static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
3768 {
3769         struct cgroup *cgrp = seq_css(seq)->cgroup;
3770         struct psi_group *psi = cgroup_psi(cgrp);
3771
3772         return psi_show(seq, psi, PSI_IRQ);
3773 }
3774
3775 static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
3776                                          char *buf, size_t nbytes,
3777                                          loff_t off)
3778 {
3779         return pressure_write(of, buf, nbytes, PSI_IRQ);
3780 }
3781 #endif
3782
3783 static int cgroup_pressure_show(struct seq_file *seq, void *v)
3784 {
3785         struct cgroup *cgrp = seq_css(seq)->cgroup;
3786         struct psi_group *psi = cgroup_psi(cgrp);
3787
3788         seq_printf(seq, "%d\n", psi->enabled);
3789
3790         return 0;
3791 }
3792
3793 static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
3794                                      char *buf, size_t nbytes,
3795                                      loff_t off)
3796 {
3797         ssize_t ret;
3798         int enable;
3799         struct cgroup *cgrp;
3800         struct psi_group *psi;
3801
3802         ret = kstrtoint(strstrip(buf), 0, &enable);
3803         if (ret)
3804                 return ret;
3805
3806         if (enable < 0 || enable > 1)
3807                 return -ERANGE;
3808
3809         cgrp = cgroup_kn_lock_live(of->kn, false);
3810         if (!cgrp)
3811                 return -ENOENT;
3812
3813         psi = cgroup_psi(cgrp);
3814         if (psi->enabled != enable) {
3815                 int i;
3816
3817                 /* show or hide {cpu,memory,io,irq}.pressure files */
3818                 for (i = 0; i < NR_PSI_RESOURCES; i++)
3819                         cgroup_file_show(&cgrp->psi_files[i], enable);
3820
3821                 psi->enabled = enable;
3822                 if (enable)
3823                         psi_cgroup_restart(psi);
3824         }
3825
3826         cgroup_kn_unlock(of->kn);
3827
3828         return nbytes;
3829 }
3830
3831 static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
3832                                           poll_table *pt)
3833 {
3834         struct cgroup_file_ctx *ctx = of->priv;
3835
3836         return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
3837 }
3838
3839 static int cgroup_pressure_open(struct kernfs_open_file *of)
3840 {
3841         if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
3842                 return -EPERM;
3843
3844         return 0;
3845 }
3846
3847 static void cgroup_pressure_release(struct kernfs_open_file *of)
3848 {
3849         struct cgroup_file_ctx *ctx = of->priv;
3850
3851         psi_trigger_destroy(ctx->psi.trigger);
3852 }
3853
3854 bool cgroup_psi_enabled(void)
3855 {
3856         if (static_branch_likely(&psi_disabled))
3857                 return false;
3858
3859         return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
3860 }
3861
3862 #else /* CONFIG_PSI */
3863 bool cgroup_psi_enabled(void)
3864 {
3865         return false;
3866 }
3867
3868 #endif /* CONFIG_PSI */
3869
3870 static int cgroup_freeze_show(struct seq_file *seq, void *v)
3871 {
3872         struct cgroup *cgrp = seq_css(seq)->cgroup;
3873
3874         seq_printf(seq, "%d\n", cgrp->freezer.freeze);
3875
3876         return 0;
3877 }
3878
3879 static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
3880                                    char *buf, size_t nbytes, loff_t off)
3881 {
3882         struct cgroup *cgrp;
3883         ssize_t ret;
3884         int freeze;
3885
3886         ret = kstrtoint(strstrip(buf), 0, &freeze);
3887         if (ret)
3888                 return ret;
3889
3890         if (freeze < 0 || freeze > 1)
3891                 return -ERANGE;
3892
3893         cgrp = cgroup_kn_lock_live(of->kn, false);
3894         if (!cgrp)
3895                 return -ENOENT;
3896
3897         cgroup_freeze(cgrp, freeze);
3898
3899         cgroup_kn_unlock(of->kn);
3900
3901         return nbytes;
3902 }
3903
3904 static void __cgroup_kill(struct cgroup *cgrp)
3905 {
3906         struct css_task_iter it;
3907         struct task_struct *task;
3908
3909         lockdep_assert_held(&cgroup_mutex);
3910
3911         spin_lock_irq(&css_set_lock);
3912         set_bit(CGRP_KILL, &cgrp->flags);
3913         spin_unlock_irq(&css_set_lock);
3914
3915         css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
3916         while ((task = css_task_iter_next(&it))) {
3917                 /* Ignore kernel threads here. */
3918                 if (task->flags & PF_KTHREAD)
3919                         continue;
3920
3921                 /* Skip tasks that are already dying. */
3922                 if (__fatal_signal_pending(task))
3923                         continue;
3924
3925                 send_sig(SIGKILL, task, 0);
3926         }
3927         css_task_iter_end(&it);
3928
3929         spin_lock_irq(&css_set_lock);
3930         clear_bit(CGRP_KILL, &cgrp->flags);
3931         spin_unlock_irq(&css_set_lock);
3932 }
3933
3934 static void cgroup_kill(struct cgroup *cgrp)
3935 {
3936         struct cgroup_subsys_state *css;
3937         struct cgroup *dsct;
3938
3939         lockdep_assert_held(&cgroup_mutex);
3940
3941         cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
3942                 __cgroup_kill(dsct);
3943 }
3944
3945 static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
3946                                  size_t nbytes, loff_t off)
3947 {
3948         ssize_t ret = 0;
3949         int kill;
3950         struct cgroup *cgrp;
3951
3952         ret = kstrtoint(strstrip(buf), 0, &kill);
3953         if (ret)
3954                 return ret;
3955
3956         if (kill != 1)
3957                 return -ERANGE;
3958
3959         cgrp = cgroup_kn_lock_live(of->kn, false);
3960         if (!cgrp)
3961                 return -ENOENT;
3962
3963         /*
3964          * Killing is a process directed operation, i.e. the whole thread-group
3965          * is taken down so act like we do for cgroup.procs and only make this
3966          * writable in non-threaded cgroups.
3967          */
3968         if (cgroup_is_threaded(cgrp))
3969                 ret = -EOPNOTSUPP;
3970         else
3971                 cgroup_kill(cgrp);
3972
3973         cgroup_kn_unlock(of->kn);
3974
3975         return ret ?: nbytes;
3976 }
3977
3978 static int cgroup_file_open(struct kernfs_open_file *of)
3979 {
3980         struct cftype *cft = of_cft(of);
3981         struct cgroup_file_ctx *ctx;
3982         int ret;
3983
3984         ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
3985         if (!ctx)
3986                 return -ENOMEM;
3987
3988         ctx->ns = current->nsproxy->cgroup_ns;
3989         get_cgroup_ns(ctx->ns);
3990         of->priv = ctx;
3991
3992         if (!cft->open)
3993                 return 0;
3994
3995         ret = cft->open(of);
3996         if (ret) {
3997                 put_cgroup_ns(ctx->ns);
3998                 kfree(ctx);
3999         }
4000         return ret;
4001 }
4002
4003 static void cgroup_file_release(struct kernfs_open_file *of)
4004 {
4005         struct cftype *cft = of_cft(of);
4006         struct cgroup_file_ctx *ctx = of->priv;
4007
4008         if (cft->release)
4009                 cft->release(of);
4010         put_cgroup_ns(ctx->ns);
4011         kfree(ctx);
4012 }
4013
4014 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
4015                                  size_t nbytes, loff_t off)
4016 {
4017         struct cgroup_file_ctx *ctx = of->priv;
4018         struct cgroup *cgrp = of->kn->parent->priv;
4019         struct cftype *cft = of_cft(of);
4020         struct cgroup_subsys_state *css;
4021         int ret;
4022
4023         if (!nbytes)
4024                 return 0;
4025
4026         /*
4027          * If namespaces are delegation boundaries, disallow writes to
4028          * files in an non-init namespace root from inside the namespace
4029          * except for the files explicitly marked delegatable -
4030          * cgroup.procs and cgroup.subtree_control.
4031          */
4032         if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
4033             !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
4034             ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
4035                 return -EPERM;
4036
4037         if (cft->write)
4038                 return cft->write(of, buf, nbytes, off);
4039
4040         /*
4041          * kernfs guarantees that a file isn't deleted with operations in
4042          * flight, which means that the matching css is and stays alive and
4043          * doesn't need to be pinned.  The RCU locking is not necessary
4044          * either.  It's just for the convenience of using cgroup_css().
4045          */
4046         rcu_read_lock();
4047         css = cgroup_css(cgrp, cft->ss);
4048         rcu_read_unlock();
4049
4050         if (cft->write_u64) {
4051                 unsigned long long v;
4052                 ret = kstrtoull(buf, 0, &v);
4053                 if (!ret)
4054                         ret = cft->write_u64(css, cft, v);
4055         } else if (cft->write_s64) {
4056                 long long v;
4057                 ret = kstrtoll(buf, 0, &v);
4058                 if (!ret)
4059                         ret = cft->write_s64(css, cft, v);
4060         } else {
4061                 ret = -EINVAL;
4062         }
4063
4064         return ret ?: nbytes;
4065 }
4066
4067 static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
4068 {
4069         struct cftype *cft = of_cft(of);
4070
4071         if (cft->poll)
4072                 return cft->poll(of, pt);
4073
4074         return kernfs_generic_poll(of, pt);
4075 }
4076
4077 static void *cgroup_seqfile_start(struct seq_file *seq, loff_t *ppos)
4078 {
4079         return seq_cft(seq)->seq_start(seq, ppos);
4080 }
4081
4082 static void *cgroup_seqfile_next(struct seq_file *seq, void *v, loff_t *ppos)
4083 {
4084         return seq_cft(seq)->seq_next(seq, v, ppos);
4085 }
4086
4087 static void cgroup_seqfile_stop(struct seq_file *seq, void *v)
4088 {
4089         if (seq_cft(seq)->seq_stop)
4090                 seq_cft(seq)->seq_stop(seq, v);
4091 }
4092
4093 static int cgroup_seqfile_show(struct seq_file *m, void *arg)
4094 {
4095         struct cftype *cft = seq_cft(m);
4096         struct cgroup_subsys_state *css = seq_css(m);
4097
4098         if (cft->seq_show)
4099                 return cft->seq_show(m, arg);
4100
4101         if (cft->read_u64)
4102                 seq_printf(m, "%llu\n", cft->read_u64(css, cft));
4103         else if (cft->read_s64)
4104                 seq_printf(m, "%lld\n", cft->read_s64(css, cft));
4105         else
4106                 return -EINVAL;
4107         return 0;
4108 }
4109
4110 static struct kernfs_ops cgroup_kf_single_ops = {
4111         .atomic_write_len       = PAGE_SIZE,
4112         .open                   = cgroup_file_open,
4113         .release                = cgroup_file_release,
4114         .write                  = cgroup_file_write,
4115         .poll                   = cgroup_file_poll,
4116         .seq_show               = cgroup_seqfile_show,
4117 };
4118
4119 static struct kernfs_ops cgroup_kf_ops = {
4120         .atomic_write_len       = PAGE_SIZE,
4121         .open                   = cgroup_file_open,
4122         .release                = cgroup_file_release,
4123         .write                  = cgroup_file_write,
4124         .poll                   = cgroup_file_poll,
4125         .seq_start              = cgroup_seqfile_start,
4126         .seq_next               = cgroup_seqfile_next,
4127         .seq_stop               = cgroup_seqfile_stop,
4128         .seq_show               = cgroup_seqfile_show,
4129 };
4130
4131 /* set uid and gid of cgroup dirs and files to that of the creator */
4132 static int cgroup_kn_set_ugid(struct kernfs_node *kn)
4133 {
4134         struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
4135                                .ia_uid = current_fsuid(),
4136                                .ia_gid = current_fsgid(), };
4137
4138         if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
4139             gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
4140                 return 0;
4141
4142         return kernfs_setattr(kn, &iattr);
4143 }
4144
4145 static void cgroup_file_notify_timer(struct timer_list *timer)
4146 {
4147         cgroup_file_notify(container_of(timer, struct cgroup_file,
4148                                         notify_timer));
4149 }
4150
4151 static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
4152                            struct cftype *cft)
4153 {
4154         char name[CGROUP_FILE_NAME_MAX];
4155         struct kernfs_node *kn;
4156         struct lock_class_key *key = NULL;
4157         int ret;
4158
4159 #ifdef CONFIG_DEBUG_LOCK_ALLOC
4160         key = &cft->lockdep_key;
4161 #endif
4162         kn = __kernfs_create_file(cgrp->kn, cgroup_file_name(cgrp, cft, name),
4163                                   cgroup_file_mode(cft),
4164                                   GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
4165                                   0, cft->kf_ops, cft,
4166                                   NULL, key);
4167         if (IS_ERR(kn))
4168                 return PTR_ERR(kn);
4169
4170         ret = cgroup_kn_set_ugid(kn);
4171         if (ret) {
4172                 kernfs_remove(kn);
4173                 return ret;
4174         }
4175
4176         if (cft->file_offset) {
4177                 struct cgroup_file *cfile = (void *)css + cft->file_offset;
4178
4179                 timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0);
4180
4181                 spin_lock_irq(&cgroup_file_kn_lock);
4182                 cfile->kn = kn;
4183                 spin_unlock_irq(&cgroup_file_kn_lock);
4184         }
4185
4186         return 0;
4187 }
4188
4189 /**
4190  * cgroup_addrm_files - add or remove files to a cgroup directory
4191  * @css: the target css
4192  * @cgrp: the target cgroup (usually css->cgroup)
4193  * @cfts: array of cftypes to be added
4194  * @is_add: whether to add or remove
4195  *
4196  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
4197  * For removals, this function never fails.
4198  */
4199 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
4200                               struct cgroup *cgrp, struct cftype cfts[],
4201                               bool is_add)
4202 {
4203         struct cftype *cft, *cft_end = NULL;
4204         int ret = 0;
4205
4206         lockdep_assert_held(&cgroup_mutex);
4207
4208 restart:
4209         for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
4210                 /* does cft->flags tell us to skip this file on @cgrp? */
4211                 if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
4212                         continue;
4213                 if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
4214                         continue;
4215                 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
4216                         continue;
4217                 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgroup_parent(cgrp))
4218                         continue;
4219                 if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug)
4220                         continue;
4221                 if (is_add) {
4222                         ret = cgroup_add_file(css, cgrp, cft);
4223                         if (ret) {
4224                                 pr_warn("%s: failed to add %s, err=%d\n",
4225                                         __func__, cft->name, ret);
4226                                 cft_end = cft;
4227                                 is_add = false;
4228                                 goto restart;
4229                         }
4230                 } else {
4231                         cgroup_rm_file(cgrp, cft);
4232                 }
4233         }
4234         return ret;
4235 }
4236
4237 static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
4238 {
4239         struct cgroup_subsys *ss = cfts[0].ss;
4240         struct cgroup *root = &ss->root->cgrp;
4241         struct cgroup_subsys_state *css;
4242         int ret = 0;
4243
4244         lockdep_assert_held(&cgroup_mutex);
4245
4246         /* add/rm files for all cgroups created before */
4247         css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
4248                 struct cgroup *cgrp = css->cgroup;
4249
4250                 if (!(css->flags & CSS_VISIBLE))
4251                         continue;
4252
4253                 ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
4254                 if (ret)
4255                         break;
4256         }
4257
4258         if (is_add && !ret)
4259                 kernfs_activate(root->kn);
4260         return ret;
4261 }
4262
4263 static void cgroup_exit_cftypes(struct cftype *cfts)
4264 {
4265         struct cftype *cft;
4266
4267         for (cft = cfts; cft->name[0] != '\0'; cft++) {
4268                 /* free copy for custom atomic_write_len, see init_cftypes() */
4269                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE)
4270                         kfree(cft->kf_ops);
4271                 cft->kf_ops = NULL;
4272                 cft->ss = NULL;
4273
4274                 /* revert flags set by cgroup core while adding @cfts */
4275                 cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
4276                                 __CFTYPE_ADDED);
4277         }
4278 }
4279
4280 static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4281 {
4282         struct cftype *cft;
4283         int ret = 0;
4284
4285         for (cft = cfts; cft->name[0] != '\0'; cft++) {
4286                 struct kernfs_ops *kf_ops;
4287
4288                 WARN_ON(cft->ss || cft->kf_ops);
4289
4290                 if (cft->flags & __CFTYPE_ADDED) {
4291                         ret = -EBUSY;
4292                         break;
4293                 }
4294
4295                 if (cft->seq_start)
4296                         kf_ops = &cgroup_kf_ops;
4297                 else
4298                         kf_ops = &cgroup_kf_single_ops;
4299
4300                 /*
4301                  * Ugh... if @cft wants a custom max_write_len, we need to
4302                  * make a copy of kf_ops to set its atomic_write_len.
4303                  */
4304                 if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
4305                         kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
4306                         if (!kf_ops) {
4307                                 ret = -ENOMEM;
4308                                 break;
4309                         }
4310                         kf_ops->atomic_write_len = cft->max_write_len;
4311                 }
4312
4313                 cft->kf_ops = kf_ops;
4314                 cft->ss = ss;
4315                 cft->flags |= __CFTYPE_ADDED;
4316         }
4317
4318         if (ret)
4319                 cgroup_exit_cftypes(cfts);
4320         return ret;
4321 }
4322
4323 static int cgroup_rm_cftypes_locked(struct cftype *cfts)
4324 {
4325         lockdep_assert_held(&cgroup_mutex);
4326
4327         list_del(&cfts->node);
4328         cgroup_apply_cftypes(cfts, false);
4329         cgroup_exit_cftypes(cfts);
4330         return 0;
4331 }
4332
4333 /**
4334  * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
4335  * @cfts: zero-length name terminated array of cftypes
4336  *
4337  * Unregister @cfts.  Files described by @cfts are removed from all
4338  * existing cgroups and all future cgroups won't have them either.  This
4339  * function can be called anytime whether @cfts' subsys is attached or not.
4340  *
4341  * Returns 0 on successful unregistration, -ENOENT if @cfts is not
4342  * registered.
4343  */
4344 int cgroup_rm_cftypes(struct cftype *cfts)
4345 {
4346         int ret;
4347
4348         if (!cfts || cfts[0].name[0] == '\0')
4349                 return 0;
4350
4351         if (!(cfts[0].flags & __CFTYPE_ADDED))
4352                 return -ENOENT;
4353
4354         cgroup_lock();
4355         ret = cgroup_rm_cftypes_locked(cfts);
4356         cgroup_unlock();
4357         return ret;
4358 }
4359
4360 /**
4361  * cgroup_add_cftypes - add an array of cftypes to a subsystem
4362  * @ss: target cgroup subsystem
4363  * @cfts: zero-length name terminated array of cftypes
4364  *
4365  * Register @cfts to @ss.  Files described by @cfts are created for all
4366  * existing cgroups to which @ss is attached and all future cgroups will
4367  * have them too.  This function can be called anytime whether @ss is
4368  * attached or not.
4369  *
4370  * Returns 0 on successful registration, -errno on failure.  Note that this
4371  * function currently returns 0 as long as @cfts registration is successful
4372  * even if some file creation attempts on existing cgroups fail.
4373  */
4374 static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4375 {
4376         int ret;
4377
4378         if (!cgroup_ssid_enabled(ss->id))
4379                 return 0;
4380
4381         if (!cfts || cfts[0].name[0] == '\0')
4382                 return 0;
4383
4384         ret = cgroup_init_cftypes(ss, cfts);
4385         if (ret)
4386                 return ret;
4387
4388         cgroup_lock();
4389
4390         list_add_tail(&cfts->node, &ss->cfts);
4391         ret = cgroup_apply_cftypes(cfts, true);
4392         if (ret)
4393                 cgroup_rm_cftypes_locked(cfts);
4394
4395         cgroup_unlock();
4396         return ret;
4397 }
4398
4399 /**
4400  * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
4401  * @ss: target cgroup subsystem
4402  * @cfts: zero-length name terminated array of cftypes
4403  *
4404  * Similar to cgroup_add_cftypes() but the added files are only used for
4405  * the default hierarchy.
4406  */
4407 int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4408 {
4409         struct cftype *cft;
4410
4411         for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4412                 cft->flags |= __CFTYPE_ONLY_ON_DFL;
4413         return cgroup_add_cftypes(ss, cfts);
4414 }
4415
4416 /**
4417  * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
4418  * @ss: target cgroup subsystem
4419  * @cfts: zero-length name terminated array of cftypes
4420  *
4421  * Similar to cgroup_add_cftypes() but the added files are only used for
4422  * the legacy hierarchies.
4423  */
4424 int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
4425 {
4426         struct cftype *cft;
4427
4428         for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
4429                 cft->flags |= __CFTYPE_NOT_ON_DFL;
4430         return cgroup_add_cftypes(ss, cfts);
4431 }
4432
4433 /**
4434  * cgroup_file_notify - generate a file modified event for a cgroup_file
4435  * @cfile: target cgroup_file
4436  *
4437  * @cfile must have been obtained by setting cftype->file_offset.
4438  */
4439 void cgroup_file_notify(struct cgroup_file *cfile)
4440 {
4441         unsigned long flags;
4442
4443         spin_lock_irqsave(&cgroup_file_kn_lock, flags);
4444         if (cfile->kn) {
4445                 unsigned long last = cfile->notified_at;
4446                 unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV;
4447
4448                 if (time_in_range(jiffies, last, next)) {
4449                         timer_reduce(&cfile->notify_timer, next);
4450                 } else {
4451                         kernfs_notify(cfile->kn);
4452                         cfile->notified_at = jiffies;
4453                 }
4454         }
4455         spin_unlock_irqrestore(&cgroup_file_kn_lock, flags);
4456 }
4457
4458 /**
4459  * cgroup_file_show - show or hide a hidden cgroup file
4460  * @cfile: target cgroup_file obtained by setting cftype->file_offset
4461  * @show: whether to show or hide
4462  */
4463 void cgroup_file_show(struct cgroup_file *cfile, bool show)
4464 {
4465         struct kernfs_node *kn;
4466
4467         spin_lock_irq(&cgroup_file_kn_lock);
4468         kn = cfile->kn;
4469         kernfs_get(kn);
4470         spin_unlock_irq(&cgroup_file_kn_lock);
4471
4472         if (kn)
4473                 kernfs_show(kn, show);
4474
4475         kernfs_put(kn);
4476 }
4477
4478 /**
4479  * css_next_child - find the next child of a given css
4480  * @pos: the current position (%NULL to initiate traversal)
4481  * @parent: css whose children to walk
4482  *
4483  * This function returns the next child of @parent and should be called
4484  * under either cgroup_mutex or RCU read lock.  The only requirement is
4485  * that @parent and @pos are accessible.  The next sibling is guaranteed to
4486  * be returned regardless of their states.
4487  *
4488  * If a subsystem synchronizes ->css_online() and the start of iteration, a
4489  * css which finished ->css_online() is guaranteed to be visible in the
4490  * future iterations and will stay visible until the last reference is put.
4491  * A css which hasn't finished ->css_online() or already finished
4492  * ->css_offline() may show up during traversal.  It's each subsystem's
4493  * responsibility to synchronize against on/offlining.
4494  */
4495 struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
4496                                            struct cgroup_subsys_state *parent)
4497 {
4498         struct cgroup_subsys_state *next;
4499
4500         cgroup_assert_mutex_or_rcu_locked();
4501
4502         /*
4503          * @pos could already have been unlinked from the sibling list.
4504          * Once a cgroup is removed, its ->sibling.next is no longer
4505          * updated when its next sibling changes.  CSS_RELEASED is set when
4506          * @pos is taken off list, at which time its next pointer is valid,
4507          * and, as releases are serialized, the one pointed to by the next
4508          * pointer is guaranteed to not have started release yet.  This
4509          * implies that if we observe !CSS_RELEASED on @pos in this RCU
4510          * critical section, the one pointed to by its next pointer is
4511          * guaranteed to not have finished its RCU grace period even if we
4512          * have dropped rcu_read_lock() in-between iterations.
4513          *
4514          * If @pos has CSS_RELEASED set, its next pointer can't be
4515          * dereferenced; however, as each css is given a monotonically
4516          * increasing unique serial number and always appended to the
4517          * sibling list, the next one can be found by walking the parent's
4518          * children until the first css with higher serial number than
4519          * @pos's.  While this path can be slower, it happens iff iteration
4520          * races against release and the race window is very small.
4521          */
4522         if (!pos) {
4523                 next = list_entry_rcu(parent->children.next, struct cgroup_subsys_state, sibling);
4524         } else if (likely(!(pos->flags & CSS_RELEASED))) {
4525                 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
4526         } else {
4527                 list_for_each_entry_rcu(next, &parent->children, sibling,
4528                                         lockdep_is_held(&cgroup_mutex))
4529                         if (next->serial_nr > pos->serial_nr)
4530                                 break;
4531         }
4532
4533         /*
4534          * @next, if not pointing to the head, can be dereferenced and is
4535          * the next sibling.
4536          */
4537         if (&next->sibling != &parent->children)
4538                 return next;
4539         return NULL;
4540 }
4541
4542 /**
4543  * css_next_descendant_pre - find the next descendant for pre-order walk
4544  * @pos: the current position (%NULL to initiate traversal)
4545  * @root: css whose descendants to walk
4546  *
4547  * To be used by css_for_each_descendant_pre().  Find the next descendant
4548  * to visit for pre-order traversal of @root's descendants.  @root is
4549  * included in the iteration and the first node to be visited.
4550  *
4551  * While this function requires cgroup_mutex or RCU read locking, it
4552  * doesn't require the whole traversal to be contained in a single critical
4553  * section.  This function will return the correct next descendant as long
4554  * as both @pos and @root are accessible and @pos is a descendant of @root.
4555  *
4556  * If a subsystem synchronizes ->css_online() and the start of iteration, a
4557  * css which finished ->css_online() is guaranteed to be visible in the
4558  * future iterations and will stay visible until the last reference is put.
4559  * A css which hasn't finished ->css_online() or already finished
4560  * ->css_offline() may show up during traversal.  It's each subsystem's
4561  * responsibility to synchronize against on/offlining.
4562  */
4563 struct cgroup_subsys_state *
4564 css_next_descendant_pre(struct cgroup_subsys_state *pos,
4565                         struct cgroup_subsys_state *root)
4566 {
4567         struct cgroup_subsys_state *next;
4568
4569         cgroup_assert_mutex_or_rcu_locked();
4570
4571         /* if first iteration, visit @root */
4572         if (!pos)
4573                 return root;
4574
4575         /* visit the first child if exists */
4576         next = css_next_child(NULL, pos);
4577         if (next)
4578                 return next;
4579
4580         /* no child, visit my or the closest ancestor's next sibling */
4581         while (pos != root) {
4582                 next = css_next_child(pos, pos->parent);
4583                 if (next)
4584                         return next;
4585                 pos = pos->parent;
4586         }
4587
4588         return NULL;
4589 }
4590 EXPORT_SYMBOL_GPL(css_next_descendant_pre);
4591
4592 /**
4593  * css_rightmost_descendant - return the rightmost descendant of a css
4594  * @pos: css of interest
4595  *
4596  * Return the rightmost descendant of @pos.  If there's no descendant, @pos
4597  * is returned.  This can be used during pre-order traversal to skip
4598  * subtree of @pos.
4599  *
4600  * While this function requires cgroup_mutex or RCU read locking, it
4601  * doesn't require the whole traversal to be contained in a single critical
4602  * section.  This function will return the correct rightmost descendant as
4603  * long as @pos is accessible.
4604  */
4605 struct cgroup_subsys_state *
4606 css_rightmost_descendant(struct cgroup_subsys_state *pos)
4607 {
4608         struct cgroup_subsys_state *last, *tmp;
4609
4610         cgroup_assert_mutex_or_rcu_locked();
4611
4612         do {
4613                 last = pos;
4614                 /* ->prev isn't RCU safe, walk ->next till the end */
4615                 pos = NULL;
4616                 css_for_each_child(tmp, last)
4617                         pos = tmp;
4618         } while (pos);
4619
4620         return last;
4621 }
4622
4623 static struct cgroup_subsys_state *
4624 css_leftmost_descendant(struct cgroup_subsys_state *pos)
4625 {
4626         struct cgroup_subsys_state *last;
4627
4628         do {
4629                 last = pos;
4630                 pos = css_next_child(NULL, pos);
4631         } while (pos);
4632
4633         return last;
4634 }
4635
4636 /**
4637  * css_next_descendant_post - find the next descendant for post-order walk
4638  * @pos: the current position (%NULL to initiate traversal)
4639  * @root: css whose descendants to walk
4640  *
4641  * To be used by css_for_each_descendant_post().  Find the next descendant
4642  * to visit for post-order traversal of @root's descendants.  @root is
4643  * included in the iteration and the last node to be visited.
4644  *
4645  * While this function requires cgroup_mutex or RCU read locking, it
4646  * doesn't require the whole traversal to be contained in a single critical
4647  * section.  This function will return the correct next descendant as long
4648  * as both @pos and @cgroup are accessible and @pos is a descendant of
4649  * @cgroup.
4650  *
4651  * If a subsystem synchronizes ->css_online() and the start of iteration, a
4652  * css which finished ->css_online() is guaranteed to be visible in the
4653  * future iterations and will stay visible until the last reference is put.
4654  * A css which hasn't finished ->css_online() or already finished
4655  * ->css_offline() may show up during traversal.  It's each subsystem's
4656  * responsibility to synchronize against on/offlining.
4657  */
4658 struct cgroup_subsys_state *
4659 css_next_descendant_post(struct cgroup_subsys_state *pos,
4660                          struct cgroup_subsys_state *root)
4661 {
4662         struct cgroup_subsys_state *next;
4663
4664         cgroup_assert_mutex_or_rcu_locked();
4665
4666         /* if first iteration, visit leftmost descendant which may be @root */
4667         if (!pos)
4668                 return css_leftmost_descendant(root);
4669
4670         /* if we visited @root, we're done */
4671         if (pos == root)
4672                 return NULL;
4673
4674         /* if there's an unvisited sibling, visit its leftmost descendant */
4675         next = css_next_child(pos, pos->parent);
4676         if (next)
4677                 return css_leftmost_descendant(next);
4678
4679         /* no sibling left, visit parent */
4680         return pos->parent;
4681 }
4682
4683 /**
4684  * css_has_online_children - does a css have online children
4685  * @css: the target css
4686  *
4687  * Returns %true if @css has any online children; otherwise, %false.  This
4688  * function can be called from any context but the caller is responsible
4689  * for synchronizing against on/offlining as necessary.
4690  */
4691 bool css_has_online_children(struct cgroup_subsys_state *css)
4692 {
4693         struct cgroup_subsys_state *child;
4694         bool ret = false;
4695
4696         rcu_read_lock();
4697         css_for_each_child(child, css) {
4698                 if (child->flags & CSS_ONLINE) {
4699                         ret = true;
4700                         break;
4701                 }
4702         }
4703         rcu_read_unlock();
4704         return ret;
4705 }
4706
4707 static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
4708 {
4709         struct list_head *l;
4710         struct cgrp_cset_link *link;
4711         struct css_set *cset;
4712
4713         lockdep_assert_held(&css_set_lock);
4714
4715         /* find the next threaded cset */
4716         if (it->tcset_pos) {
4717                 l = it->tcset_pos->next;
4718
4719                 if (l != it->tcset_head) {
4720                         it->tcset_pos = l;
4721                         return container_of(l, struct css_set,
4722                                             threaded_csets_node);
4723                 }
4724
4725                 it->tcset_pos = NULL;
4726         }
4727
4728         /* find the next cset */
4729         l = it->cset_pos;
4730         l = l->next;
4731         if (l == it->cset_head) {
4732                 it->cset_pos = NULL;
4733                 return NULL;
4734         }
4735
4736         if (it->ss) {
4737                 cset = container_of(l, struct css_set, e_cset_node[it->ss->id]);
4738         } else {
4739                 link = list_entry(l, struct cgrp_cset_link, cset_link);
4740                 cset = link->cset;
4741         }
4742
4743         it->cset_pos = l;
4744
4745         /* initialize threaded css_set walking */
4746         if (it->flags & CSS_TASK_ITER_THREADED) {
4747                 if (it->cur_dcset)
4748                         put_css_set_locked(it->cur_dcset);
4749                 it->cur_dcset = cset;
4750                 get_css_set(cset);
4751
4752                 it->tcset_head = &cset->threaded_csets;
4753                 it->tcset_pos = &cset->threaded_csets;
4754         }
4755
4756         return cset;
4757 }
4758
4759 /**
4760  * css_task_iter_advance_css_set - advance a task iterator to the next css_set
4761  * @it: the iterator to advance
4762  *
4763  * Advance @it to the next css_set to walk.
4764  */
4765 static void css_task_iter_advance_css_set(struct css_task_iter *it)
4766 {
4767         struct css_set *cset;
4768
4769         lockdep_assert_held(&css_set_lock);
4770
4771         /* Advance to the next non-empty css_set and find first non-empty tasks list*/
4772         while ((cset = css_task_iter_next_css_set(it))) {
4773                 if (!list_empty(&cset->tasks)) {
4774                         it->cur_tasks_head = &cset->tasks;
4775                         break;
4776                 } else if (!list_empty(&cset->mg_tasks)) {
4777                         it->cur_tasks_head = &cset->mg_tasks;
4778                         break;
4779                 } else if (!list_empty(&cset->dying_tasks)) {
4780                         it->cur_tasks_head = &cset->dying_tasks;
4781                         break;
4782                 }
4783         }
4784         if (!cset) {
4785                 it->task_pos = NULL;
4786                 return;
4787         }
4788         it->task_pos = it->cur_tasks_head->next;
4789
4790         /*
4791          * We don't keep css_sets locked across iteration steps and thus
4792          * need to take steps to ensure that iteration can be resumed after
4793          * the lock is re-acquired.  Iteration is performed at two levels -
4794          * css_sets and tasks in them.
4795          *
4796          * Once created, a css_set never leaves its cgroup lists, so a
4797          * pinned css_set is guaranteed to stay put and we can resume
4798          * iteration afterwards.
4799          *
4800          * Tasks may leave @cset across iteration steps.  This is resolved
4801          * by registering each iterator with the css_set currently being
4802          * walked and making css_set_move_task() advance iterators whose
4803          * next task is leaving.
4804          */
4805         if (it->cur_cset) {
4806                 list_del(&it->iters_node);
4807                 put_css_set_locked(it->cur_cset);
4808         }
4809         get_css_set(cset);
4810         it->cur_cset = cset;
4811         list_add(&it->iters_node, &cset->task_iters);
4812 }
4813
4814 static void css_task_iter_skip(struct css_task_iter *it,
4815                                struct task_struct *task)
4816 {
4817         lockdep_assert_held(&css_set_lock);
4818
4819         if (it->task_pos == &task->cg_list) {
4820                 it->task_pos = it->task_pos->next;
4821                 it->flags |= CSS_TASK_ITER_SKIPPED;
4822         }
4823 }
4824
4825 static void css_task_iter_advance(struct css_task_iter *it)
4826 {
4827         struct task_struct *task;
4828
4829         lockdep_assert_held(&css_set_lock);
4830 repeat:
4831         if (it->task_pos) {
4832                 /*
4833                  * Advance iterator to find next entry. We go through cset
4834                  * tasks, mg_tasks and dying_tasks, when consumed we move onto
4835                  * the next cset.
4836                  */
4837                 if (it->flags & CSS_TASK_ITER_SKIPPED)
4838                         it->flags &= ~CSS_TASK_ITER_SKIPPED;
4839                 else
4840                         it->task_pos = it->task_pos->next;
4841
4842                 if (it->task_pos == &it->cur_cset->tasks) {
4843                         it->cur_tasks_head = &it->cur_cset->mg_tasks;
4844                         it->task_pos = it->cur_tasks_head->next;
4845                 }
4846                 if (it->task_pos == &it->cur_cset->mg_tasks) {
4847                         it->cur_tasks_head = &it->cur_cset->dying_tasks;
4848                         it->task_pos = it->cur_tasks_head->next;
4849                 }
4850                 if (it->task_pos == &it->cur_cset->dying_tasks)
4851                         css_task_iter_advance_css_set(it);
4852         } else {
4853                 /* called from start, proceed to the first cset */
4854                 css_task_iter_advance_css_set(it);
4855         }
4856
4857         if (!it->task_pos)
4858                 return;
4859
4860         task = list_entry(it->task_pos, struct task_struct, cg_list);
4861
4862         if (it->flags & CSS_TASK_ITER_PROCS) {
4863                 /* if PROCS, skip over tasks which aren't group leaders */
4864                 if (!thread_group_leader(task))
4865                         goto repeat;
4866
4867                 /* and dying leaders w/o live member threads */
4868                 if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
4869                     !atomic_read(&task->signal->live))
4870                         goto repeat;
4871         } else {
4872                 /* skip all dying ones */
4873                 if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
4874                         goto repeat;
4875         }
4876 }
4877
4878 /**
4879  * css_task_iter_start - initiate task iteration
4880  * @css: the css to walk tasks of
4881  * @flags: CSS_TASK_ITER_* flags
4882  * @it: the task iterator to use
4883  *
4884  * Initiate iteration through the tasks of @css.  The caller can call
4885  * css_task_iter_next() to walk through the tasks until the function
4886  * returns NULL.  On completion of iteration, css_task_iter_end() must be
4887  * called.
4888  */
4889 void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
4890                          struct css_task_iter *it)
4891 {
4892         memset(it, 0, sizeof(*it));
4893
4894         spin_lock_irq(&css_set_lock);
4895
4896         it->ss = css->ss;
4897         it->flags = flags;
4898
4899         if (CGROUP_HAS_SUBSYS_CONFIG && it->ss)
4900                 it->cset_pos = &css->cgroup->e_csets[css->ss->id];
4901         else
4902                 it->cset_pos = &css->cgroup->cset_links;
4903
4904         it->cset_head = it->cset_pos;
4905
4906         css_task_iter_advance(it);
4907
4908         spin_unlock_irq(&css_set_lock);
4909 }
4910
4911 /**
4912  * css_task_iter_next - return the next task for the iterator
4913  * @it: the task iterator being iterated
4914  *
4915  * The "next" function for task iteration.  @it should have been
4916  * initialized via css_task_iter_start().  Returns NULL when the iteration
4917  * reaches the end.
4918  */
4919 struct task_struct *css_task_iter_next(struct css_task_iter *it)
4920 {
4921         if (it->cur_task) {
4922                 put_task_struct(it->cur_task);
4923                 it->cur_task = NULL;
4924         }
4925
4926         spin_lock_irq(&css_set_lock);
4927
4928         /* @it may be half-advanced by skips, finish advancing */
4929         if (it->flags & CSS_TASK_ITER_SKIPPED)
4930                 css_task_iter_advance(it);
4931
4932         if (it->task_pos) {
4933                 it->cur_task = list_entry(it->task_pos, struct task_struct,
4934                                           cg_list);
4935                 get_task_struct(it->cur_task);
4936                 css_task_iter_advance(it);
4937         }
4938
4939         spin_unlock_irq(&css_set_lock);
4940
4941         return it->cur_task;
4942 }
4943
4944 /**
4945  * css_task_iter_end - finish task iteration
4946  * @it: the task iterator to finish
4947  *
4948  * Finish task iteration started by css_task_iter_start().
4949  */
4950 void css_task_iter_end(struct css_task_iter *it)
4951 {
4952         if (it->cur_cset) {
4953                 spin_lock_irq(&css_set_lock);
4954                 list_del(&it->iters_node);
4955                 put_css_set_locked(it->cur_cset);
4956                 spin_unlock_irq(&css_set_lock);
4957         }
4958
4959         if (it->cur_dcset)
4960                 put_css_set(it->cur_dcset);
4961
4962         if (it->cur_task)
4963                 put_task_struct(it->cur_task);
4964 }
4965
4966 static void cgroup_procs_release(struct kernfs_open_file *of)
4967 {
4968         struct cgroup_file_ctx *ctx = of->priv;
4969
4970         if (ctx->procs.started)
4971                 css_task_iter_end(&ctx->procs.iter);
4972 }
4973
4974 static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
4975 {
4976         struct kernfs_open_file *of = s->private;
4977         struct cgroup_file_ctx *ctx = of->priv;
4978
4979         if (pos)
4980                 (*pos)++;
4981
4982         return css_task_iter_next(&ctx->procs.iter);
4983 }
4984
4985 static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
4986                                   unsigned int iter_flags)
4987 {
4988         struct kernfs_open_file *of = s->private;
4989         struct cgroup *cgrp = seq_css(s)->cgroup;
4990         struct cgroup_file_ctx *ctx = of->priv;
4991         struct css_task_iter *it = &ctx->procs.iter;
4992
4993         /*
4994          * When a seq_file is seeked, it's always traversed sequentially
4995          * from position 0, so we can simply keep iterating on !0 *pos.
4996          */
4997         if (!ctx->procs.started) {
4998                 if (WARN_ON_ONCE((*pos)))
4999                         return ERR_PTR(-EINVAL);
5000                 css_task_iter_start(&cgrp->self, iter_flags, it);
5001                 ctx->procs.started = true;
5002         } else if (!(*pos)) {
5003                 css_task_iter_end(it);
5004                 css_task_iter_start(&cgrp->self, iter_flags, it);
5005         } else
5006                 return it->cur_task;
5007
5008         return cgroup_procs_next(s, NULL, NULL);
5009 }
5010
5011 static void *cgroup_procs_start(struct seq_file *s, loff_t *pos)
5012 {
5013         struct cgroup *cgrp = seq_css(s)->cgroup;
5014
5015         /*
5016          * All processes of a threaded subtree belong to the domain cgroup
5017          * of the subtree.  Only threads can be distributed across the
5018          * subtree.  Reject reads on cgroup.procs in the subtree proper.
5019          * They're always empty anyway.
5020          */
5021         if (cgroup_is_threaded(cgrp))
5022                 return ERR_PTR(-EOPNOTSUPP);
5023
5024         return __cgroup_procs_start(s, pos, CSS_TASK_ITER_PROCS |
5025                                             CSS_TASK_ITER_THREADED);
5026 }
5027
/*
 * seq_file ->show() for cgroup.procs and cgroup.threads: emit the
 * task_pid_vnr() value of the task passed in @v, one per line.
 */
static int cgroup_procs_show(struct seq_file *s, void *v)
{
	struct task_struct *task = v;

	seq_printf(s, "%d\n", task_pid_vnr(task));

	return 0;
}
5033
5034 static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
5035 {
5036         int ret;
5037         struct inode *inode;
5038
5039         lockdep_assert_held(&cgroup_mutex);
5040
5041         inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
5042         if (!inode)
5043                 return -ENOMEM;
5044
5045         ret = inode_permission(&nop_mnt_idmap, inode, MAY_WRITE);
5046         iput(inode);
5047         return ret;
5048 }
5049
5050 static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
5051                                          struct cgroup *dst_cgrp,
5052                                          struct super_block *sb,
5053                                          struct cgroup_namespace *ns)
5054 {
5055         struct cgroup *com_cgrp = src_cgrp;
5056         int ret;
5057
5058         lockdep_assert_held(&cgroup_mutex);
5059
5060         /* find the common ancestor */
5061         while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
5062                 com_cgrp = cgroup_parent(com_cgrp);
5063
5064         /* %current should be authorized to migrate to the common ancestor */
5065         ret = cgroup_may_write(com_cgrp, sb);
5066         if (ret)
5067                 return ret;
5068
5069         /*
5070          * If namespaces are delegation boundaries, %current must be able
5071          * to see both source and destination cgroups from its namespace.
5072          */
5073         if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
5074             (!cgroup_is_descendant(src_cgrp, ns->root_cset->dfl_cgrp) ||
5075              !cgroup_is_descendant(dst_cgrp, ns->root_cset->dfl_cgrp)))
5076                 return -ENOENT;
5077
5078         return 0;
5079 }
5080
5081 static int cgroup_attach_permissions(struct cgroup *src_cgrp,
5082                                      struct cgroup *dst_cgrp,
5083                                      struct super_block *sb, bool threadgroup,
5084                                      struct cgroup_namespace *ns)
5085 {
5086         int ret = 0;
5087
5088         ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
5089         if (ret)
5090                 return ret;
5091
5092         ret = cgroup_migrate_vet_dst(dst_cgrp);
5093         if (ret)
5094                 return ret;
5095
5096         if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
5097                 ret = -EOPNOTSUPP;
5098
5099         return ret;
5100 }
5101
5102 static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
5103                                     bool threadgroup)
5104 {
5105         struct cgroup_file_ctx *ctx = of->priv;
5106         struct cgroup *src_cgrp, *dst_cgrp;
5107         struct task_struct *task;
5108         const struct cred *saved_cred;
5109         ssize_t ret;
5110         bool threadgroup_locked;
5111
5112         dst_cgrp = cgroup_kn_lock_live(of->kn, false);
5113         if (!dst_cgrp)
5114                 return -ENODEV;
5115
5116         task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
5117         ret = PTR_ERR_OR_ZERO(task);
5118         if (ret)
5119                 goto out_unlock;
5120
5121         /* find the source cgroup */
5122         spin_lock_irq(&css_set_lock);
5123         src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
5124         spin_unlock_irq(&css_set_lock);
5125
5126         /*
5127          * Process and thread migrations follow same delegation rule. Check
5128          * permissions using the credentials from file open to protect against
5129          * inherited fd attacks.
5130          */
5131         saved_cred = override_creds(of->file->f_cred);
5132         ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
5133                                         of->file->f_path.dentry->d_sb,
5134                                         threadgroup, ctx->ns);
5135         revert_creds(saved_cred);
5136         if (ret)
5137                 goto out_finish;
5138
5139         ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
5140
5141 out_finish:
5142         cgroup_procs_write_finish(task, threadgroup_locked);
5143 out_unlock:
5144         cgroup_kn_unlock(of->kn);
5145
5146         return ret;
5147 }
5148
5149 static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
5150                                   char *buf, size_t nbytes, loff_t off)
5151 {
5152         return __cgroup_procs_write(of, buf, true) ?: nbytes;
5153 }
5154
5155 static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
5156 {
5157         return __cgroup_procs_start(s, pos, 0);
5158 }
5159
5160 static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
5161                                     char *buf, size_t nbytes, loff_t off)
5162 {
5163         return __cgroup_procs_write(of, buf, false) ?: nbytes;
5164 }
5165
5166 /* cgroup core interface files for the default hierarchy */
5167 static struct cftype cgroup_base_files[] = {
5168         {
5169                 .name = "cgroup.type",
5170                 .flags = CFTYPE_NOT_ON_ROOT,
5171                 .seq_show = cgroup_type_show,
5172                 .write = cgroup_type_write,
5173         },
5174         {
5175                 .name = "cgroup.procs",
5176                 .flags = CFTYPE_NS_DELEGATABLE,
5177                 .file_offset = offsetof(struct cgroup, procs_file),
5178                 .release = cgroup_procs_release,
5179                 .seq_start = cgroup_procs_start,
5180                 .seq_next = cgroup_procs_next,
5181                 .seq_show = cgroup_procs_show,
5182                 .write = cgroup_procs_write,
5183         },
5184         {
5185                 .name = "cgroup.threads",
5186                 .flags = CFTYPE_NS_DELEGATABLE,
5187                 .release = cgroup_procs_release,
5188                 .seq_start = cgroup_threads_start,
5189                 .seq_next = cgroup_procs_next,
5190                 .seq_show = cgroup_procs_show,
5191                 .write = cgroup_threads_write,
5192         },
5193         {
5194                 .name = "cgroup.controllers",
5195                 .seq_show = cgroup_controllers_show,
5196         },
5197         {
5198                 .name = "cgroup.subtree_control",
5199                 .flags = CFTYPE_NS_DELEGATABLE,
5200                 .seq_show = cgroup_subtree_control_show,
5201                 .write = cgroup_subtree_control_write,
5202         },
5203         {
5204                 .name = "cgroup.events",
5205                 .flags = CFTYPE_NOT_ON_ROOT,
5206                 .file_offset = offsetof(struct cgroup, events_file),
5207                 .seq_show = cgroup_events_show,
5208         },
5209         {
5210                 .name = "cgroup.max.descendants",
5211                 .seq_show = cgroup_max_descendants_show,
5212                 .write = cgroup_max_descendants_write,
5213         },
5214         {
5215                 .name = "cgroup.max.depth",
5216                 .seq_show = cgroup_max_depth_show,
5217                 .write = cgroup_max_depth_write,
5218         },
5219         {
5220                 .name = "cgroup.stat",
5221                 .seq_show = cgroup_stat_show,
5222         },
5223         {
5224                 .name = "cgroup.freeze",
5225                 .flags = CFTYPE_NOT_ON_ROOT,
5226                 .seq_show = cgroup_freeze_show,
5227                 .write = cgroup_freeze_write,
5228         },
5229         {
5230                 .name = "cgroup.kill",
5231                 .flags = CFTYPE_NOT_ON_ROOT,
5232                 .write = cgroup_kill_write,
5233         },
5234         {
5235                 .name = "cpu.stat",
5236                 .seq_show = cpu_stat_show,
5237         },
5238         { }     /* terminate */
5239 };
5240
5241 static struct cftype cgroup_psi_files[] = {
5242 #ifdef CONFIG_PSI
5243         {
5244                 .name = "io.pressure",
5245                 .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
5246                 .open = cgroup_pressure_open,
5247                 .seq_show = cgroup_io_pressure_show,
5248                 .write = cgroup_io_pressure_write,
5249                 .poll = cgroup_pressure_poll,
5250                 .release = cgroup_pressure_release,
5251         },
5252         {
5253                 .name = "memory.pressure",
5254                 .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
5255                 .open = cgroup_pressure_open,
5256                 .seq_show = cgroup_memory_pressure_show,
5257                 .write = cgroup_memory_pressure_write,
5258                 .poll = cgroup_pressure_poll,
5259                 .release = cgroup_pressure_release,
5260         },
5261         {
5262                 .name = "cpu.pressure",
5263                 .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
5264                 .open = cgroup_pressure_open,
5265                 .seq_show = cgroup_cpu_pressure_show,
5266                 .write = cgroup_cpu_pressure_write,
5267                 .poll = cgroup_pressure_poll,
5268                 .release = cgroup_pressure_release,
5269         },
5270 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
5271         {
5272                 .name = "irq.pressure",
5273                 .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
5274                 .open = cgroup_pressure_open,
5275                 .seq_show = cgroup_irq_pressure_show,
5276                 .write = cgroup_irq_pressure_write,
5277                 .poll = cgroup_pressure_poll,
5278                 .release = cgroup_pressure_release,
5279         },
5280 #endif
5281         {
5282                 .name = "cgroup.pressure",
5283                 .seq_show = cgroup_pressure_show,
5284                 .write = cgroup_pressure_write,
5285         },
5286 #endif /* CONFIG_PSI */
5287         { }     /* terminate */
5288 };
5289
5290 /*
5291  * css destruction is four-stage process.
5292  *
5293  * 1. Destruction starts.  Killing of the percpu_ref is initiated.
5294  *    Implemented in kill_css().
5295  *
5296  * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs
5297  *    and thus css_tryget_online() is guaranteed to fail, the css can be
5298  *    offlined by invoking offline_css().  After offlining, the base ref is
5299  *    put.  Implemented in css_killed_work_fn().
5300  *
5301  * 3. When the percpu_ref reaches zero, the only possible remaining
5302  *    accessors are inside RCU read sections.  css_release() schedules the
5303  *    RCU callback.
5304  *
5305  * 4. After the grace period, the css can be freed.  Implemented in
5306  *    css_free_work_fn().
5307  *
5308  * It is actually hairier because both step 2 and 4 require process context
5309  * and thus involve punting to css->destroy_work adding two additional
5310  * steps to the already complex sequence.
5311  */
5312 static void css_free_rwork_fn(struct work_struct *work)
5313 {
5314         struct cgroup_subsys_state *css = container_of(to_rcu_work(work),
5315                                 struct cgroup_subsys_state, destroy_rwork);
5316         struct cgroup_subsys *ss = css->ss;
5317         struct cgroup *cgrp = css->cgroup;
5318
5319         percpu_ref_exit(&css->refcnt);
5320
5321         if (ss) {
5322                 /* css free path */
5323                 struct cgroup_subsys_state *parent = css->parent;
5324                 int id = css->id;
5325
5326                 ss->css_free(css);
5327                 cgroup_idr_remove(&ss->css_idr, id);
5328                 cgroup_put(cgrp);
5329
5330                 if (parent)
5331                         css_put(parent);
5332         } else {
5333                 /* cgroup free path */
5334                 atomic_dec(&cgrp->root->nr_cgrps);
5335                 cgroup1_pidlist_destroy_all(cgrp);
5336                 cancel_work_sync(&cgrp->release_agent_work);
5337                 bpf_cgrp_storage_free(cgrp);
5338
5339                 if (cgroup_parent(cgrp)) {
5340                         /*
5341                          * We get a ref to the parent, and put the ref when
5342                          * this cgroup is being freed, so it's guaranteed
5343                          * that the parent won't be destroyed before its
5344                          * children.
5345                          */
5346                         cgroup_put(cgroup_parent(cgrp));
5347                         kernfs_put(cgrp->kn);
5348                         psi_cgroup_free(cgrp);
5349                         cgroup_rstat_exit(cgrp);
5350                         kfree(cgrp);
5351                 } else {
5352                         /*
5353                          * This is root cgroup's refcnt reaching zero,
5354                          * which indicates that the root should be
5355                          * released.
5356                          */
5357                         cgroup_destroy_root(cgrp->root);
5358                 }
5359         }
5360 }
5361
5362 static void css_release_work_fn(struct work_struct *work)
5363 {
5364         struct cgroup_subsys_state *css =
5365                 container_of(work, struct cgroup_subsys_state, destroy_work);
5366         struct cgroup_subsys *ss = css->ss;
5367         struct cgroup *cgrp = css->cgroup;
5368
5369         cgroup_lock();
5370
5371         css->flags |= CSS_RELEASED;
5372         list_del_rcu(&css->sibling);
5373
5374         if (ss) {
5375                 /* css release path */
5376                 if (!list_empty(&css->rstat_css_node)) {
5377                         cgroup_rstat_flush(cgrp);
5378                         list_del_rcu(&css->rstat_css_node);
5379                 }
5380
5381                 cgroup_idr_replace(&ss->css_idr, NULL, css->id);
5382                 if (ss->css_released)
5383                         ss->css_released(css);
5384         } else {
5385                 struct cgroup *tcgrp;
5386
5387                 /* cgroup release path */
5388                 TRACE_CGROUP_PATH(release, cgrp);
5389
5390                 cgroup_rstat_flush(cgrp);
5391
5392                 spin_lock_irq(&css_set_lock);
5393                 for (tcgrp = cgroup_parent(cgrp); tcgrp;
5394                      tcgrp = cgroup_parent(tcgrp))
5395                         tcgrp->nr_dying_descendants--;
5396                 spin_unlock_irq(&css_set_lock);
5397
5398                 /*
5399                  * There are two control paths which try to determine
5400                  * cgroup from dentry without going through kernfs -
5401                  * cgroupstats_build() and css_tryget_online_from_dir().
5402                  * Those are supported by RCU protecting clearing of
5403                  * cgrp->kn->priv backpointer.
5404                  */
5405                 if (cgrp->kn)
5406                         RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
5407                                          NULL);
5408         }
5409
5410         cgroup_unlock();
5411
5412         INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5413         queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5414 }
5415
5416 static void css_release(struct percpu_ref *ref)
5417 {
5418         struct cgroup_subsys_state *css =
5419                 container_of(ref, struct cgroup_subsys_state, refcnt);
5420
5421         INIT_WORK(&css->destroy_work, css_release_work_fn);
5422         queue_work(cgroup_destroy_wq, &css->destroy_work);
5423 }
5424
5425 static void init_and_link_css(struct cgroup_subsys_state *css,
5426                               struct cgroup_subsys *ss, struct cgroup *cgrp)
5427 {
5428         lockdep_assert_held(&cgroup_mutex);
5429
5430         cgroup_get_live(cgrp);
5431
5432         memset(css, 0, sizeof(*css));
5433         css->cgroup = cgrp;
5434         css->ss = ss;
5435         css->id = -1;
5436         INIT_LIST_HEAD(&css->sibling);
5437         INIT_LIST_HEAD(&css->children);
5438         INIT_LIST_HEAD(&css->rstat_css_node);
5439         css->serial_nr = css_serial_nr_next++;
5440         atomic_set(&css->online_cnt, 0);
5441
5442         if (cgroup_parent(cgrp)) {
5443                 css->parent = cgroup_css(cgroup_parent(cgrp), ss);
5444                 css_get(css->parent);
5445         }
5446
5447         if (ss->css_rstat_flush)
5448                 list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
5449
5450         BUG_ON(cgroup_css(cgrp, ss));
5451 }
5452
5453 /* invoke ->css_online() on a new CSS and mark it online if successful */
5454 static int online_css(struct cgroup_subsys_state *css)
5455 {
5456         struct cgroup_subsys *ss = css->ss;
5457         int ret = 0;
5458
5459         lockdep_assert_held(&cgroup_mutex);
5460
5461         if (ss->css_online)
5462                 ret = ss->css_online(css);
5463         if (!ret) {
5464                 css->flags |= CSS_ONLINE;
5465                 rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
5466
5467                 atomic_inc(&css->online_cnt);
5468                 if (css->parent)
5469                         atomic_inc(&css->parent->online_cnt);
5470         }
5471         return ret;
5472 }
5473
5474 /* if the CSS is online, invoke ->css_offline() on it and mark it offline */
5475 static void offline_css(struct cgroup_subsys_state *css)
5476 {
5477         struct cgroup_subsys *ss = css->ss;
5478
5479         lockdep_assert_held(&cgroup_mutex);
5480
5481         if (!(css->flags & CSS_ONLINE))
5482                 return;
5483
5484         if (ss->css_offline)
5485                 ss->css_offline(css);
5486
5487         css->flags &= ~CSS_ONLINE;
5488         RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL);
5489
5490         wake_up_all(&css->cgroup->offline_waitq);
5491 }
5492
5493 /**
5494  * css_create - create a cgroup_subsys_state
5495  * @cgrp: the cgroup new css will be associated with
5496  * @ss: the subsys of new css
5497  *
5498  * Create a new css associated with @cgrp - @ss pair.  On success, the new
5499  * css is online and installed in @cgrp.  This function doesn't create the
5500  * interface files.  Returns 0 on success, -errno on failure.
5501  */
5502 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
5503                                               struct cgroup_subsys *ss)
5504 {
5505         struct cgroup *parent = cgroup_parent(cgrp);
5506         struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
5507         struct cgroup_subsys_state *css;
5508         int err;
5509
5510         lockdep_assert_held(&cgroup_mutex);
5511
5512         css = ss->css_alloc(parent_css);
5513         if (!css)
5514                 css = ERR_PTR(-ENOMEM);
5515         if (IS_ERR(css))
5516                 return css;
5517
5518         init_and_link_css(css, ss, cgrp);
5519
5520         err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
5521         if (err)
5522                 goto err_free_css;
5523
5524         err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
5525         if (err < 0)
5526                 goto err_free_css;
5527         css->id = err;
5528
5529         /* @css is ready to be brought online now, make it visible */
5530         list_add_tail_rcu(&css->sibling, &parent_css->children);
5531         cgroup_idr_replace(&ss->css_idr, css, css->id);
5532
5533         err = online_css(css);
5534         if (err)
5535                 goto err_list_del;
5536
5537         return css;
5538
5539 err_list_del:
5540         list_del_rcu(&css->sibling);
5541 err_free_css:
5542         list_del_rcu(&css->rstat_css_node);
5543         INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn);
5544         queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork);
5545         return ERR_PTR(err);
5546 }
5547
5548 /*
5549  * The returned cgroup is fully initialized including its control mask, but
5550  * it isn't associated with its kernfs_node and doesn't have the control
5551  * mask applied.
5552  */
5553 static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
5554                                     umode_t mode)
5555 {
5556         struct cgroup_root *root = parent->root;
5557         struct cgroup *cgrp, *tcgrp;
5558         struct kernfs_node *kn;
5559         int level = parent->level + 1;
5560         int ret;
5561
5562         /* allocate the cgroup and its ID, 0 is reserved for the root */
5563         cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
5564         if (!cgrp)
5565                 return ERR_PTR(-ENOMEM);
5566
5567         ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
5568         if (ret)
5569                 goto out_free_cgrp;
5570
5571         ret = cgroup_rstat_init(cgrp);
5572         if (ret)
5573                 goto out_cancel_ref;
5574
5575         /* create the directory */
5576         kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
5577         if (IS_ERR(kn)) {
5578                 ret = PTR_ERR(kn);
5579                 goto out_stat_exit;
5580         }
5581         cgrp->kn = kn;
5582
5583         init_cgroup_housekeeping(cgrp);
5584
5585         cgrp->self.parent = &parent->self;
5586         cgrp->root = root;
5587         cgrp->level = level;
5588
5589         ret = psi_cgroup_alloc(cgrp);
5590         if (ret)
5591                 goto out_kernfs_remove;
5592
5593         ret = cgroup_bpf_inherit(cgrp);
5594         if (ret)
5595                 goto out_psi_free;
5596
5597         /*
5598          * New cgroup inherits effective freeze counter, and
5599          * if the parent has to be frozen, the child has too.
5600          */
5601         cgrp->freezer.e_freeze = parent->freezer.e_freeze;
5602         if (cgrp->freezer.e_freeze) {
5603                 /*
5604                  * Set the CGRP_FREEZE flag, so when a process will be
5605                  * attached to the child cgroup, it will become frozen.
5606                  * At this point the new cgroup is unpopulated, so we can
5607                  * consider it frozen immediately.
5608                  */
5609                 set_bit(CGRP_FREEZE, &cgrp->flags);
5610                 set_bit(CGRP_FROZEN, &cgrp->flags);
5611         }
5612
5613         spin_lock_irq(&css_set_lock);
5614         for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5615                 cgrp->ancestors[tcgrp->level] = tcgrp;
5616
5617                 if (tcgrp != cgrp) {
5618                         tcgrp->nr_descendants++;
5619
5620                         /*
5621                          * If the new cgroup is frozen, all ancestor cgroups
5622                          * get a new frozen descendant, but their state can't
5623                          * change because of this.
5624                          */
5625                         if (cgrp->freezer.e_freeze)
5626                                 tcgrp->freezer.nr_frozen_descendants++;
5627                 }
5628         }
5629         spin_unlock_irq(&css_set_lock);
5630
5631         if (notify_on_release(parent))
5632                 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
5633
5634         if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags))
5635                 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags);
5636
5637         cgrp->self.serial_nr = css_serial_nr_next++;
5638
5639         /* allocation complete, commit to creation */
5640         list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children);
5641         atomic_inc(&root->nr_cgrps);
5642         cgroup_get_live(parent);
5643
5644         /*
5645          * On the default hierarchy, a child doesn't automatically inherit
5646          * subtree_control from the parent.  Each is configured manually.
5647          */
5648         if (!cgroup_on_dfl(cgrp))
5649                 cgrp->subtree_control = cgroup_control(cgrp);
5650
5651         cgroup_propagate_control(cgrp);
5652
5653         return cgrp;
5654
5655 out_psi_free:
5656         psi_cgroup_free(cgrp);
5657 out_kernfs_remove:
5658         kernfs_remove(cgrp->kn);
5659 out_stat_exit:
5660         cgroup_rstat_exit(cgrp);
5661 out_cancel_ref:
5662         percpu_ref_exit(&cgrp->self.refcnt);
5663 out_free_cgrp:
5664         kfree(cgrp);
5665         return ERR_PTR(ret);
5666 }
5667
5668 static bool cgroup_check_hierarchy_limits(struct cgroup *parent)
5669 {
5670         struct cgroup *cgroup;
5671         int ret = false;
5672         int level = 1;
5673
5674         lockdep_assert_held(&cgroup_mutex);
5675
5676         for (cgroup = parent; cgroup; cgroup = cgroup_parent(cgroup)) {
5677                 if (cgroup->nr_descendants >= cgroup->max_descendants)
5678                         goto fail;
5679
5680                 if (level > cgroup->max_depth)
5681                         goto fail;
5682
5683                 level++;
5684         }
5685
5686         ret = true;
5687 fail:
5688         return ret;
5689 }
5690
5691 int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
5692 {
5693         struct cgroup *parent, *cgrp;
5694         int ret;
5695
5696         /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
5697         if (strchr(name, '\n'))
5698                 return -EINVAL;
5699
5700         parent = cgroup_kn_lock_live(parent_kn, false);
5701         if (!parent)
5702                 return -ENODEV;
5703
5704         if (!cgroup_check_hierarchy_limits(parent)) {
5705                 ret = -EAGAIN;
5706                 goto out_unlock;
5707         }
5708
5709         cgrp = cgroup_create(parent, name, mode);
5710         if (IS_ERR(cgrp)) {
5711                 ret = PTR_ERR(cgrp);
5712                 goto out_unlock;
5713         }
5714
5715         /*
5716          * This extra ref will be put in cgroup_free_fn() and guarantees
5717          * that @cgrp->kn is always accessible.
5718          */
5719         kernfs_get(cgrp->kn);
5720
5721         ret = cgroup_kn_set_ugid(cgrp->kn);
5722         if (ret)
5723                 goto out_destroy;
5724
5725         ret = css_populate_dir(&cgrp->self);
5726         if (ret)
5727                 goto out_destroy;
5728
5729         ret = cgroup_apply_control_enable(cgrp);
5730         if (ret)
5731                 goto out_destroy;
5732
5733         TRACE_CGROUP_PATH(mkdir, cgrp);
5734
5735         /* let's create and online css's */
5736         kernfs_activate(cgrp->kn);
5737
5738         ret = 0;
5739         goto out_unlock;
5740
5741 out_destroy:
5742         cgroup_destroy_locked(cgrp);
5743 out_unlock:
5744         cgroup_kn_unlock(parent_kn);
5745         return ret;
5746 }
5747
5748 /*
5749  * This is called when the refcnt of a css is confirmed to be killed.
5750  * css_tryget_online() is now guaranteed to fail.  Tell the subsystem to
5751  * initiate destruction and put the css ref from kill_css().
5752  */
5753 static void css_killed_work_fn(struct work_struct *work)
5754 {
5755         struct cgroup_subsys_state *css =
5756                 container_of(work, struct cgroup_subsys_state, destroy_work);
5757
5758         cgroup_lock();
5759
5760         do {
5761                 offline_css(css);
5762                 css_put(css);
5763                 /* @css can't go away while we're holding cgroup_mutex */
5764                 css = css->parent;
5765         } while (css && atomic_dec_and_test(&css->online_cnt));
5766
5767         cgroup_unlock();
5768 }
5769
5770 /* css kill confirmation processing requires process context, bounce */
5771 static void css_killed_ref_fn(struct percpu_ref *ref)
5772 {
5773         struct cgroup_subsys_state *css =
5774                 container_of(ref, struct cgroup_subsys_state, refcnt);
5775
5776         if (atomic_dec_and_test(&css->online_cnt)) {
5777                 INIT_WORK(&css->destroy_work, css_killed_work_fn);
5778                 queue_work(cgroup_destroy_wq, &css->destroy_work);
5779         }
5780 }
5781
5782 /**
5783  * kill_css - destroy a css
5784  * @css: css to destroy
5785  *
5786  * This function initiates destruction of @css by removing cgroup interface
5787  * files and putting its base reference.  ->css_offline() will be invoked
5788  * asynchronously once css_tryget_online() is guaranteed to fail and when
5789  * the reference count reaches zero, @css will be released.
5790  */
5791 static void kill_css(struct cgroup_subsys_state *css)
5792 {
5793         lockdep_assert_held(&cgroup_mutex);
5794
5795         if (css->flags & CSS_DYING)
5796                 return;
5797
5798         css->flags |= CSS_DYING;
5799
5800         /*
5801          * This must happen before css is disassociated with its cgroup.
5802          * See seq_css() for details.
5803          */
5804         css_clear_dir(css);
5805
5806         /*
5807          * Killing would put the base ref, but we need to keep it alive
5808          * until after ->css_offline().
5809          */
5810         css_get(css);
5811
5812         /*
5813          * cgroup core guarantees that, by the time ->css_offline() is
5814          * invoked, no new css reference will be given out via
5815          * css_tryget_online().  We can't simply call percpu_ref_kill() and
5816          * proceed to offlining css's because percpu_ref_kill() doesn't
5817          * guarantee that the ref is seen as killed on all CPUs on return.
5818          *
5819          * Use percpu_ref_kill_and_confirm() to get notifications as each
5820          * css is confirmed to be seen as killed on all CPUs.
5821          */
5822         percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn);
5823 }
5824
5825 /**
5826  * cgroup_destroy_locked - the first stage of cgroup destruction
5827  * @cgrp: cgroup to be destroyed
5828  *
5829  * css's make use of percpu refcnts whose killing latency shouldn't be
5830  * exposed to userland and are RCU protected.  Also, cgroup core needs to
5831  * guarantee that css_tryget_online() won't succeed by the time
5832  * ->css_offline() is invoked.  To satisfy all the requirements,
5833  * destruction is implemented in the following two steps.
5834  *
5835  * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
5836  *     userland visible parts and start killing the percpu refcnts of
5837  *     css's.  Set up so that the next stage will be kicked off once all
5838  *     the percpu refcnts are confirmed to be killed.
5839  *
5840  * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
5841  *     rest of destruction.  Once all cgroup references are gone, the
5842  *     cgroup is RCU-freed.
5843  *
5844  * This function implements s1.  After this step, @cgrp is gone as far as
5845  * the userland is concerned and a new cgroup with the same name may be
5846  * created.  As cgroup doesn't care about the names internally, this
5847  * doesn't cause any problem.
5848  */
5849 static int cgroup_destroy_locked(struct cgroup *cgrp)
5850         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
5851 {
5852         struct cgroup *tcgrp, *parent = cgroup_parent(cgrp);
5853         struct cgroup_subsys_state *css;
5854         struct cgrp_cset_link *link;
5855         int ssid;
5856
5857         lockdep_assert_held(&cgroup_mutex);
5858
5859         /*
5860          * Only migration can raise populated from zero and we're already
5861          * holding cgroup_mutex.
5862          */
5863         if (cgroup_is_populated(cgrp))
5864                 return -EBUSY;
5865
5866         /*
5867          * Make sure there's no live children.  We can't test emptiness of
5868          * ->self.children as dead children linger on it while being
5869          * drained; otherwise, "rmdir parent/child parent" may fail.
5870          */
5871         if (css_has_online_children(&cgrp->self))
5872                 return -EBUSY;
5873
5874         /*
5875          * Mark @cgrp and the associated csets dead.  The former prevents
5876          * further task migration and child creation by disabling
5877          * cgroup_lock_live_group().  The latter makes the csets ignored by
5878          * the migration path.
5879          */
5880         cgrp->self.flags &= ~CSS_ONLINE;
5881
5882         spin_lock_irq(&css_set_lock);
5883         list_for_each_entry(link, &cgrp->cset_links, cset_link)
5884                 link->cset->dead = true;
5885         spin_unlock_irq(&css_set_lock);
5886
5887         /* initiate massacre of all css's */
5888         for_each_css(css, ssid, cgrp)
5889                 kill_css(css);
5890
5891         /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */
5892         css_clear_dir(&cgrp->self);
5893         kernfs_remove(cgrp->kn);
5894
5895         if (cgroup_is_threaded(cgrp))
5896                 parent->nr_threaded_children--;
5897
5898         spin_lock_irq(&css_set_lock);
5899         for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) {
5900                 tcgrp->nr_descendants--;
5901                 tcgrp->nr_dying_descendants++;
5902                 /*
5903                  * If the dying cgroup is frozen, decrease frozen descendants
5904                  * counters of ancestor cgroups.
5905                  */
5906                 if (test_bit(CGRP_FROZEN, &cgrp->flags))
5907                         tcgrp->freezer.nr_frozen_descendants--;
5908         }
5909         spin_unlock_irq(&css_set_lock);
5910
5911         cgroup1_check_for_release(parent);
5912
5913         cgroup_bpf_offline(cgrp);
5914
5915         /* put the base reference */
5916         percpu_ref_kill(&cgrp->self.refcnt);
5917
5918         return 0;
5919 };
5920
5921 int cgroup_rmdir(struct kernfs_node *kn)
5922 {
5923         struct cgroup *cgrp;
5924         int ret = 0;
5925
5926         cgrp = cgroup_kn_lock_live(kn, false);
5927         if (!cgrp)
5928                 return 0;
5929
5930         ret = cgroup_destroy_locked(cgrp);
5931         if (!ret)
5932                 TRACE_CGROUP_PATH(rmdir, cgrp);
5933
5934         cgroup_kn_unlock(kn);
5935         return ret;
5936 }
5937
5938 static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
5939         .show_options           = cgroup_show_options,
5940         .mkdir                  = cgroup_mkdir,
5941         .rmdir                  = cgroup_rmdir,
5942         .show_path              = cgroup_show_path,
5943 };
5944
5945 static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
5946 {
5947         struct cgroup_subsys_state *css;
5948
5949         pr_debug("Initializing cgroup subsys %s\n", ss->name);
5950
5951         cgroup_lock();
5952
5953         idr_init(&ss->css_idr);
5954         INIT_LIST_HEAD(&ss->cfts);
5955
5956         /* Create the root cgroup state for this subsystem */
5957         ss->root = &cgrp_dfl_root;
5958         css = ss->css_alloc(NULL);
5959         /* We don't handle early failures gracefully */
5960         BUG_ON(IS_ERR(css));
5961         init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
5962
5963         /*
5964          * Root csses are never destroyed and we can't initialize
5965          * percpu_ref during early init.  Disable refcnting.
5966          */
5967         css->flags |= CSS_NO_REF;
5968
5969         if (early) {
5970                 /* allocation can't be done safely during early init */
5971                 css->id = 1;
5972         } else {
5973                 css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2, GFP_KERNEL);
5974                 BUG_ON(css->id < 0);
5975         }
5976
5977         /* Update the init_css_set to contain a subsys
5978          * pointer to this state - since the subsystem is
5979          * newly registered, all tasks and hence the
5980          * init_css_set is in the subsystem's root cgroup. */
5981         init_css_set.subsys[ss->id] = css;
5982
5983         have_fork_callback |= (bool)ss->fork << ss->id;
5984         have_exit_callback |= (bool)ss->exit << ss->id;
5985         have_release_callback |= (bool)ss->release << ss->id;
5986         have_canfork_callback |= (bool)ss->can_fork << ss->id;
5987
5988         /* At system boot, before all subsystems have been
5989          * registered, no tasks have been forked, so we don't
5990          * need to invoke fork callbacks here. */
5991         BUG_ON(!list_empty(&init_task.tasks));
5992
5993         BUG_ON(online_css(css));
5994
5995         cgroup_unlock();
5996 }
5997
5998 /**
5999  * cgroup_init_early - cgroup initialization at system boot
6000  *
6001  * Initialize cgroups at system boot, and initialize any
6002  * subsystems that request early init.
6003  */
6004 int __init cgroup_init_early(void)
6005 {
6006         static struct cgroup_fs_context __initdata ctx;
6007         struct cgroup_subsys *ss;
6008         int i;
6009
6010         ctx.root = &cgrp_dfl_root;
6011         init_cgroup_root(&ctx);
6012         cgrp_dfl_root.cgrp.self.flags |= CSS_NO_REF;
6013
6014         RCU_INIT_POINTER(init_task.cgroups, &init_css_set);
6015
6016         for_each_subsys(ss, i) {
6017                 WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id,
6018                      "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n",
6019                      i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free,
6020                      ss->id, ss->name);
6021                 WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN,
6022                      "cgroup_subsys_name %s too long\n", cgroup_subsys_name[i]);
6023
6024                 ss->id = i;
6025                 ss->name = cgroup_subsys_name[i];
6026                 if (!ss->legacy_name)
6027                         ss->legacy_name = cgroup_subsys_name[i];
6028
6029                 if (ss->early_init)
6030                         cgroup_init_subsys(ss, true);
6031         }
6032         return 0;
6033 }
6034
/**
 * cgroup_init - cgroup initialization
 *
 * Register cgroup filesystem and /proc file, and initialize
 * any subsystems that didn't request early init.
 *
 * For every enabled subsystem this also updates the default root's
 * subsys_mask and the implicit/inhibit/threaded masks, registers the
 * subsystem's cftypes and populates its files in the default root.
 *
 * Return: always 0.  Failures are reported through BUG_ON()/WARN_ON().
 */
int __init cgroup_init(void)
{
        struct cgroup_subsys *ss;
        int ssid;

        /* subsystem masks are limited to 16 bits */
        BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
        BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));

        cgroup_rstat_boot();

        get_user_ns(init_cgroup_ns.user_ns);

        cgroup_lock();

        /*
         * Add init_css_set to the hash table so that dfl_root can link to
         * it during init.
         */
        hash_add(css_set_table, &init_css_set.hlist,
                 css_set_hash(init_css_set.subsys));

        BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0));

        cgroup_unlock();

        for_each_subsys(ss, ssid) {
                if (ss->early_init) {
                        /*
                         * Early-init subsystems got a hard-coded css id in
                         * cgroup_init_subsys(); allocate the real one now.
                         */
                        struct cgroup_subsys_state *css =
                                init_css_set.subsys[ss->id];

                        css->id = cgroup_idr_alloc(&ss->css_idr, css, 1, 2,
                                                   GFP_KERNEL);
                        BUG_ON(css->id < 0);
                } else {
                        cgroup_init_subsys(ss, false);
                }

                list_add_tail(&init_css_set.e_cset_node[ssid],
                              &cgrp_dfl_root.cgrp.e_csets[ssid]);

                /*
                 * Setting dfl_root subsys_mask needs to consider the
                 * disabled flag and cftype registration needs kmalloc,
                 * both of which aren't available during early_init.
                 */
                if (!cgroup_ssid_enabled(ssid))
                        continue;

                if (cgroup1_ssid_disabled(ssid))
                        printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
                               ss->name);

                cgrp_dfl_root.subsys_mask |= 1 << ss->id;

                /* implicit controllers must be threaded too */
                WARN_ON(ss->implicit_on_dfl && !ss->threaded);

                if (ss->implicit_on_dfl)
                        cgrp_dfl_implicit_ss_mask |= 1 << ss->id;
                else if (!ss->dfl_cftypes)
                        cgrp_dfl_inhibit_ss_mask |= 1 << ss->id;

                if (ss->threaded)
                        cgrp_dfl_threaded_ss_mask |= 1 << ss->id;

                /* one shared cftype array is registered once for both */
                if (ss->dfl_cftypes == ss->legacy_cftypes) {
                        WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
                } else {
                        WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
                        WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
                }

                if (ss->bind)
                        ss->bind(init_css_set.subsys[ssid]);

                /* NOTE(review): css_populate_dir()'s result is ignored here */
                cgroup_lock();
                css_populate_dir(init_css_set.subsys[ssid]);
                cgroup_unlock();
        }

        /* init_css_set.subsys[] has been updated, re-hash */
        hash_del(&init_css_set.hlist);
        hash_add(css_set_table, &init_css_set.hlist,
                 css_set_hash(init_css_set.subsys));

        WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
        WARN_ON(register_filesystem(&cgroup_fs_type));
        WARN_ON(register_filesystem(&cgroup2_fs_type));
        WARN_ON(!proc_create_single("cgroups", 0, NULL, proc_cgroupstats_show));
#ifdef CONFIG_CPUSETS
        WARN_ON(register_filesystem(&cpuset_fs_type));
#endif

        return 0;
}
6138
6139 static int __init cgroup_wq_init(void)
6140 {
6141         /*
6142          * There isn't much point in executing destruction path in
6143          * parallel.  Good chunk is serialized with cgroup_mutex anyway.
6144          * Use 1 for @max_active.
6145          *
6146          * We would prefer to do this in cgroup_init() above, but that
6147          * is called before init_workqueues(): so leave this until after.
6148          */
6149         cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
6150         BUG_ON(!cgroup_destroy_wq);
6151         return 0;
6152 }
6153 core_initcall(cgroup_wq_init);
6154
6155 void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
6156 {
6157         struct kernfs_node *kn;
6158
6159         kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
6160         if (!kn)
6161                 return;
6162         kernfs_path(kn, buf, buflen);
6163         kernfs_put(kn);
6164 }
6165
6166 /*
6167  * cgroup_get_from_id : get the cgroup associated with cgroup id
6168  * @id: cgroup id
6169  * On success return the cgrp or ERR_PTR on failure
6170  * Only cgroups within current task's cgroup NS are valid.
6171  */
6172 struct cgroup *cgroup_get_from_id(u64 id)
6173 {
6174         struct kernfs_node *kn;
6175         struct cgroup *cgrp, *root_cgrp;
6176
6177         kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
6178         if (!kn)
6179                 return ERR_PTR(-ENOENT);
6180
6181         if (kernfs_type(kn) != KERNFS_DIR) {
6182                 kernfs_put(kn);
6183                 return ERR_PTR(-ENOENT);
6184         }
6185
6186         rcu_read_lock();
6187
6188         cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
6189         if (cgrp && !cgroup_tryget(cgrp))
6190                 cgrp = NULL;
6191
6192         rcu_read_unlock();
6193         kernfs_put(kn);
6194
6195         if (!cgrp)
6196                 return ERR_PTR(-ENOENT);
6197
6198         root_cgrp = current_cgns_cgroup_dfl();
6199         if (!cgroup_is_descendant(cgrp, root_cgrp)) {
6200                 cgroup_put(cgrp);
6201                 return ERR_PTR(-ENOENT);
6202         }
6203
6204         return cgrp;
6205 }
6206 EXPORT_SYMBOL_GPL(cgroup_get_from_id);
6207
6208 /*
6209  * proc_cgroup_show()
6210  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
6211  *  - Used for /proc/<pid>/cgroup.
6212  */
6213 int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
6214                      struct pid *pid, struct task_struct *tsk)
6215 {
6216         char *buf;
6217         int retval;
6218         struct cgroup_root *root;
6219
6220         retval = -ENOMEM;
6221         buf = kmalloc(PATH_MAX, GFP_KERNEL);
6222         if (!buf)
6223                 goto out;
6224
6225         cgroup_lock();
6226         spin_lock_irq(&css_set_lock);
6227
6228         for_each_root(root) {
6229                 struct cgroup_subsys *ss;
6230                 struct cgroup *cgrp;
6231                 int ssid, count = 0;
6232
6233                 if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
6234                         continue;
6235
6236                 seq_printf(m, "%d:", root->hierarchy_id);
6237                 if (root != &cgrp_dfl_root)
6238                         for_each_subsys(ss, ssid)
6239                                 if (root->subsys_mask & (1 << ssid))
6240                                         seq_printf(m, "%s%s", count++ ? "," : "",
6241                                                    ss->legacy_name);
6242                 if (strlen(root->name))
6243                         seq_printf(m, "%sname=%s", count ? "," : "",
6244                                    root->name);
6245                 seq_putc(m, ':');
6246
6247                 cgrp = task_cgroup_from_root(tsk, root);
6248
6249                 /*
6250                  * On traditional hierarchies, all zombie tasks show up as
6251                  * belonging to the root cgroup.  On the default hierarchy,
6252                  * while a zombie doesn't show up in "cgroup.procs" and
6253                  * thus can't be migrated, its /proc/PID/cgroup keeps
6254                  * reporting the cgroup it belonged to before exiting.  If
6255                  * the cgroup is removed before the zombie is reaped,
6256                  * " (deleted)" is appended to the cgroup path.
6257                  */
6258                 if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
6259                         retval = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
6260                                                 current->nsproxy->cgroup_ns);
6261                         if (retval >= PATH_MAX)
6262                                 retval = -ENAMETOOLONG;
6263                         if (retval < 0)
6264                                 goto out_unlock;
6265
6266                         seq_puts(m, buf);
6267                 } else {
6268                         seq_puts(m, "/");
6269                 }
6270
6271                 if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
6272                         seq_puts(m, " (deleted)\n");
6273                 else
6274                         seq_putc(m, '\n');
6275         }
6276
6277         retval = 0;
6278 out_unlock:
6279         spin_unlock_irq(&css_set_lock);
6280         cgroup_unlock();
6281         kfree(buf);
6282 out:
6283         return retval;
6284 }
6285
6286 /**
6287  * cgroup_fork - initialize cgroup related fields during copy_process()
6288  * @child: pointer to task_struct of forking parent process.
6289  *
6290  * A task is associated with the init_css_set until cgroup_post_fork()
6291  * attaches it to the target css_set.
6292  */
6293 void cgroup_fork(struct task_struct *child)
6294 {
6295         RCU_INIT_POINTER(child->cgroups, &init_css_set);
6296         INIT_LIST_HEAD(&child->cg_list);
6297 }
6298
6299 /**
6300  * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer
6301  * @f: file corresponding to cgroup_dir
6302  *
6303  * Find the cgroup from a file pointer associated with a cgroup directory.
6304  * Returns a pointer to the cgroup on success. ERR_PTR is returned if the
6305  * cgroup cannot be found.
6306  */
6307 static struct cgroup *cgroup_v1v2_get_from_file(struct file *f)
6308 {
6309         struct cgroup_subsys_state *css;
6310
6311         css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
6312         if (IS_ERR(css))
6313                 return ERR_CAST(css);
6314
6315         return css->cgroup;
6316 }
6317
6318 /**
6319  * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports
6320  * cgroup2.
6321  * @f: file corresponding to cgroup2_dir
6322  */
6323 static struct cgroup *cgroup_get_from_file(struct file *f)
6324 {
6325         struct cgroup *cgrp = cgroup_v1v2_get_from_file(f);
6326
6327         if (IS_ERR(cgrp))
6328                 return ERR_CAST(cgrp);
6329
6330         if (!cgroup_on_dfl(cgrp)) {
6331                 cgroup_put(cgrp);
6332                 return ERR_PTR(-EBADF);
6333         }
6334
6335         return cgrp;
6336 }
6337
/**
 * cgroup_css_set_fork - find or create a css_set for a child process
 * @kargs: the arguments passed to create the child process
 *
 * This function finds or creates a new css_set which the child
 * process will be attached to in cgroup_post_fork(). By default,
 * the child process will be given the same css_set as its parent.
 *
 * If CLONE_INTO_CGROUP is specified this function will try to find an
 * existing css_set which includes the requested cgroup and if not create
 * a new css_set that the child will be attached to later. If this function
 * succeeds it will hold cgroup_threadgroup_rwsem on return. If
 * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
 * before grabbing cgroup_threadgroup_rwsem and will hold a reference
 * to the target cgroup.
 *
 * On success @kargs->cset holds the css_set for the child and, with
 * CLONE_INTO_CGROUP, @kargs->cgrp holds the target cgroup. On failure
 * the locks and references taken here are dropped again.
 *
 * Return: 0 on success; -EBADF if @kargs->cgroup is not a valid file
 * descriptor; -ENODEV if the target cgroup is dead; -ENOMEM if no css_set
 * could be found or created; otherwise the error returned by
 * cgroup_get_from_file(), cgroup_may_write() or
 * cgroup_attach_permissions().
 */
static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
        __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
{
        int ret;
        struct cgroup *dst_cgrp = NULL;
        struct css_set *cset;
        struct super_block *sb;
        struct file *f;

        if (kargs->flags & CLONE_INTO_CGROUP)
                cgroup_lock();

        cgroup_threadgroup_change_begin(current);

        /* pin the parent's current css_set */
        spin_lock_irq(&css_set_lock);
        cset = task_css_set(current);
        get_css_set(cset);
        spin_unlock_irq(&css_set_lock);

        /* plain fork: the child simply inherits the parent's css_set */
        if (!(kargs->flags & CLONE_INTO_CGROUP)) {
                kargs->cset = cset;
                return 0;
        }

        /*
         * Everything below, including the err path, only runs with
         * CLONE_INTO_CGROUP set, i.e. with cgroup_mutex held.
         */
        f = fget_raw(kargs->cgroup);
        if (!f) {
                ret = -EBADF;
                goto err;
        }
        sb = f->f_path.dentry->d_sb;

        dst_cgrp = cgroup_get_from_file(f);
        if (IS_ERR(dst_cgrp)) {
                ret = PTR_ERR(dst_cgrp);
                /* keep the err path from putting an ERR_PTR */
                dst_cgrp = NULL;
                goto err;
        }

        if (cgroup_is_dead(dst_cgrp)) {
                ret = -ENODEV;
                goto err;
        }

        /*
         * Verify that the target cgroup is writable for us. This is
         * usually done by the vfs layer but since we're not going through
         * the vfs layer here we need to do it "manually".
         */
        ret = cgroup_may_write(dst_cgrp, sb);
        if (ret)
                goto err;

        /*
         * Spawning a task directly into a cgroup works by passing a file
         * descriptor to the target cgroup directory. This can even be an O_PATH
         * file descriptor. But it can never be a cgroup.procs file descriptor.
         * This was done on purpose so spawning into a cgroup could be
         * conceptualized as an atomic
         *
         *   fd = openat(dfd_cgroup, "cgroup.procs", ...);
         *   write(fd, <child-pid>, ...);
         *
         * sequence, i.e. it's a shorthand for the caller opening and writing
         * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
         * to always use the caller's credentials.
         */
        ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
                                        !(kargs->flags & CLONE_THREAD),
                                        current->nsproxy->cgroup_ns);
        if (ret)
                goto err;

        kargs->cset = find_css_set(cset, dst_cgrp);
        if (!kargs->cset) {
                ret = -ENOMEM;
                goto err;
        }

        /* success: drop the parent's cset and the file, keep dst_cgrp's ref */
        put_css_set(cset);
        fput(f);
        kargs->cgrp = dst_cgrp;
        return ret;

err:
        cgroup_threadgroup_change_end(current);
        cgroup_unlock();
        if (f)
                fput(f);
        if (dst_cgrp)
                cgroup_put(dst_cgrp);
        put_css_set(cset);
        if (kargs->cset)
                put_css_set(kargs->cset);
        return ret;
}
6449
6450 /**
6451  * cgroup_css_set_put_fork - drop references we took during fork
6452  * @kargs: the arguments passed to create the child process
6453  *
6454  * Drop references to the prepared css_set and target cgroup if
6455  * CLONE_INTO_CGROUP was requested.
6456  */
6457 static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
6458         __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
6459 {
6460         struct cgroup *cgrp = kargs->cgrp;
6461         struct css_set *cset = kargs->cset;
6462
6463         cgroup_threadgroup_change_end(current);
6464
6465         if (cset) {
6466                 put_css_set(cset);
6467                 kargs->cset = NULL;
6468         }
6469
6470         if (kargs->flags & CLONE_INTO_CGROUP) {
6471                 cgroup_unlock();
6472                 if (cgrp) {
6473                         cgroup_put(cgrp);
6474                         kargs->cgrp = NULL;
6475                 }
6476         }
6477 }
6478
6479 /**
6480  * cgroup_can_fork - called on a new task before the process is exposed
6481  * @child: the child process
6482  * @kargs: the arguments passed to create the child process
6483  *
6484  * This prepares a new css_set for the child process which the child will
6485  * be attached to in cgroup_post_fork().
6486  * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
6487  * callback returns an error, the fork aborts with that error code. This
6488  * allows for a cgroup subsystem to conditionally allow or deny new forks.
6489  */
6490 int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
6491 {
6492         struct cgroup_subsys *ss;
6493         int i, j, ret;
6494
6495         ret = cgroup_css_set_fork(kargs);
6496         if (ret)
6497                 return ret;
6498
6499         do_each_subsys_mask(ss, i, have_canfork_callback) {
6500                 ret = ss->can_fork(child, kargs->cset);
6501                 if (ret)
6502                         goto out_revert;
6503         } while_each_subsys_mask();
6504
6505         return 0;
6506
6507 out_revert:
6508         for_each_subsys(ss, j) {
6509                 if (j >= i)
6510                         break;
6511                 if (ss->cancel_fork)
6512                         ss->cancel_fork(child, kargs->cset);
6513         }
6514
6515         cgroup_css_set_put_fork(kargs);
6516
6517         return ret;
6518 }
6519
6520 /**
6521  * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
6522  * @child: the child process
6523  * @kargs: the arguments passed to create the child process
6524  *
6525  * This calls the cancel_fork() callbacks if a fork failed *after*
6526  * cgroup_can_fork() succeeded and cleans up references we took to
6527  * prepare a new css_set for the child process in cgroup_can_fork().
6528  */
6529 void cgroup_cancel_fork(struct task_struct *child,
6530                         struct kernel_clone_args *kargs)
6531 {
6532         struct cgroup_subsys *ss;
6533         int i;
6534
6535         for_each_subsys(ss, i)
6536                 if (ss->cancel_fork)
6537                         ss->cancel_fork(child, kargs->cset);
6538
6539         cgroup_css_set_put_fork(kargs);
6540 }
6541
/**
 * cgroup_post_fork - finalize cgroup setup for the child process
 * @child: the child process
 * @kargs: the arguments passed to create the child process
 *
 * Attach the child process to its css_set calling the subsystem fork()
 * callbacks.
 *
 * Ownership of @kargs->cset moves to this function (the field is cleared).
 * If the target cgroup has CGRP_FREEZE or CGRP_KILL set, a non-kthread
 * child is marked for freezing or sent SIGKILL respectively.  The locks
 * and remaining references are released via cgroup_css_set_put_fork().
 */
void cgroup_post_fork(struct task_struct *child,
                      struct kernel_clone_args *kargs)
        __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
        unsigned long cgrp_flags = 0;
        bool kill = false;
        struct cgroup_subsys *ss;
        struct css_set *cset;
        int i;

        /* take over the prepared css_set so the final put_fork skips it */
        cset = kargs->cset;
        kargs->cset = NULL;

        spin_lock_irq(&css_set_lock);

        /* init tasks are special, only link regular threads */
        if (likely(child->pid)) {
                /* snapshot the flags of the cgroup the child lands in */
                if (kargs->cgrp)
                        cgrp_flags = kargs->cgrp->flags;
                else
                        cgrp_flags = cset->dfl_cgrp->flags;

                WARN_ON_ONCE(!list_empty(&child->cg_list));
                cset->nr_tasks++;
                css_set_move_task(child, NULL, cset, false);
        } else {
                put_css_set(cset);
                cset = NULL;
        }

        if (!(child->flags & PF_KTHREAD)) {
                if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
                        /*
                         * If the cgroup has to be frozen, the new task has
                         * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
                         * get the task into the frozen state.
                         */
                        spin_lock(&child->sighand->siglock);
                        WARN_ON_ONCE(child->frozen);
                        child->jobctl |= JOBCTL_TRAP_FREEZE;
                        spin_unlock(&child->sighand->siglock);

                        /*
                         * Calling cgroup_update_frozen() isn't required here,
                         * because it will be called anyway a bit later from
                         * do_freezer_trap(). So we avoid cgroup's transient
                         * switch from the frozen state and back.
                         */
                }

                /*
                 * If the cgroup is to be killed notice it now and take the
                 * child down right after we finished preparing it for
                 * userspace.
                 */
                kill = test_bit(CGRP_KILL, &cgrp_flags);
        }

        spin_unlock_irq(&css_set_lock);

        /*
         * Call ss->fork().  This must happen after @child is linked on
         * css_set; otherwise, @child might change state between ->fork()
         * and addition to css_set.
         */
        do_each_subsys_mask(ss, i, have_fork_callback) {
                ss->fork(child);
        } while_each_subsys_mask();

        /* Make the new cset the root_cset of the new cgroup namespace. */
        if (kargs->flags & CLONE_NEWCGROUP) {
                struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;

                /*
                 * NOTE(review): cset is NULL here when child->pid is 0;
                 * presumably such tasks are never created with
                 * CLONE_NEWCGROUP - confirm.
                 */
                get_css_set(cset);
                child->nsproxy->cgroup_ns->root_cset = cset;
                put_css_set(rcset);
        }

        /* Cgroup has to be killed so take down child immediately. */
        if (unlikely(kill))
                do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);

        cgroup_css_set_put_fork(kargs);
}
6634
6635 /**
6636  * cgroup_exit - detach cgroup from exiting task
6637  * @tsk: pointer to task_struct of exiting process
6638  *
6639  * Description: Detach cgroup from @tsk.
6640  *
6641  */
6642 void cgroup_exit(struct task_struct *tsk)
6643 {
6644         struct cgroup_subsys *ss;
6645         struct css_set *cset;
6646         int i;
6647
6648         spin_lock_irq(&css_set_lock);
6649
6650         WARN_ON_ONCE(list_empty(&tsk->cg_list));
6651         cset = task_css_set(tsk);
6652         css_set_move_task(tsk, cset, NULL, false);
6653         list_add_tail(&tsk->cg_list, &cset->dying_tasks);
6654         cset->nr_tasks--;
6655
6656         if (dl_task(tsk))
6657                 dec_dl_tasks_cs(tsk);
6658
6659         WARN_ON_ONCE(cgroup_task_frozen(tsk));
6660         if (unlikely(!(tsk->flags & PF_KTHREAD) &&
6661                      test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
6662                 cgroup_update_frozen(task_dfl_cgroup(tsk));
6663
6664         spin_unlock_irq(&css_set_lock);
6665
6666         /* see cgroup_post_fork() for details */
6667         do_each_subsys_mask(ss, i, have_exit_callback) {
6668                 ss->exit(tsk);
6669         } while_each_subsys_mask();
6670 }
6671
6672 void cgroup_release(struct task_struct *task)
6673 {
6674         struct cgroup_subsys *ss;
6675         int ssid;
6676
6677         do_each_subsys_mask(ss, ssid, have_release_callback) {
6678                 ss->release(task);
6679         } while_each_subsys_mask();
6680
6681         spin_lock_irq(&css_set_lock);
6682         css_set_skip_task_iters(task_css_set(task), task);
6683         list_del_init(&task->cg_list);
6684         spin_unlock_irq(&css_set_lock);
6685 }
6686
/* cgroup_free - put @task's reference on its css_set */
void cgroup_free(struct task_struct *task)
{
        put_css_set(task_css_set(task));
}
6692
6693 static int __init cgroup_disable(char *str)
6694 {
6695         struct cgroup_subsys *ss;
6696         char *token;
6697         int i;
6698
6699         while ((token = strsep(&str, ",")) != NULL) {
6700                 if (!*token)
6701                         continue;
6702
6703                 for_each_subsys(ss, i) {
6704                         if (strcmp(token, ss->name) &&
6705                             strcmp(token, ss->legacy_name))
6706                                 continue;
6707
6708                         static_branch_disable(cgroup_subsys_enabled_key[i]);
6709                         pr_info("Disabling %s control group subsystem\n",
6710                                 ss->name);
6711                 }
6712
6713                 for (i = 0; i < OPT_FEATURE_COUNT; i++) {
6714                         if (strcmp(token, cgroup_opt_feature_names[i]))
6715                                 continue;
6716                         cgroup_feature_disable_mask |= 1 << i;
6717                         pr_info("Disabling %s control group feature\n",
6718                                 cgroup_opt_feature_names[i]);
6719                         break;
6720                 }
6721         }
6722         return 1;
6723 }
6724 __setup("cgroup_disable=", cgroup_disable);
6725
6726 void __init __weak enable_debug_cgroup(void) { }
6727
6728 static int __init enable_cgroup_debug(char *str)
6729 {
6730         cgroup_debug = true;
6731         enable_debug_cgroup();
6732         return 1;
6733 }
6734 __setup("cgroup_debug", enable_cgroup_debug);
6735
6736 /**
6737  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
6738  * @dentry: directory dentry of interest
6739  * @ss: subsystem of interest
6740  *
6741  * If @dentry is a directory for a cgroup which has @ss enabled on it, try
6742  * to get the corresponding css and return it.  If such css doesn't exist
6743  * or can't be pinned, an ERR_PTR value is returned.
6744  */
6745 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
6746                                                        struct cgroup_subsys *ss)
6747 {
6748         struct kernfs_node *kn = kernfs_node_from_dentry(dentry);
6749         struct file_system_type *s_type = dentry->d_sb->s_type;
6750         struct cgroup_subsys_state *css = NULL;
6751         struct cgroup *cgrp;
6752
6753         /* is @dentry a cgroup dir? */
6754         if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) ||
6755             !kn || kernfs_type(kn) != KERNFS_DIR)
6756                 return ERR_PTR(-EBADF);
6757
6758         rcu_read_lock();
6759
6760         /*
6761          * This path doesn't originate from kernfs and @kn could already
6762          * have been or be removed at any point.  @kn->priv is RCU
6763          * protected for this access.  See css_release_work_fn() for details.
6764          */
6765         cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
6766         if (cgrp)
6767                 css = cgroup_css(cgrp, ss);
6768
6769         if (!css || !css_tryget_online(css))
6770                 css = ERR_PTR(-ENOENT);
6771
6772         rcu_read_unlock();
6773         return css;
6774 }
6775
6776 /**
6777  * css_from_id - lookup css by id
6778  * @id: the cgroup id
6779  * @ss: cgroup subsys to be looked into
6780  *
6781  * Returns the css if there's valid one with @id, otherwise returns NULL.
6782  * Should be called under rcu_read_lock().
6783  */
6784 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
6785 {
6786         WARN_ON_ONCE(!rcu_read_lock_held());
6787         return idr_find(&ss->css_idr, id);
6788 }
6789
6790 /**
6791  * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path
6792  * @path: path on the default hierarchy
6793  *
6794  * Find the cgroup at @path on the default hierarchy, increment its
6795  * reference count and return it.  Returns pointer to the found cgroup on
6796  * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already
6797  * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
6798  */
6799 struct cgroup *cgroup_get_from_path(const char *path)
6800 {
6801         struct kernfs_node *kn;
6802         struct cgroup *cgrp = ERR_PTR(-ENOENT);
6803         struct cgroup *root_cgrp;
6804
6805         root_cgrp = current_cgns_cgroup_dfl();
6806         kn = kernfs_walk_and_get(root_cgrp->kn, path);
6807         if (!kn)
6808                 goto out;
6809
6810         if (kernfs_type(kn) != KERNFS_DIR) {
6811                 cgrp = ERR_PTR(-ENOTDIR);
6812                 goto out_kernfs;
6813         }
6814
6815         rcu_read_lock();
6816
6817         cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
6818         if (!cgrp || !cgroup_tryget(cgrp))
6819                 cgrp = ERR_PTR(-ENOENT);
6820
6821         rcu_read_unlock();
6822
6823 out_kernfs:
6824         kernfs_put(kn);
6825 out:
6826         return cgrp;
6827 }
6828 EXPORT_SYMBOL_GPL(cgroup_get_from_path);
6829
6830 /**
6831  * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd
6832  * @fd: fd obtained by open(cgroup_dir)
6833  *
6834  * Find the cgroup from a fd which should be obtained
6835  * by opening a cgroup directory.  Returns a pointer to the
6836  * cgroup on success. ERR_PTR is returned if the cgroup
6837  * cannot be found.
6838  */
6839 struct cgroup *cgroup_v1v2_get_from_fd(int fd)
6840 {
6841         struct cgroup *cgrp;
6842         struct fd f = fdget_raw(fd);
6843         if (!f.file)
6844                 return ERR_PTR(-EBADF);
6845
6846         cgrp = cgroup_v1v2_get_from_file(f.file);
6847         fdput(f);
6848         return cgrp;
6849 }
6850
6851 /**
6852  * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports
6853  * cgroup2.
6854  * @fd: fd obtained by open(cgroup2_dir)
6855  */
6856 struct cgroup *cgroup_get_from_fd(int fd)
6857 {
6858         struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd);
6859
6860         if (IS_ERR(cgrp))
6861                 return ERR_CAST(cgrp);
6862
6863         if (!cgroup_on_dfl(cgrp)) {
6864                 cgroup_put(cgrp);
6865                 return ERR_PTR(-EBADF);
6866         }
6867         return cgrp;
6868 }
6869 EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
6870
6871 static u64 power_of_ten(int power)
6872 {
6873         u64 v = 1;
6874         while (power--)
6875                 v *= 10;
6876         return v;
6877 }
6878
6879 /**
6880  * cgroup_parse_float - parse a floating number
6881  * @input: input string
6882  * @dec_shift: number of decimal digits to shift
6883  * @v: output
6884  *
6885  * Parse a decimal floating point number in @input and store the result in
6886  * @v with decimal point right shifted @dec_shift times.  For example, if
6887  * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345.
6888  * Returns 0 on success, -errno otherwise.
6889  *
6890  * There's nothing cgroup specific about this function except that it's
6891  * currently the only user.
6892  */
6893 int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
6894 {
6895         s64 whole, frac = 0;
6896         int fstart = 0, fend = 0, flen;
6897
6898         if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
6899                 return -EINVAL;
6900         if (frac < 0)
6901                 return -EINVAL;
6902
6903         flen = fend > fstart ? fend - fstart : 0;
6904         if (flen < dec_shift)
6905                 frac *= power_of_ten(dec_shift - flen);
6906         else
6907                 frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift));
6908
6909         *v = whole * power_of_ten(dec_shift) + frac;
6910         return 0;
6911 }
6912
6913 /*
6914  * sock->sk_cgrp_data handling.  For more info, see sock_cgroup_data
6915  * definition in cgroup-defs.h.
6916  */
6917 #ifdef CONFIG_SOCK_CGROUP_DATA
6918
6919 void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
6920 {
6921         struct cgroup *cgroup;
6922
6923         rcu_read_lock();
6924         /* Don't associate the sock with unrelated interrupted task's cgroup. */
6925         if (in_interrupt()) {
6926                 cgroup = &cgrp_dfl_root.cgrp;
6927                 cgroup_get(cgroup);
6928                 goto out;
6929         }
6930
6931         while (true) {
6932                 struct css_set *cset;
6933
6934                 cset = task_css_set(current);
6935                 if (likely(cgroup_tryget(cset->dfl_cgrp))) {
6936                         cgroup = cset->dfl_cgrp;
6937                         break;
6938                 }
6939                 cpu_relax();
6940         }
6941 out:
6942         skcd->cgroup = cgroup;
6943         cgroup_bpf_get(cgroup);
6944         rcu_read_unlock();
6945 }
6946
/* Take the cgroup and cgroup_bpf references needed by a cloned socket. */
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
	struct cgroup *sk_cgrp;

	sk_cgrp = sock_cgroup_ptr(skcd);

	/*
	 * We might be cloning a socket which is left in an empty
	 * cgroup and the cgroup might have already been rmdir'd.
	 * Don't use cgroup_get_live().
	 */
	cgroup_get(sk_cgrp);
	cgroup_bpf_get(sk_cgrp);
}
6959
/* Drop the cgroup_bpf reference, then the cgroup reference, of @skcd. */
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
	struct cgroup *sk_cgrp;

	sk_cgrp = sock_cgroup_ptr(skcd);

	cgroup_bpf_put(sk_cgrp);
	cgroup_put(sk_cgrp);
}
6967
6968 #endif  /* CONFIG_SOCK_CGROUP_DATA */
6969
6970 #ifdef CONFIG_SYSFS
6971 static ssize_t show_delegatable_files(struct cftype *files, char *buf,
6972                                       ssize_t size, const char *prefix)
6973 {
6974         struct cftype *cft;
6975         ssize_t ret = 0;
6976
6977         for (cft = files; cft && cft->name[0] != '\0'; cft++) {
6978                 if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
6979                         continue;
6980
6981                 if (prefix)
6982                         ret += snprintf(buf + ret, size - ret, "%s.", prefix);
6983
6984                 ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
6985
6986                 if (WARN_ON(ret >= size))
6987                         break;
6988         }
6989
6990         return ret;
6991 }
6992
6993 static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
6994                               char *buf)
6995 {
6996         struct cgroup_subsys *ss;
6997         int ssid;
6998         ssize_t ret = 0;
6999
7000         ret = show_delegatable_files(cgroup_base_files, buf + ret,
7001                                      PAGE_SIZE - ret, NULL);
7002         if (cgroup_psi_enabled())
7003                 ret += show_delegatable_files(cgroup_psi_files, buf + ret,
7004                                               PAGE_SIZE - ret, NULL);
7005
7006         for_each_subsys(ss, ssid)
7007                 ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
7008                                               PAGE_SIZE - ret,
7009                                               cgroup_subsys_name[ssid]);
7010
7011         return ret;
7012 }
7013 static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
7014
7015 static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
7016                              char *buf)
7017 {
7018         return snprintf(buf, PAGE_SIZE,
7019                         "nsdelegate\n"
7020                         "favordynmods\n"
7021                         "memory_localevents\n"
7022                         "memory_recursiveprot\n");
7023 }
7024 static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
7025
7026 static struct attribute *cgroup_sysfs_attrs[] = {
7027         &cgroup_delegate_attr.attr,
7028         &cgroup_features_attr.attr,
7029         NULL,
7030 };
7031
7032 static const struct attribute_group cgroup_sysfs_attr_group = {
7033         .attrs = cgroup_sysfs_attrs,
7034         .name = "cgroup",
7035 };
7036
7037 static int __init cgroup_sysfs_init(void)
7038 {
7039         return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
7040 }
7041 subsys_initcall(cgroup_sysfs_init);
7042
7043 #endif /* CONFIG_SYSFS */