kernel/events/core.c

   1 /*
   2  * Performance events core code:
   3  *
   4  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
   5  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
   6  *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
   7  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
   8  *
   9  * For licensing details see kernel-base/COPYING
  10  */
  11
  12 #include <linux/fs.h>
  13 #include <linux/mm.h>
  14 #include <linux/cpu.h>
  15 #include <linux/smp.h>
  16 #include <linux/idr.h>
  17 #include <linux/file.h>
  18 #include <linux/poll.h>
  19 #include <linux/slab.h>
  20 #include <linux/hash.h>
  21 #include <linux/tick.h>
  22 #include <linux/sysfs.h>
  23 #include <linux/dcache.h>
  24 #include <linux/percpu.h>
  25 #include <linux/ptrace.h>
  26 #include <linux/reboot.h>
  27 #include <linux/vmstat.h>
  28 #include <linux/device.h>
  29 #include <linux/export.h>
  30 #include <linux/vmalloc.h>
  31 #include <linux/hardirq.h>
  32 #include <linux/rculist.h>
  33 #include <linux/uaccess.h>
  34 #include <linux/syscalls.h>
  35 #include <linux/anon_inodes.h>
  36 #include <linux/kernel_stat.h>
  37 #include <linux/cgroup.h>
  38 #include <linux/perf_event.h>
  39 #include <linux/trace_events.h>
  40 #include <linux/hw_breakpoint.h>
  41 #include <linux/mm_types.h>
  42 #include <linux/module.h>
  43 #include <linux/mman.h>
  44 #include <linux/compat.h>
  45 #include <linux/bpf.h>
  46 #include <linux/filter.h>
  47 #include <linux/namei.h>
  48 #include <linux/parser.h>
  49 #include <linux/sched/clock.h>
  50 #include <linux/sched/mm.h>
  51 #include <linux/proc_ns.h>
  52 #include <linux/mount.h>
  53
  54 #include "internal.h"
  55
  56 #include <asm/irq_regs.h>
  57
  58 typedef int (*remote_function_f)(void *);
  59
  60 struct remote_function_call {
  61         struct task_struct      *p;
  62         remote_function_f       func;
  63         void                    *info;
  64         int                     ret;
  65 };
  66
  67 static void remote_function(void *data)
  68 {
  69         struct remote_function_call *tfc = data;
  70         struct task_struct *p = tfc->p;
  71
  72         if (p) {
  73                 /* -EAGAIN */
  74                 if (task_cpu(p) != smp_processor_id())
  75                         return;
  76
  77                 /*
  78                  * Now that we're on right CPU with IRQs disabled, we can test
  79                  * if we hit the right task without races.
  80                  */
  81
  82                 tfc->ret = -ESRCH; /* No such (running) process */
  83                 if (p != current)
  84                         return;
  85         }
  86
  87         tfc->ret = tfc->func(tfc->info);
  88 }
  89
  90 /**
  91  * task_function_call - call a function on the cpu on which a task runs
  92  * @p:          the task to evaluate
  93  * @func:       the function to be called
  94  * @info:       the function call argument
  95  *
  96  * Calls the function @func when the task is currently running. This might
  97  * be on the current CPU, which just calls the function directly.  This will
  98  * retry due to any failures in smp_call_function_single(), such as if the
  99  * task_cpu() goes offline concurrently.
 100  *
 101  * returns @func return value or -ESRCH or -ENXIO when the process isn't running
 102  */
 103 static int
 104 task_function_call(struct task_struct *p, remote_function_f func, void *info)
 105 {
 106         struct remote_function_call data = {
 107                 .p      = p,
 108                 .func   = func,
 109                 .info   = info,
 110                 .ret    = -EAGAIN,
 111         };
 112         int ret;
 113
 114         for (;;) {
 115                 ret = smp_call_function_single(task_cpu(p), remote_function,
 116                                                &data, 1);
 117                 if (!ret)
 118                         ret = data.ret;
 119
 120                 if (ret != -EAGAIN)
 121                         break;
 122
 123                 cond_resched();
 124         }
 125
 126         return ret;
 127 }
 128
 129 /**
 130  * cpu_function_call - call a function on the cpu
 131  * @func:       the function to be called
 132  * @info:       the function call argument
 133  *
 134  * Calls the function @func on the remote cpu.
 135  *
 136  * returns: @func return value or -ENXIO when the cpu is offline
 137  */
 138 static int cpu_function_call(int cpu, remote_function_f func, void *info)
 139 {
 140         struct remote_function_call data = {
 141                 .p      = NULL,
 142                 .func   = func,
 143                 .info   = info,
 144                 .ret    = -ENXIO, /* No such CPU */
 145         };
 146
 147         smp_call_function_single(cpu, remote_function, &data, 1);
 148
 149         return data.ret;
 150 }
 151
 152 static inline struct perf_cpu_context *
 153 __get_cpu_context(struct perf_event_context *ctx)
 154 {
 155         return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
 156 }
 157
 158 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
 159                           struct perf_event_context *ctx)
 160 {
 161         raw_spin_lock(&cpuctx->ctx.lock);
 162         if (ctx)
 163                 raw_spin_lock(&ctx->lock);
 164 }
 165
 166 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 167                             struct perf_event_context *ctx)
 168 {
 169         if (ctx)
 170                 raw_spin_unlock(&ctx->lock);
 171         raw_spin_unlock(&cpuctx->ctx.lock);
 172 }
 173
 174 #define TASK_TOMBSTONE ((void *)-1L)
 175
 176 static bool is_kernel_event(struct perf_event *event)
 177 {
 178         return READ_ONCE(event->owner) == TASK_TOMBSTONE;
 179 }
 180
 181 /*
 182  * On task ctx scheduling...
 183  *
 184  * When !ctx->nr_events a task context will not be scheduled. This means
 185  * we can disable the scheduler hooks (for performance) without leaving
 186  * pending task ctx state.
 187  *
 188  * This however results in two special cases:
 189  *
 190  *  - removing the last event from a task ctx; this is relatively straight
 191  *    forward and is done in __perf_remove_from_context.
 192  *
 193  *  - adding the first event to a task ctx; this is tricky because we cannot
 194  *    rely on ctx->is_active and therefore cannot use event_function_call().
 195  *    See perf_install_in_context().
 196  *
 197  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
 198  */
 199
 200 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
 201                         struct perf_event_context *, void *);
 202
 203 struct event_function_struct {
 204         struct perf_event *event;
 205         event_f func;
 206         void *data;
 207 };
 208
 209 static int event_function(void *info)
 210 {
 211         struct event_function_struct *efs = info;
 212         struct perf_event *event = efs->event;
 213         struct perf_event_context *ctx = event->ctx;
 214         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 215         struct perf_event_context *task_ctx = cpuctx->task_ctx;
 216         int ret = 0;
 217
 218         lockdep_assert_irqs_disabled();
 219
 220         perf_ctx_lock(cpuctx, task_ctx);
 221         /*
 222          * Since we do the IPI call without holding ctx->lock things can have
 223          * changed, double check we hit the task we set out to hit.
 224          */
 225         if (ctx->task) {
 226                 if (ctx->task != current) {
 227                         ret = -ESRCH;
 228                         goto unlock;
 229                 }
 230
 231                 /*
 232                  * We only use event_function_call() on established contexts,
 233                  * and event_function() is only ever called when active (or
 234                  * rather, we'll have bailed in task_function_call() or the
 235                  * above ctx->task != current test), therefore we must have
 236                  * ctx->is_active here.
 237                  */
 238                 WARN_ON_ONCE(!ctx->is_active);
 239                 /*
 240                  * And since we have ctx->is_active, cpuctx->task_ctx must
 241                  * match.
 242                  */
 243                 WARN_ON_ONCE(task_ctx != ctx);
 244         } else {
 245                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
 246         }
 247
 248         efs->func(event, cpuctx, ctx, efs->data);
 249 unlock:
 250         perf_ctx_unlock(cpuctx, task_ctx);
 251
 252         return ret;
 253 }
 254
 255 static void event_function_call(struct perf_event *event, event_f func, void *data)
 256 {
 257         struct perf_event_context *ctx = event->ctx;
 258         struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
 259         struct event_function_struct efs = {
 260                 .event = event,
 261                 .func = func,
 262                 .data = data,
 263         };
 264
 265         if (!event->parent) {
 266                 /*
 267                  * If this is a !child event, we must hold ctx::mutex to
 268                  * stabilize the the event->ctx relation. See
 269                  * perf_event_ctx_lock().
 270                  */
 271                 lockdep_assert_held(&ctx->mutex);
 272         }
 273
 274         if (!task) {
 275                 cpu_function_call(event->cpu, event_function, &efs);
 276                 return;
 277         }
 278
 279         if (task == TASK_TOMBSTONE)
 280                 return;
 281
 282 again:
 283         if (!task_function_call(task, event_function, &efs))
 284                 return;
 285
 286         raw_spin_lock_irq(&ctx->lock);
 287         /*
 288          * Reload the task pointer, it might have been changed by
 289          * a concurrent perf_event_context_sched_out().
 290          */
 291         task = ctx->task;
 292         if (task == TASK_TOMBSTONE) {
 293                 raw_spin_unlock_irq(&ctx->lock);
 294                 return;
 295         }
 296         if (ctx->is_active) {
 297                 raw_spin_unlock_irq(&ctx->lock);
 298                 goto again;
 299         }
 300         func(event, NULL, ctx, data);
 301         raw_spin_unlock_irq(&ctx->lock);
 302 }
 303
 304 /*
 305  * Similar to event_function_call() + event_function(), but hard assumes IRQs
 306  * are already disabled and we're on the right CPU.
 307  */
 308 static void event_function_local(struct perf_event *event, event_f func, void *data)
 309 {
 310         struct perf_event_context *ctx = event->ctx;
 311         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 312         struct task_struct *task = READ_ONCE(ctx->task);
 313         struct perf_event_context *task_ctx = NULL;
 314
 315         lockdep_assert_irqs_disabled();
 316
 317         if (task) {
 318                 if (task == TASK_TOMBSTONE)
 319                         return;
 320
 321                 task_ctx = ctx;
 322         }
 323
 324         perf_ctx_lock(cpuctx, task_ctx);
 325
 326         task = ctx->task;
 327         if (task == TASK_TOMBSTONE)
 328                 goto unlock;
 329
 330         if (task) {
 331                 /*
 332                  * We must be either inactive or active and the right task,
 333                  * otherwise we're screwed, since we cannot IPI to somewhere
 334                  * else.
 335                  */
 336                 if (ctx->is_active) {
 337                         if (WARN_ON_ONCE(task != current))
 338                                 goto unlock;
 339
 340                         if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
 341                                 goto unlock;
 342                 }
 343         } else {
 344                 WARN_ON_ONCE(&cpuctx->ctx != ctx);
 345         }
 346
 347         func(event, cpuctx, ctx, data);
 348 unlock:
 349         perf_ctx_unlock(cpuctx, task_ctx);
 350 }
 351
 352 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
 353                        PERF_FLAG_FD_OUTPUT  |\
 354                        PERF_FLAG_PID_CGROUP |\
 355                        PERF_FLAG_FD_CLOEXEC)
 356
 357 /*
 358  * branch priv levels that need permission checks
 359  */
 360 #define PERF_SAMPLE_BRANCH_PERM_PLM \
 361         (PERF_SAMPLE_BRANCH_KERNEL |\
 362          PERF_SAMPLE_BRANCH_HV)
 363
 364 enum event_type_t {
 365         EVENT_FLEXIBLE = 0x1,
 366         EVENT_PINNED = 0x2,
 367         EVENT_TIME = 0x4,
 368         /* see ctx_resched() for details */
 369         EVENT_CPU = 0x8,
 370         EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 371 };
 372
 373 /*
 374  * perf_sched_events : >0 events exist
 375  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
 376  */
 377
 378 static void perf_sched_delayed(struct work_struct *work);
 379 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
 380 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
 381 static DEFINE_MUTEX(perf_sched_mutex);
 382 static atomic_t perf_sched_count;
 383
 384 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 385 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 386 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
 387
 388 static atomic_t nr_mmap_events __read_mostly;
 389 static atomic_t nr_comm_events __read_mostly;
 390 static atomic_t nr_namespaces_events __read_mostly;
 391 static atomic_t nr_task_events __read_mostly;
 392 static atomic_t nr_freq_events __read_mostly;
 393 static atomic_t nr_switch_events __read_mostly;
 394
 395 static LIST_HEAD(pmus);
 396 static DEFINE_MUTEX(pmus_lock);
 397 static struct srcu_struct pmus_srcu;
 398 static cpumask_var_t perf_online_mask;
 399
 400 /*
 401  * perf event paranoia level:
 402  *  -1 - not paranoid at all
 403  *   0 - disallow raw tracepoint access for unpriv
 404  *   1 - disallow cpu events for unpriv
 405  *   2 - disallow kernel profiling for unpriv
 406  */
 407 int sysctl_perf_event_paranoid __read_mostly = 2;
 408
 409 /* Minimum for 512 kiB + 1 user control page */
 410 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
 411
 412 /*
 413  * max perf event sample rate
 414  */
 415 #define DEFAULT_MAX_SAMPLE_RATE         100000
 416 #define DEFAULT_SAMPLE_PERIOD_NS        (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
 417 #define DEFAULT_CPU_TIME_MAX_PERCENT    25
 418
 419 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 420
 421 static int max_samples_per_tick __read_mostly   = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 422 static int perf_sample_period_ns __read_mostly  = DEFAULT_SAMPLE_PERIOD_NS;
 423
 424 static int perf_sample_allowed_ns __read_mostly =
 425         DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 426
 427 static void update_perf_cpu_limits(void)
 428 {
 429         u64 tmp = perf_sample_period_ns;
 430
 431         tmp *= sysctl_perf_cpu_time_max_percent;
 432         tmp = div_u64(tmp, 100);
 433         if (!tmp)
 434                 tmp = 1;
 435
 436         WRITE_ONCE(perf_sample_allowed_ns, tmp);
 437 }
 438
 439 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
 440
 441 int perf_proc_update_handler(struct ctl_table *table, int write,
 442                 void __user *buffer, size_t *lenp,
 443                 loff_t *ppos)
 444 {
 445         int ret;
 446         int perf_cpu = sysctl_perf_cpu_time_max_percent;
 447         /*
 448          * If throttling is disabled don't allow the write:
 449          */
 450         if (write && (perf_cpu == 100 || perf_cpu == 0))
 451                 return -EINVAL;
 452
 453         ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 454         if (ret || !write)
 455                 return ret;
 456
 457         max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
 458         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 459         update_perf_cpu_limits();
 460
 461         return 0;
 462 }
 463
 464 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
 465
 466 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 467                                 void __user *buffer, size_t *lenp,
 468                                 loff_t *ppos)
 469 {
 470         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 471
 472         if (ret || !write)
 473                 return ret;
 474
 475         if (sysctl_perf_cpu_time_max_percent == 100 ||
 476             sysctl_perf_cpu_time_max_percent == 0) {
 477                 printk(KERN_WARNING
 478                        "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
 479                 WRITE_ONCE(perf_sample_allowed_ns, 0);
 480         } else {
 481                 update_perf_cpu_limits();
 482         }
 483
 484         return 0;
 485 }
 486
 487 /*
 488  * perf samples are done in some very critical code paths (NMIs).
 489  * If they take too much CPU time, the system can lock up and not
 490  * get any real work done.  This will drop the sample rate when
 491  * we detect that events are taking too long.
 492  */
 493 #define NR_ACCUMULATED_SAMPLES 128
 494 static DEFINE_PER_CPU(u64, running_sample_length);
 495
 496 static u64 __report_avg;
 497 static u64 __report_allowed;
 498
 499 static void perf_duration_warn(struct irq_work *w)
 500 {
 501         printk_ratelimited(KERN_INFO
 502                 "perf: interrupt took too long (%lld > %lld), lowering "
 503                 "kernel.perf_event_max_sample_rate to %d\n",
 504                 __report_avg, __report_allowed,
 505                 sysctl_perf_event_sample_rate);
 506 }
 507
 508 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
 509
 510 void perf_sample_event_took(u64 sample_len_ns)
 511 {
 512         u64 max_len = READ_ONCE(perf_sample_allowed_ns);
 513         u64 running_len;
 514         u64 avg_len;
 515         u32 max;
 516
 517         if (max_len == 0)
 518                 return;
 519
 520         /* Decay the counter by 1 average sample. */
 521         running_len = __this_cpu_read(running_sample_length);
 522         running_len -= running_len/NR_ACCUMULATED_SAMPLES;
 523         running_len += sample_len_ns;
 524         __this_cpu_write(running_sample_length, running_len);
 525
 526         /*
 527          * Note: this will be biased artifically low until we have
 528          * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
 529          * from having to maintain a count.
 530          */
 531         avg_len = running_len/NR_ACCUMULATED_SAMPLES;
 532         if (avg_len <= max_len)
 533                 return;
 534
 535         __report_avg = avg_len;
 536         __report_allowed = max_len;
 537
 538         /*
 539          * Compute a throttle threshold 25% below the current duration.
 540          */
 541         avg_len += avg_len / 4;
 542         max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
 543         if (avg_len < max)
 544                 max /= (u32)avg_len;
 545         else
 546                 max = 1;
 547
 548         WRITE_ONCE(perf_sample_allowed_ns, avg_len);
 549         WRITE_ONCE(max_samples_per_tick, max);
 550
 551         sysctl_perf_event_sample_rate = max * HZ;
 552         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 553
 554         if (!irq_work_queue(&perf_duration_work)) {
 555                 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
 556                              "kernel.perf_event_max_sample_rate to %d\n",
 557                              __report_avg, __report_allowed,
 558                              sysctl_perf_event_sample_rate);
 559         }
 560 }
 561
 562 static atomic64_t perf_event_id;
 563
 564 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
 565                               enum event_type_t event_type);
 566
 567 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 568                              enum event_type_t event_type,
 569                              struct task_struct *task);
 570
 571 static void update_context_time(struct perf_event_context *ctx);
 572 static u64 perf_event_time(struct perf_event *event);
 573
 574 void __weak perf_event_print_debug(void)        { }
 575
 576 extern __weak const char *perf_pmu_name(void)
 577 {
 578         return "pmu";
 579 }
 580
 581 static inline u64 perf_clock(void)
 582 {
 583         return local_clock();
 584 }
 585
 586 static inline u64 perf_event_clock(struct perf_event *event)
 587 {
 588         return event->clock();
 589 }
 590
 591 /*
 592  * State based event timekeeping...
 593  *
 594  * The basic idea is to use event->state to determine which (if any) time
 595  * fields to increment with the current delta. This means we only need to
 596  * update timestamps when we change state or when they are explicitly requested
 597  * (read).
 598  *
 599  * Event groups make things a little more complicated, but not terribly so. The
 600  * rules for a group are that if the group leader is OFF the entire group is
 601  * OFF, irrespecive of what the group member states are. This results in
 602  * __perf_effective_state().
 603  *
 604  * A futher ramification is that when a group leader flips between OFF and
 605  * !OFF, we need to update all group member times.
 606  *
 607  *
 608  * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
 609  * need to make sure the relevant context time is updated before we try and
 610  * update our timestamps.
 611  */
 612
 613 static __always_inline enum perf_event_state
 614 __perf_effective_state(struct perf_event *event)
 615 {
 616         struct perf_event *leader = event->group_leader;
 617
 618         if (leader->state <= PERF_EVENT_STATE_OFF)
 619                 return leader->state;
 620
 621         return event->state;
 622 }
 623
 624 static __always_inline void
 625 __perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
 626 {
 627         enum perf_event_state state = __perf_effective_state(event);
 628         u64 delta = now - event->tstamp;
 629
 630         *enabled = event->total_time_enabled;
 631         if (state >= PERF_EVENT_STATE_INACTIVE)
 632                 *enabled += delta;
 633
 634         *running = event->total_time_running;
 635         if (state >= PERF_EVENT_STATE_ACTIVE)
 636                 *running += delta;
 637 }
 638
 639 static void perf_event_update_time(struct perf_event *event)
 640 {
 641         u64 now = perf_event_time(event);
 642
 643         __perf_update_times(event, now, &event->total_time_enabled,
 644                                         &event->total_time_running);
 645         event->tstamp = now;
 646 }
 647
 648 static void perf_event_update_sibling_time(struct perf_event *leader)
 649 {
 650         struct perf_event *sibling;
 651
 652         for_each_sibling_event(sibling, leader)
 653                 perf_event_update_time(sibling);
 654 }
 655
 656 static void
 657 perf_event_set_state(struct perf_event *event, enum perf_event_state state)
 658 {
 659         if (event->state == state)
 660                 return;
 661
 662         perf_event_update_time(event);
 663         /*
 664          * If a group leader gets enabled/disabled all its siblings
 665          * are affected too.
 666          */
 667         if ((event->state < 0) ^ (state < 0))
 668                 perf_event_update_sibling_time(event);
 669
 670         WRITE_ONCE(event->state, state);
 671 }
 672
 673 #ifdef CONFIG_CGROUP_PERF
 674
 675 static inline bool
 676 perf_cgroup_match(struct perf_event *event)
 677 {
 678         struct perf_event_context *ctx = event->ctx;
 679         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 680
 681         /* @event doesn't care about cgroup */
 682         if (!event->cgrp)
 683                 return true;
 684
 685         /* wants specific cgroup scope but @cpuctx isn't associated with any */
 686         if (!cpuctx->cgrp)
 687                 return false;
 688
 689         /*
 690          * Cgroup scoping is recursive.  An event enabled for a cgroup is
 691          * also enabled for all its descendant cgroups.  If @cpuctx's
 692          * cgroup is a descendant of @event's (the test covers identity
 693          * case), it's a match.
 694          */
 695         return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
 696                                     event->cgrp->css.cgroup);
 697 }
 698
 699 static inline void perf_detach_cgroup(struct perf_event *event)
 700 {
 701         css_put(&event->cgrp->css);
 702         event->cgrp = NULL;
 703 }
 704
 705 static inline int is_cgroup_event(struct perf_event *event)
 706 {
 707         return event->cgrp != NULL;
 708 }
 709
 710 static inline u64 perf_cgroup_event_time(struct perf_event *event)
 711 {
 712         struct perf_cgroup_info *t;
 713
 714         t = per_cpu_ptr(event->cgrp->info, event->cpu);
 715         return t->time;
 716 }
 717
 718 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
 719 {
 720         struct perf_cgroup_info *info;
 721         u64 now;
 722
 723         now = perf_clock();
 724
 725         info = this_cpu_ptr(cgrp->info);
 726
 727         info->time += now - info->timestamp;
 728         info->timestamp = now;
 729 }
 730
 731 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
 732 {
 733         struct perf_cgroup *cgrp = cpuctx->cgrp;
 734         struct cgroup_subsys_state *css;
 735
 736         if (cgrp) {
 737                 for (css = &cgrp->css; css; css = css->parent) {
 738                         cgrp = container_of(css, struct perf_cgroup, css);
 739                         __update_cgrp_time(cgrp);
 740                 }
 741         }
 742 }
 743
 744 static inline void update_cgrp_time_from_event(struct perf_event *event)
 745 {
 746         struct perf_cgroup *cgrp;
 747
 748         /*
 749          * ensure we access cgroup data only when needed and
 750          * when we know the cgroup is pinned (css_get)
 751          */
 752         if (!is_cgroup_event(event))
 753                 return;
 754
 755         cgrp = perf_cgroup_from_task(current, event->ctx);
 756         /*
 757          * Do not update time when cgroup is not active
 758          */
 759        if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 760                 __update_cgrp_time(event->cgrp);
 761 }
 762
 763 static inline void
 764 perf_cgroup_set_timestamp(struct task_struct *task,
 765                           struct perf_event_context *ctx)
 766 {
 767         struct perf_cgroup *cgrp;
 768         struct perf_cgroup_info *info;
 769         struct cgroup_subsys_state *css;
 770
 771         /*
 772          * ctx->lock held by caller
 773          * ensure we do not access cgroup data
 774          * unless we have the cgroup pinned (css_get)
 775          */
 776         if (!task || !ctx->nr_cgroups)
 777                 return;
 778
 779         cgrp = perf_cgroup_from_task(task, ctx);
 780
 781         for (css = &cgrp->css; css; css = css->parent) {
 782                 cgrp = container_of(css, struct perf_cgroup, css);
 783                 info = this_cpu_ptr(cgrp->info);
 784                 info->timestamp = ctx->timestamp;
 785         }
 786 }
 787
 788 static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
 789
 790 #define PERF_CGROUP_SWOUT       0x1 /* cgroup switch out every event */
 791 #define PERF_CGROUP_SWIN        0x2 /* cgroup switch in events based on task */
 792
 793 /*
 794  * reschedule events based on the cgroup constraint of task.
 795  *
 796  * mode SWOUT : schedule out everything
 797  * mode SWIN : schedule in based on cgroup for next
 798  */
 799 static void perf_cgroup_switch(struct task_struct *task, int mode)
 800 {
 801         struct perf_cpu_context *cpuctx;
 802         struct list_head *list;
 803         unsigned long flags;
 804
 805         /*
 806          * Disable interrupts and preemption to avoid this CPU's
 807          * cgrp_cpuctx_entry to change under us.
 808          */
 809         local_irq_save(flags);
 810
 811         list = this_cpu_ptr(&cgrp_cpuctx_list);
 812         list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
 813                 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
 814
 815                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 816                 perf_pmu_disable(cpuctx->ctx.pmu);
 817
 818                 if (mode & PERF_CGROUP_SWOUT) {
 819                         cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 820                         /*
 821                          * must not be done before ctxswout due
 822                          * to event_filter_match() in event_sched_out()
 823                          */
 824                         cpuctx->cgrp = NULL;
 825                 }
 826
 827                 if (mode & PERF_CGROUP_SWIN) {
 828                         WARN_ON_ONCE(cpuctx->cgrp);
 829                         /*
 830                          * set cgrp before ctxsw in to allow
 831                          * event_filter_match() to not have to pass
 832                          * task around
 833                          * we pass the cpuctx->ctx to perf_cgroup_from_task()
 834                          * because cgorup events are only per-cpu
 835                          */
 836                         cpuctx->cgrp = perf_cgroup_from_task(task,
 837                                                              &cpuctx->ctx);
 838                         cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 839                 }
 840                 perf_pmu_enable(cpuctx->ctx.pmu);
 841                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 842         }
 843
 844         local_irq_restore(flags);
 845 }
 846
 847 static inline void perf_cgroup_sched_out(struct task_struct *task,
 848                                          struct task_struct *next)
 849 {
 850         struct perf_cgroup *cgrp1;
 851         struct perf_cgroup *cgrp2 = NULL;
 852
 853         rcu_read_lock();
 854         /*
 855          * we come here when we know perf_cgroup_events > 0
 856          * we do not need to pass the ctx here because we know
 857          * we are holding the rcu lock
 858          */
 859         cgrp1 = perf_cgroup_from_task(task, NULL);
 860         cgrp2 = perf_cgroup_from_task(next, NULL);
 861
 862         /*
 863          * only schedule out current cgroup events if we know
 864          * that we are switching to a different cgroup. Otherwise,
 865          * do no touch the cgroup events.
 866          */
 867         if (cgrp1 != cgrp2)
 868                 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
 869
 870         rcu_read_unlock();
 871 }
 872
 873 static inline void perf_cgroup_sched_in(struct task_struct *prev,
 874                                         struct task_struct *task)
 875 {
 876         struct perf_cgroup *cgrp1;
 877         struct perf_cgroup *cgrp2 = NULL;
 878
 879         rcu_read_lock();
 880         /*
 881          * we come here when we know perf_cgroup_events > 0
 882          * we do not need to pass the ctx here because we know
 883          * we are holding the rcu lock
 884          */
 885         cgrp1 = perf_cgroup_from_task(task, NULL);
 886         cgrp2 = perf_cgroup_from_task(prev, NULL);
 887
 888         /*
 889          * only need to schedule in cgroup events if we are changing
 890          * cgroup during ctxsw. Cgroup events were not scheduled
 891          * out of ctxsw out if that was not the case.
 892          */
 893         if (cgrp1 != cgrp2)
 894                 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
 895
 896         rcu_read_unlock();
 897 }
 898
 899 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 900                                       struct perf_event_attr *attr,
 901                                       struct perf_event *group_leader)
 902 {
 903         struct perf_cgroup *cgrp;
 904         struct cgroup_subsys_state *css;
 905         struct fd f = fdget(fd);
 906         int ret = 0;
 907
 908         if (!f.file)
 909                 return -EBADF;
 910
 911         css = css_tryget_online_from_dir(f.file->f_path.dentry,
 912                                          &perf_event_cgrp_subsys);
 913         if (IS_ERR(css)) {
 914                 ret = PTR_ERR(css);
 915                 goto out;
 916         }
 917
 918         cgrp = container_of(css, struct perf_cgroup, css);
 919         event->cgrp = cgrp;
 920
 921         /*
 922          * all events in a group must monitor
 923          * the same cgroup because a task belongs
 924          * to only one perf cgroup at a time
 925          */
 926         if (group_leader && group_leader->cgrp != cgrp) {
 927                 perf_detach_cgroup(event);
 928                 ret = -EINVAL;
 929         }
 930 out:
 931         fdput(f);
 932         return ret;
 933 }
 934
 935 static inline void
 936 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
 937 {
 938         struct perf_cgroup_info *t;
 939         t = per_cpu_ptr(event->cgrp->info, event->cpu);
 940         event->shadow_ctx_time = now - t->timestamp;
 941 }
 942
 943 /*
 944  * Update cpuctx->cgrp so that it is set when first cgroup event is added and
 945  * cleared when last cgroup event is removed.
 946  */
 947 static inline void
 948 list_update_cgroup_event(struct perf_event *event,
 949                          struct perf_event_context *ctx, bool add)
 950 {
 951         struct perf_cpu_context *cpuctx;
 952         struct list_head *cpuctx_entry;
 953
 954         if (!is_cgroup_event(event))
 955                 return;
 956
 957         /*
 958          * Because cgroup events are always per-cpu events,
 959          * this will always be called from the right CPU.
 960          */
 961         cpuctx = __get_cpu_context(ctx);
 962
 963         /*
 964          * Since setting cpuctx->cgrp is conditional on the current @cgrp
 965          * matching the event's cgroup, we must do this for every new event,
 966          * because if the first would mismatch, the second would not try again
 967          * and we would leave cpuctx->cgrp unset.
 968          */
 969         if (add && !cpuctx->cgrp) {
 970                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
 971
 972                 if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
 973                         cpuctx->cgrp = cgrp;
 974         }
 975
 976         if (add && ctx->nr_cgroups++)
 977                 return;
 978         else if (!add && --ctx->nr_cgroups)
 979                 return;
 980
 981         /* no cgroup running */
 982         if (!add)
 983                 cpuctx->cgrp = NULL;
 984
 985         cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
 986         if (add)
 987                 list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
 988         else
 989                 list_del(cpuctx_entry);
 990 }
 991
 992 #else /* !CONFIG_CGROUP_PERF */
 993
 994 static inline bool
 995 perf_cgroup_match(struct perf_event *event)
 996 {
 997         return true;
 998 }
 999
1000 static inline void perf_detach_cgroup(struct perf_event *event)
1001 {}
1002
1003 static inline int is_cgroup_event(struct perf_event *event)
1004 {
1005         return 0;
1006 }
1007
1008 static inline void update_cgrp_time_from_event(struct perf_event *event)
1009 {
1010 }
1011
1012 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
1013 {
1014 }
1015
1016 static inline void perf_cgroup_sched_out(struct task_struct *task,
1017                                          struct task_struct *next)
1018 {
1019 }
1020
1021 static inline void perf_cgroup_sched_in(struct task_struct *prev,
1022                                         struct task_struct *task)
1023 {
1024 }
1025
1026 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
1027                                       struct perf_event_attr *attr,
1028                                       struct perf_event *group_leader)
1029 {
1030         return -EINVAL;
1031 }
1032
1033 static inline void
1034 perf_cgroup_set_timestamp(struct task_struct *task,
1035                           struct perf_event_context *ctx)
1036 {
1037 }
1038
1039 void
1040 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
1041 {
1042 }
1043
1044 static inline void
1045 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
1046 {
1047 }
1048
1049 static inline u64 perf_cgroup_event_time(struct perf_event *event)
1050 {
1051         return 0;
1052 }
1053
1054 static inline void
1055 list_update_cgroup_event(struct perf_event *event,
1056                          struct perf_event_context *ctx, bool add)
1057 {
1058 }
1059
1060 #endif
1061
1062 /*
1063  * set default to be dependent on timer tick just
1064  * like original code
1065  */
1066 #define PERF_CPU_HRTIMER (1000 / HZ)
1067 /*
1068  * function must be called with interrupts disabled
1069  */
1070 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1071 {
1072         struct perf_cpu_context *cpuctx;
1073         bool rotations;
1074
1075         lockdep_assert_irqs_disabled();
1076
1077         cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1078         rotations = perf_rotate_context(cpuctx);
1079
1080         raw_spin_lock(&cpuctx->hrtimer_lock);
1081         if (rotations)
1082                 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1083         else
1084                 cpuctx->hrtimer_active = 0;
1085         raw_spin_unlock(&cpuctx->hrtimer_lock);
1086
1087         return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1088 }
1089
1090 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1091 {
1092         struct hrtimer *timer = &cpuctx->hrtimer;
1093         struct pmu *pmu = cpuctx->ctx.pmu;
1094         u64 interval;
1095
1096         /* no multiplexing needed for SW PMU */
1097         if (pmu->task_ctx_nr == perf_sw_context)
1098                 return;
1099
1100         /*
1101          * check default is sane, if not set then force to
1102          * default interval (1/tick)
1103          */
1104         interval = pmu->hrtimer_interval_ms;
1105         if (interval < 1)
1106                 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1107
1108         cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1109
1110         raw_spin_lock_init(&cpuctx->hrtimer_lock);
1111         hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1112         timer->function = perf_mux_hrtimer_handler;
1113 }
1114
1115 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1116 {
1117         struct hrtimer *timer = &cpuctx->hrtimer;
1118         struct pmu *pmu = cpuctx->ctx.pmu;
1119         unsigned long flags;
1120
1121         /* not for SW PMU */
1122         if (pmu->task_ctx_nr == perf_sw_context)
1123                 return 0;
1124
1125         raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1126         if (!cpuctx->hrtimer_active) {
1127                 cpuctx->hrtimer_active = 1;
1128                 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1129                 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1130         }
1131         raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1132
1133         return 0;
1134 }
1135
1136 void perf_pmu_disable(struct pmu *pmu)
1137 {
1138         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1139         if (!(*count)++)
1140                 pmu->pmu_disable(pmu);
1141 }
1142
1143 void perf_pmu_enable(struct pmu *pmu)
1144 {
1145         int *count = this_cpu_ptr(pmu->pmu_disable_count);
1146         if (!--(*count))
1147                 pmu->pmu_enable(pmu);
1148 }
1149
1150 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1151
1152 /*
1153  * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1154  * perf_event_task_tick() are fully serialized because they're strictly cpu
1155  * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1156  * disabled, while perf_event_task_tick is called from IRQ context.
1157  */
1158 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1159 {
1160         struct list_head *head = this_cpu_ptr(&active_ctx_list);
1161
1162         lockdep_assert_irqs_disabled();
1163
1164         WARN_ON(!list_empty(&ctx->active_ctx_list));
1165
1166         list_add(&ctx->active_ctx_list, head);
1167 }
1168
1169 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1170 {
1171         lockdep_assert_irqs_disabled();
1172
1173         WARN_ON(list_empty(&ctx->active_ctx_list));
1174
1175         list_del_init(&ctx->active_ctx_list);
1176 }
1177
1178 static void get_ctx(struct perf_event_context *ctx)
1179 {
1180         WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
1181 }
1182
1183 static void free_ctx(struct rcu_head *head)
1184 {
1185         struct perf_event_context *ctx;
1186
1187         ctx = container_of(head, struct perf_event_context, rcu_head);
1188         kfree(ctx->task_ctx_data);
1189         kfree(ctx);
1190 }
1191
1192 static void put_ctx(struct perf_event_context *ctx)
1193 {
1194         if (atomic_dec_and_test(&ctx->refcount)) {
1195                 if (ctx->parent_ctx)
1196                         put_ctx(ctx->parent_ctx);
1197                 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1198                         put_task_struct(ctx->task);
1199                 call_rcu(&ctx->rcu_head, free_ctx);
1200         }
1201 }
1202
1203 /*
1204  * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1205  * perf_pmu_migrate_context() we need some magic.
1206  *
1207  * Those places that change perf_event::ctx will hold both
1208  * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
1209  *
1210  * Lock ordering is by mutex address. There are two other sites where
1211  * perf_event_context::mutex nests and those are:
1212  *
1213  *  - perf_event_exit_task_context()    [ child , 0 ]
1214  *      perf_event_exit_event()
1215  *        put_event()                   [ parent, 1 ]
1216  *
1217  *  - perf_event_init_context()         [ parent, 0 ]
1218  *      inherit_task_group()
1219  *        inherit_group()
1220  *          inherit_event()
1221  *            perf_event_alloc()
1222  *              perf_init_event()
1223  *                perf_try_init_event() [ child , 1 ]
1224  *
1225  * While it appears there is an obvious deadlock here -- the parent and child
1226  * nesting levels are inverted between the two. This is in fact safe because
1227  * life-time rules separate them. That is an exiting task cannot fork, and a
1228  * spawning task cannot (yet) exit.
1229  *
1230  * But remember that that these are parent<->child context relations, and
1231  * migration does not affect children, therefore these two orderings should not
1232  * interact.
1233  *
1234  * The change in perf_event::ctx does not affect children (as claimed above)
1235  * because the sys_perf_event_open() case will install a new event and break
1236  * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1237  * concerned with cpuctx and that doesn't have children.
1238  *
1239  * The places that change perf_event::ctx will issue:
1240  *
1241  *   perf_remove_from_context();
1242  *   synchronize_rcu();
1243  *   perf_install_in_context();
1244  *
1245  * to affect the change. The remove_from_context() + synchronize_rcu() should
1246  * quiesce the event, after which we can install it in the new location. This
1247  * means that only external vectors (perf_fops, prctl) can perturb the event
1248  * while in transit. Therefore all such accessors should also acquire
1249  * perf_event_context::mutex to serialize against this.
1250  *
1251  * However; because event->ctx can change while we're waiting to acquire
1252  * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1253  * function.
1254  *
1255  * Lock order:
1256  *    cred_guard_mutex
1257  *      task_struct::perf_event_mutex
1258  *        perf_event_context::mutex
1259  *          perf_event::child_mutex;
1260  *            perf_event_context::lock
1261  *          perf_event::mmap_mutex
1262  *          mmap_sem
1263  *            perf_addr_filters_head::lock
1264  *
1265  *    cpu_hotplug_lock
1266  *      pmus_lock
1267  *        cpuctx->mutex / perf_event_context::mutex
1268  */
1269 static struct perf_event_context *
1270 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1271 {
1272         struct perf_event_context *ctx;
1273
1274 again:
1275         rcu_read_lock();
1276         ctx = READ_ONCE(event->ctx);
1277         if (!atomic_inc_not_zero(&ctx->refcount)) {
1278                 rcu_read_unlock();
1279                 goto again;
1280         }
1281         rcu_read_unlock();
1282
1283         mutex_lock_nested(&ctx->mutex, nesting);
1284         if (event->ctx != ctx) {
1285                 mutex_unlock(&ctx->mutex);
1286                 put_ctx(ctx);
1287                 goto again;
1288         }
1289
1290         return ctx;
1291 }
1292
1293 static inline struct perf_event_context *
1294 perf_event_ctx_lock(struct perf_event *event)
1295 {
1296         return perf_event_ctx_lock_nested(event, 0);
1297 }
1298
1299 static void perf_event_ctx_unlock(struct perf_event *event,
1300                                   struct perf_event_context *ctx)
1301 {
1302         mutex_unlock(&ctx->mutex);
1303         put_ctx(ctx);
1304 }
1305
1306 /*
1307  * This must be done under the ctx->lock, such as to serialize against
1308  * context_equiv(), therefore we cannot call put_ctx() since that might end up
1309  * calling scheduler related locks and ctx->lock nests inside those.
1310  */
1311 static __must_check struct perf_event_context *
1312 unclone_ctx(struct perf_event_context *ctx)
1313 {
1314         struct perf_event_context *parent_ctx = ctx->parent_ctx;
1315
1316         lockdep_assert_held(&ctx->lock);
1317
1318         if (parent_ctx)
1319                 ctx->parent_ctx = NULL;
1320         ctx->generation++;
1321
1322         return parent_ctx;
1323 }
1324
1325 static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
1326                                 enum pid_type type)
1327 {
1328         u32 nr;
1329         /*
1330          * only top level events have the pid namespace they were created in
1331          */
1332         if (event->parent)
1333                 event = event->parent;
1334
1335         nr = __task_pid_nr_ns(p, type, event->ns);
1336         /* avoid -1 if it is idle thread or runs in another ns */
1337         if (!nr && !pid_alive(p))
1338                 nr = -1;
1339         return nr;
1340 }
1341
1342 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1343 {
1344         return perf_event_pid_type(event, p, PIDTYPE_TGID);
1345 }
1346
1347 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1348 {
1349         return perf_event_pid_type(event, p, PIDTYPE_PID);
1350 }
1351
1352 /*
1353  * If we inherit events we want to return the parent event id
1354  * to userspace.
1355  */
1356 static u64 primary_event_id(struct perf_event *event)
1357 {
1358         u64 id = event->id;
1359
1360         if (event->parent)
1361                 id = event->parent->id;
1362
1363         return id;
1364 }
1365
1366 /*
1367  * Get the perf_event_context for a task and lock it.
1368  *
1369  * This has to cope with with the fact that until it is locked,
1370  * the context could get moved to another task.
1371  */
1372 static struct perf_event_context *
1373 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1374 {
1375         struct perf_event_context *ctx;
1376
1377 retry:
1378         /*
1379          * One of the few rules of preemptible RCU is that one cannot do
1380          * rcu_read_unlock() while holding a scheduler (or nested) lock when
1381          * part of the read side critical section was irqs-enabled -- see
1382          * rcu_read_unlock_special().
1383          *
1384          * Since ctx->lock nests under rq->lock we must ensure the entire read
1385          * side critical section has interrupts disabled.
1386          */
1387         local_irq_save(*flags);
1388         rcu_read_lock();
1389         ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1390         if (ctx) {
1391                 /*
1392                  * If this context is a clone of another, it might
1393                  * get swapped for another underneath us by
1394                  * perf_event_task_sched_out, though the
1395                  * rcu_read_lock() protects us from any context
1396                  * getting freed.  Lock the context and check if it
1397                  * got swapped before we could get the lock, and retry
1398                  * if so.  If we locked the right context, then it
1399                  * can't get swapped on us any more.
1400                  */
1401                 raw_spin_lock(&ctx->lock);
1402                 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1403                         raw_spin_unlock(&ctx->lock);
1404                         rcu_read_unlock();
1405                         local_irq_restore(*flags);
1406                         goto retry;
1407                 }
1408
1409                 if (ctx->task == TASK_TOMBSTONE ||
1410                     !atomic_inc_not_zero(&ctx->refcount)) {
1411                         raw_spin_unlock(&ctx->lock);
1412                         ctx = NULL;
1413                 } else {
1414                         WARN_ON_ONCE(ctx->task != task);
1415                 }
1416         }
1417         rcu_read_unlock();
1418         if (!ctx)
1419                 local_irq_restore(*flags);
1420         return ctx;
1421 }
1422
1423 /*
1424  * Get the context for a task and increment its pin_count so it
1425  * can't get swapped to another task.  This also increments its
1426  * reference count so that the context can't get freed.
1427  */
1428 static struct perf_event_context *
1429 perf_pin_task_context(struct task_struct *task, int ctxn)
1430 {
1431         struct perf_event_context *ctx;
1432         unsigned long flags;
1433
1434         ctx = perf_lock_task_context(task, ctxn, &flags);
1435         if (ctx) {
1436                 ++ctx->pin_count;
1437                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1438         }
1439         return ctx;
1440 }
1441
1442 static void perf_unpin_context(struct perf_event_context *ctx)
1443 {
1444         unsigned long flags;
1445
1446         raw_spin_lock_irqsave(&ctx->lock, flags);
1447         --ctx->pin_count;
1448         raw_spin_unlock_irqrestore(&ctx->lock, flags);
1449 }
1450
1451 /*
1452  * Update the record of the current time in a context.
1453  */
1454 static void update_context_time(struct perf_event_context *ctx)
1455 {
1456         u64 now = perf_clock();
1457
1458         ctx->time += now - ctx->timestamp;
1459         ctx->timestamp = now;
1460 }
1461
1462 static u64 perf_event_time(struct perf_event *event)
1463 {
1464         struct perf_event_context *ctx = event->ctx;
1465
1466         if (is_cgroup_event(event))
1467                 return perf_cgroup_event_time(event);
1468
1469         return ctx ? ctx->time : 0;
1470 }
1471
1472 static enum event_type_t get_event_type(struct perf_event *event)
1473 {
1474         struct perf_event_context *ctx = event->ctx;
1475         enum event_type_t event_type;
1476
1477         lockdep_assert_held(&ctx->lock);
1478
1479         /*
1480          * It's 'group type', really, because if our group leader is
1481          * pinned, so are we.
1482          */
1483         if (event->group_leader != event)
1484                 event = event->group_leader;
1485
1486         event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
1487         if (!ctx->task)
1488                 event_type |= EVENT_CPU;
1489
1490         return event_type;
1491 }
1492
1493 /*
1494  * Helper function to initialize event group nodes.
1495  */
1496 static void init_event_group(struct perf_event *event)
1497 {
1498         RB_CLEAR_NODE(&event->group_node);
1499         event->group_index = 0;
1500 }
1501
1502 /*
1503  * Extract pinned or flexible groups from the context
1504  * based on event attrs bits.
1505  */
1506 static struct perf_event_groups *
1507 get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
1508 {
1509         if (event->attr.pinned)
1510                 return &ctx->pinned_groups;
1511         else
1512                 return &ctx->flexible_groups;
1513 }
1514
1515 /*
1516  * Helper function to initializes perf_event_group trees.
1517  */
1518 static void perf_event_groups_init(struct perf_event_groups *groups)
1519 {
1520         groups->tree = RB_ROOT;
1521         groups->index = 0;
1522 }
1523
1524 /*
1525  * Compare function for event groups;
1526  *
1527  * Implements complex key that first sorts by CPU and then by virtual index
1528  * which provides ordering when rotating groups for the same CPU.
1529  */
1530 static bool
1531 perf_event_groups_less(struct perf_event *left, struct perf_event *right)
1532 {
1533         if (left->cpu < right->cpu)
1534                 return true;
1535         if (left->cpu > right->cpu)
1536                 return false;
1537
1538         if (left->group_index < right->group_index)
1539                 return true;
1540         if (left->group_index > right->group_index)
1541                 return false;
1542
1543         return false;
1544 }
1545
1546 /*
1547  * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
1548  * key (see perf_event_groups_less). This places it last inside the CPU
1549  * subtree.
1550  */
1551 static void
1552 perf_event_groups_insert(struct perf_event_groups *groups,
1553                          struct perf_event *event)
1554 {
1555         struct perf_event *node_event;
1556         struct rb_node *parent;
1557         struct rb_node **node;
1558
1559         event->group_index = ++groups->index;
1560
1561         node = &groups->tree.rb_node;
1562         parent = *node;
1563
1564         while (*node) {
1565                 parent = *node;
1566                 node_event = container_of(*node, struct perf_event, group_node);
1567
1568                 if (perf_event_groups_less(event, node_event))
1569                         node = &parent->rb_left;
1570                 else
1571                         node = &parent->rb_right;
1572         }
1573
1574         rb_link_node(&event->group_node, parent, node);
1575         rb_insert_color(&event->group_node, &groups->tree);
1576 }
1577
1578 /*
1579  * Helper function to insert event into the pinned or flexible groups.
1580  */
1581 static void
1582 add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
1583 {
1584         struct perf_event_groups *groups;
1585
1586         groups = get_event_groups(event, ctx);
1587         perf_event_groups_insert(groups, event);
1588 }
1589
1590 /*
1591  * Delete a group from a tree.
1592  */
1593 static void
1594 perf_event_groups_delete(struct perf_event_groups *groups,
1595                          struct perf_event *event)
1596 {
1597         WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
1598                      RB_EMPTY_ROOT(&groups->tree));
1599
1600         rb_erase(&event->group_node, &groups->tree);
1601         init_event_group(event);
1602 }
1603
1604 /*
1605  * Helper function to delete event from its groups.
1606  */
1607 static void
1608 del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
1609 {
1610         struct perf_event_groups *groups;
1611
1612         groups = get_event_groups(event, ctx);
1613         perf_event_groups_delete(groups, event);
1614 }
1615
1616 /*
1617  * Get the leftmost event in the @cpu subtree.
1618  */
1619 static struct perf_event *
1620 perf_event_groups_first(struct perf_event_groups *groups, int cpu)
1621 {
1622         struct perf_event *node_event = NULL, *match = NULL;
1623         struct rb_node *node = groups->tree.rb_node;
1624
1625         while (node) {
1626                 node_event = container_of(node, struct perf_event, group_node);
1627
1628                 if (cpu < node_event->cpu) {
1629                         node = node->rb_left;
1630                 } else if (cpu > node_event->cpu) {
1631                         node = node->rb_right;
1632                 } else {
1633                         match = node_event;
1634                         node = node->rb_left;
1635                 }
1636         }
1637
1638         return match;
1639 }
1640
1641 /*
1642  * Like rb_entry_next_safe() for the @cpu subtree.
1643  */
1644 static struct perf_event *
1645 perf_event_groups_next(struct perf_event *event)
1646 {
1647         struct perf_event *next;
1648
1649         next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
1650         if (next && next->cpu == event->cpu)
1651                 return next;
1652
1653         return NULL;
1654 }
1655
1656 /*
1657  * Iterate through the whole groups tree.
1658  */
1659 #define perf_event_groups_for_each(event, groups)                       \
1660         for (event = rb_entry_safe(rb_first(&((groups)->tree)),         \
1661                                 typeof(*event), group_node); event;     \
1662                 event = rb_entry_safe(rb_next(&event->group_node),      \
1663                                 typeof(*event), group_node))
1664
1665 /*
1666  * Add an event from the lists for its context.
1667  * Must be called with ctx->mutex and ctx->lock held.
1668  */
1669 static void
1670 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1671 {
1672         lockdep_assert_held(&ctx->lock);
1673
1674         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1675         event->attach_state |= PERF_ATTACH_CONTEXT;
1676
1677         event->tstamp = perf_event_time(event);
1678
1679         /*
1680          * If we're a stand alone event or group leader, we go to the context
1681          * list, group events are kept attached to the group so that
1682          * perf_group_detach can, at all times, locate all siblings.
1683          */
1684         if (event->group_leader == event) {
1685                 event->group_caps = event->event_caps;
1686                 add_event_to_groups(event, ctx);
1687         }
1688
1689         list_update_cgroup_event(event, ctx, true);
1690
1691         list_add_rcu(&event->event_entry, &ctx->event_list);
1692         ctx->nr_events++;
1693         if (event->attr.inherit_stat)
1694                 ctx->nr_stat++;
1695
1696         ctx->generation++;
1697 }
1698
1699 /*
1700  * Initialize event state based on the perf_event_attr::disabled.
1701  */
1702 static inline void perf_event__state_init(struct perf_event *event)
1703 {
1704         event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1705                                               PERF_EVENT_STATE_INACTIVE;
1706 }
1707
1708 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1709 {
1710         int entry = sizeof(u64); /* value */
1711         int size = 0;
1712         int nr = 1;
1713
1714         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1715                 size += sizeof(u64);
1716
1717         if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1718                 size += sizeof(u64);
1719
1720         if (event->attr.read_format & PERF_FORMAT_ID)
1721                 entry += sizeof(u64);
1722
1723         if (event->attr.read_format & PERF_FORMAT_GROUP) {
1724                 nr += nr_siblings;
1725                 size += sizeof(u64);
1726         }
1727
1728         size += entry * nr;
1729         event->read_size = size;
1730 }
1731
1732 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1733 {
1734         struct perf_sample_data *data;
1735         u16 size = 0;
1736
1737         if (sample_type & PERF_SAMPLE_IP)
1738                 size += sizeof(data->ip);
1739
1740         if (sample_type & PERF_SAMPLE_ADDR)
1741                 size += sizeof(data->addr);
1742
1743         if (sample_type & PERF_SAMPLE_PERIOD)
1744                 size += sizeof(data->period);
1745
1746         if (sample_type & PERF_SAMPLE_WEIGHT)
1747                 size += sizeof(data->weight);
1748
1749         if (sample_type & PERF_SAMPLE_READ)
1750                 size += event->read_size;
1751
1752         if (sample_type & PERF_SAMPLE_DATA_SRC)
1753                 size += sizeof(data->data_src.val);
1754
1755         if (sample_type & PERF_SAMPLE_TRANSACTION)
1756                 size += sizeof(data->txn);
1757
1758         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
1759                 size += sizeof(data->phys_addr);
1760
1761         event->header_size = size;
1762 }
1763
1764 /*
1765  * Called at perf_event creation and when events are attached/detached from a
1766  * group.
1767  */
1768 static void perf_event__header_size(struct perf_event *event)
1769 {
1770         __perf_event_read_size(event,
1771                                event->group_leader->nr_siblings);
1772         __perf_event_header_size(event, event->attr.sample_type);
1773 }
1774
1775 static void perf_event__id_header_size(struct perf_event *event)
1776 {
1777         struct perf_sample_data *data;
1778         u64 sample_type = event->attr.sample_type;
1779         u16 size = 0;
1780
1781         if (sample_type & PERF_SAMPLE_TID)
1782                 size += sizeof(data->tid_entry);
1783
1784         if (sample_type & PERF_SAMPLE_TIME)
1785                 size += sizeof(data->time);
1786
1787         if (sample_type & PERF_SAMPLE_IDENTIFIER)
1788                 size += sizeof(data->id);
1789
1790         if (sample_type & PERF_SAMPLE_ID)
1791                 size += sizeof(data->id);
1792
1793         if (sample_type & PERF_SAMPLE_STREAM_ID)
1794                 size += sizeof(data->stream_id);
1795
1796         if (sample_type & PERF_SAMPLE_CPU)
1797                 size += sizeof(data->cpu_entry);
1798
1799         event->id_header_size = size;
1800 }
1801
1802 static bool perf_event_validate_size(struct perf_event *event)
1803 {
1804         /*
1805          * The values computed here will be over-written when we actually
1806          * attach the event.
1807          */
1808         __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1809         __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1810         perf_event__id_header_size(event);
1811
1812         /*
1813          * Sum the lot; should not exceed the 64k limit we have on records.
1814          * Conservative limit to allow for callchains and other variable fields.
1815          */
1816         if (event->read_size + event->header_size +
1817             event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1818                 return false;
1819
1820         return true;
1821 }
1822
1823 static void perf_group_attach(struct perf_event *event)
1824 {
1825         struct perf_event *group_leader = event->group_leader, *pos;
1826
1827         lockdep_assert_held(&event->ctx->lock);
1828
1829         /*
1830          * We can have double attach due to group movement in perf_event_open.
1831          */
1832         if (event->attach_state & PERF_ATTACH_GROUP)
1833                 return;
1834
1835         event->attach_state |= PERF_ATTACH_GROUP;
1836
1837         if (group_leader == event)
1838                 return;
1839
1840         WARN_ON_ONCE(group_leader->ctx != event->ctx);
1841
1842         group_leader->group_caps &= event->event_caps;
1843
1844         list_add_tail(&event->sibling_list, &group_leader->sibling_list);
1845         group_leader->nr_siblings++;
1846
1847         perf_event__header_size(group_leader);
1848
1849         for_each_sibling_event(pos, group_leader)
1850                 perf_event__header_size(pos);
1851 }
1852
1853 /*
1854  * Remove an event from the lists for its context.
1855  * Must be called with ctx->mutex and ctx->lock held.
1856  */
1857 static void
1858 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1859 {
1860         WARN_ON_ONCE(event->ctx != ctx);
1861         lockdep_assert_held(&ctx->lock);
1862
1863         /*
1864          * We can have double detach due to exit/hot-unplug + close.
1865          */
1866         if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1867                 return;
1868
1869         event->attach_state &= ~PERF_ATTACH_CONTEXT;
1870
1871         list_update_cgroup_event(event, ctx, false);
1872
1873         ctx->nr_events--;
1874         if (event->attr.inherit_stat)
1875                 ctx->nr_stat--;
1876
1877         list_del_rcu(&event->event_entry);
1878
1879         if (event->group_leader == event)
1880                 del_event_from_groups(event, ctx);
1881
1882         /*
1883          * If event was in error state, then keep it
1884          * that way, otherwise bogus counts will be
1885          * returned on read(). The only way to get out
1886          * of error state is by explicit re-enabling
1887          * of the event
1888          */
1889         if (event->state > PERF_EVENT_STATE_OFF)
1890                 perf_event_set_state(event, PERF_EVENT_STATE_OFF);
1891
1892         ctx->generation++;
1893 }
1894
1895 static void perf_group_detach(struct perf_event *event)
1896 {
1897         struct perf_event *sibling, *tmp;
1898         struct perf_event_context *ctx = event->ctx;
1899
1900         lockdep_assert_held(&ctx->lock);
1901
1902         /*
1903          * We can have double detach due to exit/hot-unplug + close.
1904          */
1905         if (!(event->attach_state & PERF_ATTACH_GROUP))
1906                 return;
1907
1908         event->attach_state &= ~PERF_ATTACH_GROUP;
1909
1910         /*
1911          * If this is a sibling, remove it from its group.
1912          */
1913         if (event->group_leader != event) {
1914                 list_del_init(&event->sibling_list);
1915                 event->group_leader->nr_siblings--;
1916                 goto out;
1917         }
1918
1919         /*
1920          * If this was a group event with sibling events then
1921          * upgrade the siblings to singleton events by adding them
1922          * to whatever list we are on.
1923          */
1924         list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
1925
1926                 sibling->group_leader = sibling;
1927                 list_del_init(&sibling->sibling_list);
1928
1929                 /* Inherit group flags from the previous leader */
1930                 sibling->group_caps = event->group_caps;
1931
1932                 if (!RB_EMPTY_NODE(&event->group_node)) {
1933                         add_event_to_groups(sibling, event->ctx);
1934
1935                         if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
1936                                 struct list_head *list = sibling->attr.pinned ?
1937                                         &ctx->pinned_active : &ctx->flexible_active;
1938
1939                                 list_add_tail(&sibling->active_list, list);
1940                         }
1941                 }
1942
1943                 WARN_ON_ONCE(sibling->ctx != event->ctx);
1944         }
1945
1946 out:
1947         perf_event__header_size(event->group_leader);
1948
1949         for_each_sibling_event(tmp, event->group_leader)
1950                 perf_event__header_size(tmp);
1951 }
1952
1953 static bool is_orphaned_event(struct perf_event *event)
1954 {
1955         return event->state == PERF_EVENT_STATE_DEAD;
1956 }
1957
1958 static inline int __pmu_filter_match(struct perf_event *event)
1959 {
1960         struct pmu *pmu = event->pmu;
1961         return pmu->filter_match ? pmu->filter_match(event) : 1;
1962 }
1963
1964 /*
1965  * Check whether we should attempt to schedule an event group based on
1966  * PMU-specific filtering. An event group can consist of HW and SW events,
1967  * potentially with a SW leader, so we must check all the filters, to
1968  * determine whether a group is schedulable:
1969  */
1970 static inline int pmu_filter_match(struct perf_event *event)
1971 {
1972         struct perf_event *sibling;
1973
1974         if (!__pmu_filter_match(event))
1975                 return 0;
1976
1977         for_each_sibling_event(sibling, event) {
1978                 if (!__pmu_filter_match(sibling))
1979                         return 0;
1980         }
1981
1982         return 1;
1983 }
1984
1985 static inline int
1986 event_filter_match(struct perf_event *event)
1987 {
1988         return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
1989                perf_cgroup_match(event) && pmu_filter_match(event);
1990 }
1991
1992 static void
1993 event_sched_out(struct perf_event *event,
1994                   struct perf_cpu_context *cpuctx,
1995                   struct perf_event_context *ctx)
1996 {
1997         enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
1998
1999         WARN_ON_ONCE(event->ctx != ctx);
2000         lockdep_assert_held(&ctx->lock);
2001
2002         if (event->state != PERF_EVENT_STATE_ACTIVE)
2003                 return;
2004
2005         /*
2006          * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
2007          * we can schedule events _OUT_ individually through things like
2008          * __perf_remove_from_context().
2009          */
2010         list_del_init(&event->active_list);
2011
2012         perf_pmu_disable(event->pmu);
2013
2014         event->pmu->del(event, 0);
2015         event->oncpu = -1;
2016
2017         if (READ_ONCE(event->pending_disable) >= 0) {
2018                 WRITE_ONCE(event->pending_disable, -1);
2019                 state = PERF_EVENT_STATE_OFF;
2020         }
2021         perf_event_set_state(event, state);
2022
2023         if (!is_software_event(event))
2024                 cpuctx->active_oncpu--;
2025         if (!--ctx->nr_active)
2026                 perf_event_ctx_deactivate(ctx);
2027         if (event->attr.freq && event->attr.sample_freq)
2028                 ctx->nr_freq--;
2029         if (event->attr.exclusive || !cpuctx->active_oncpu)
2030                 cpuctx->exclusive = 0;
2031
2032         perf_pmu_enable(event->pmu);
2033 }
2034
2035 static void
2036 group_sched_out(struct perf_event *group_event,
2037                 struct perf_cpu_context *cpuctx,
2038                 struct perf_event_context *ctx)
2039 {
2040         struct perf_event *event;
2041
2042         if (group_event->state != PERF_EVENT_STATE_ACTIVE)
2043                 return;
2044
2045         perf_pmu_disable(ctx->pmu);
2046
2047         event_sched_out(group_event, cpuctx, ctx);
2048
2049         /*
2050          * Schedule out siblings (if any):
2051          */
2052         for_each_sibling_event(event, group_event)
2053                 event_sched_out(event, cpuctx, ctx);
2054
2055         perf_pmu_enable(ctx->pmu);
2056
2057         if (group_event->attr.exclusive)
2058                 cpuctx->exclusive = 0;
2059 }
2060
2061 #define DETACH_GROUP    0x01UL
2062
2063 /*
2064  * Cross CPU call to remove a performance event
2065  *
2066  * We disable the event on the hardware level first. After that we
2067  * remove it from the context list.
2068  */
2069 static void
2070 __perf_remove_from_context(struct perf_event *event,
2071                            struct perf_cpu_context *cpuctx,
2072                            struct perf_event_context *ctx,
2073                            void *info)
2074 {
2075         unsigned long flags = (unsigned long)info;
2076
2077         if (ctx->is_active & EVENT_TIME) {
2078                 update_context_time(ctx);
2079                 update_cgrp_time_from_cpuctx(cpuctx);
2080         }
2081
2082         event_sched_out(event, cpuctx, ctx);
2083         if (flags & DETACH_GROUP)
2084                 perf_group_detach(event);
2085         list_del_event(event, ctx);
2086
2087         if (!ctx->nr_events && ctx->is_active) {
2088                 ctx->is_active = 0;
2089                 if (ctx->task) {
2090                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2091                         cpuctx->task_ctx = NULL;
2092                 }
2093         }
2094 }
2095
2096 /*
2097  * Remove the event from a task's (or a CPU's) list of events.
2098  *
2099  * If event->ctx is a cloned context, callers must make sure that
2100  * every task struct that event->ctx->task could possibly point to
2101  * remains valid.  This is OK when called from perf_release since
2102  * that only calls us on the top-level context, which can't be a clone.
2103  * When called from perf_event_exit_task, it's OK because the
2104  * context has been detached from its task.
2105  */
2106 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
2107 {
2108         struct perf_event_context *ctx = event->ctx;
2109
2110         lockdep_assert_held(&ctx->mutex);
2111
2112         event_function_call(event, __perf_remove_from_context, (void *)flags);
2113
2114         /*
2115          * The above event_function_call() can NO-OP when it hits
2116          * TASK_TOMBSTONE. In that case we must already have been detached
2117          * from the context (by perf_event_exit_event()) but the grouping
2118          * might still be in-tact.
2119          */
2120         WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
2121         if ((flags & DETACH_GROUP) &&
2122             (event->attach_state & PERF_ATTACH_GROUP)) {
2123                 /*
2124                  * Since in that case we cannot possibly be scheduled, simply
2125                  * detach now.
2126                  */
2127                 raw_spin_lock_irq(&ctx->lock);
2128                 perf_group_detach(event);
2129                 raw_spin_unlock_irq(&ctx->lock);
2130         }
2131 }
2132
2133 /*
2134  * Cross CPU call to disable a performance event
2135  */
2136 static void __perf_event_disable(struct perf_event *event,
2137                                  struct perf_cpu_context *cpuctx,
2138                                  struct perf_event_context *ctx,
2139                                  void *info)
2140 {
2141         if (event->state < PERF_EVENT_STATE_INACTIVE)
2142                 return;
2143
2144         if (ctx->is_active & EVENT_TIME) {
2145                 update_context_time(ctx);
2146                 update_cgrp_time_from_event(event);
2147         }
2148
2149         if (event == event->group_leader)
2150                 group_sched_out(event, cpuctx, ctx);
2151         else
2152                 event_sched_out(event, cpuctx, ctx);
2153
2154         perf_event_set_state(event, PERF_EVENT_STATE_OFF);
2155 }
2156
2157 /*
2158  * Disable an event.
2159  *
2160  * If event->ctx is a cloned context, callers must make sure that
2161  * every task struct that event->ctx->task could possibly point to
2162  * remains valid.  This condition is satisifed when called through
2163  * perf_event_for_each_child or perf_event_for_each because they
2164  * hold the top-level event's child_mutex, so any descendant that
2165  * goes to exit will block in perf_event_exit_event().
2166  *
2167  * When called from perf_pending_event it's OK because event->ctx
2168  * is the current context on this CPU and preemption is disabled,
2169  * hence we can't get into perf_event_task_sched_out for this context.
2170  */
2171 static void _perf_event_disable(struct perf_event *event)
2172 {
2173         struct perf_event_context *ctx = event->ctx;
2174
2175         raw_spin_lock_irq(&ctx->lock);
2176         if (event->state <= PERF_EVENT_STATE_OFF) {
2177                 raw_spin_unlock_irq(&ctx->lock);
2178                 return;
2179         }
2180         raw_spin_unlock_irq(&ctx->lock);
2181
2182         event_function_call(event, __perf_event_disable, NULL);
2183 }
2184
2185 void perf_event_disable_local(struct perf_event *event)
2186 {
2187         event_function_local(event, __perf_event_disable, NULL);
2188 }
2189
2190 /*
2191  * Strictly speaking kernel users cannot create groups and therefore this
2192  * interface does not need the perf_event_ctx_lock() magic.
2193  */
2194 void perf_event_disable(struct perf_event *event)
2195 {
2196         struct perf_event_context *ctx;
2197
2198         ctx = perf_event_ctx_lock(event);
2199         _perf_event_disable(event);
2200         perf_event_ctx_unlock(event, ctx);
2201 }
2202 EXPORT_SYMBOL_GPL(perf_event_disable);
2203
2204 void perf_event_disable_inatomic(struct perf_event *event)
2205 {
2206         WRITE_ONCE(event->pending_disable, smp_processor_id());
2207         /* can fail, see perf_pending_event_disable() */
2208         irq_work_queue(&event->pending);
2209 }
2210
2211 static void perf_set_shadow_time(struct perf_event *event,
2212                                  struct perf_event_context *ctx)
2213 {
2214         /*
2215          * use the correct time source for the time snapshot
2216          *
2217          * We could get by without this by leveraging the
2218          * fact that to get to this function, the caller
2219          * has most likely already called update_context_time()
2220          * and update_cgrp_time_xx() and thus both timestamp
2221          * are identical (or very close). Given that tstamp is,
2222          * already adjusted for cgroup, we could say that:
2223          *    tstamp - ctx->timestamp
2224          * is equivalent to
2225          *    tstamp - cgrp->timestamp.
2226          *
2227          * Then, in perf_output_read(), the calculation would
2228          * work with no changes because:
2229          * - event is guaranteed scheduled in
2230          * - no scheduled out in between
2231          * - thus the timestamp would be the same
2232          *
2233          * But this is a bit hairy.
2234          *
2235          * So instead, we have an explicit cgroup call to remain
2236          * within the time time source all along. We believe it
2237          * is cleaner and simpler to understand.
2238          */
2239         if (is_cgroup_event(event))
2240                 perf_cgroup_set_shadow_time(event, event->tstamp);
2241         else
2242                 event->shadow_ctx_time = event->tstamp - ctx->timestamp;
2243 }
2244
2245 #define MAX_INTERRUPTS (~0ULL)
2246
2247 static void perf_log_throttle(struct perf_event *event, int enable);
2248 static void perf_log_itrace_start(struct perf_event *event);
2249
2250 static int
2251 event_sched_in(struct perf_event *event,
2252                  struct perf_cpu_context *cpuctx,
2253                  struct perf_event_context *ctx)
2254 {
2255         int ret = 0;
2256
2257         lockdep_assert_held(&ctx->lock);
2258
2259         if (event->state <= PERF_EVENT_STATE_OFF)
2260                 return 0;
2261
2262         WRITE_ONCE(event->oncpu, smp_processor_id());
2263         /*
2264          * Order event::oncpu write to happen before the ACTIVE state is
2265          * visible. This allows perf_event_{stop,read}() to observe the correct
2266          * ->oncpu if it sees ACTIVE.
2267          */
2268         smp_wmb();
2269         perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
2270
2271         /*
2272          * Unthrottle events, since we scheduled we might have missed several
2273          * ticks already, also for a heavily scheduling task there is little
2274          * guarantee it'll get a tick in a timely manner.
2275          */
2276         if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2277                 perf_log_throttle(event, 1);
2278                 event->hw.interrupts = 0;
2279         }
2280
2281         perf_pmu_disable(event->pmu);
2282
2283         perf_set_shadow_time(event, ctx);
2284
2285         perf_log_itrace_start(event);
2286
2287         if (event->pmu->add(event, PERF_EF_START)) {
2288                 perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2289                 event->oncpu = -1;
2290                 ret = -EAGAIN;
2291                 goto out;
2292         }
2293
2294         if (!is_software_event(event))
2295                 cpuctx->active_oncpu++;
2296         if (!ctx->nr_active++)
2297                 perf_event_ctx_activate(ctx);
2298         if (event->attr.freq && event->attr.sample_freq)
2299                 ctx->nr_freq++;
2300
2301         if (event->attr.exclusive)
2302                 cpuctx->exclusive = 1;
2303
2304 out:
2305         perf_pmu_enable(event->pmu);
2306
2307         return ret;
2308 }
2309
2310 static int
2311 group_sched_in(struct perf_event *group_event,
2312                struct perf_cpu_context *cpuctx,
2313                struct perf_event_context *ctx)
2314 {
2315         struct perf_event *event, *partial_group = NULL;
2316         struct pmu *pmu = ctx->pmu;
2317
2318         if (group_event->state == PERF_EVENT_STATE_OFF)
2319                 return 0;
2320
2321         pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2322
2323         if (event_sched_in(group_event, cpuctx, ctx)) {
2324                 pmu->cancel_txn(pmu);
2325                 perf_mux_hrtimer_restart(cpuctx);
2326                 return -EAGAIN;
2327         }
2328
2329         /*
2330          * Schedule in siblings as one group (if any):
2331          */
2332         for_each_sibling_event(event, group_event) {
2333                 if (event_sched_in(event, cpuctx, ctx)) {
2334                         partial_group = event;
2335                         goto group_error;
2336                 }
2337         }
2338
2339         if (!pmu->commit_txn(pmu))
2340                 return 0;
2341
2342 group_error:
2343         /*
2344          * Groups can be scheduled in as one unit only, so undo any
2345          * partial group before returning:
2346          * The events up to the failed event are scheduled out normally.
2347          */
2348         for_each_sibling_event(event, group_event) {
2349                 if (event == partial_group)
2350                         break;
2351
2352                 event_sched_out(event, cpuctx, ctx);
2353         }
2354         event_sched_out(group_event, cpuctx, ctx);
2355
2356         pmu->cancel_txn(pmu);
2357
2358         perf_mux_hrtimer_restart(cpuctx);
2359
2360         return -EAGAIN;
2361 }
2362
2363 /*
2364  * Work out whether we can put this event group on the CPU now.
2365  */
2366 static int group_can_go_on(struct perf_event *event,
2367                            struct perf_cpu_context *cpuctx,
2368                            int can_add_hw)
2369 {
2370         /*
2371          * Groups consisting entirely of software events can always go on.
2372          */
2373         if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2374                 return 1;
2375         /*
2376          * If an exclusive group is already on, no other hardware
2377          * events can go on.
2378          */
2379         if (cpuctx->exclusive)
2380                 return 0;
2381         /*
2382          * If this group is exclusive and there are already
2383          * events on the CPU, it can't go on.
2384          */
2385         if (event->attr.exclusive && cpuctx->active_oncpu)
2386                 return 0;
2387         /*
2388          * Otherwise, try to add it if all previous groups were able
2389          * to go on.
2390          */
2391         return can_add_hw;
2392 }
2393
2394 static void add_event_to_ctx(struct perf_event *event,
2395                                struct perf_event_context *ctx)
2396 {
2397         list_add_event(event, ctx);
2398         perf_group_attach(event);
2399 }
2400
2401 static void ctx_sched_out(struct perf_event_context *ctx,
2402                           struct perf_cpu_context *cpuctx,
2403                           enum event_type_t event_type);
2404 static void
2405 ctx_sched_in(struct perf_event_context *ctx,
2406              struct perf_cpu_context *cpuctx,
2407              enum event_type_t event_type,
2408              struct task_struct *task);
2409
2410 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2411                                struct perf_event_context *ctx,
2412                                enum event_type_t event_type)
2413 {
2414         if (!cpuctx->task_ctx)
2415                 return;
2416
2417         if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2418                 return;
2419
2420         ctx_sched_out(ctx, cpuctx, event_type);
2421 }
2422
2423 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2424                                 struct perf_event_context *ctx,
2425                                 struct task_struct *task)
2426 {
2427         cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2428         if (ctx)
2429                 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2430         cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2431         if (ctx)
2432                 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2433 }
2434
2435 /*
2436  * We want to maintain the following priority of scheduling:
2437  *  - CPU pinned (EVENT_CPU | EVENT_PINNED)
2438  *  - task pinned (EVENT_PINNED)
2439  *  - CPU flexible (EVENT_CPU | EVENT_FLEXIBLE)
2440  *  - task flexible (EVENT_FLEXIBLE).
2441  *
2442  * In order to avoid unscheduling and scheduling back in everything every
2443  * time an event is added, only do it for the groups of equal priority and
2444  * below.
2445  *
2446  * This can be called after a batch operation on task events, in which case
2447  * event_type is a bit mask of the types of events involved. For CPU events,
2448  * event_type is only either EVENT_PINNED or EVENT_FLEXIBLE.
2449  */
2450 static void ctx_resched(struct perf_cpu_context *cpuctx,
2451                         struct perf_event_context *task_ctx,
2452                         enum event_type_t event_type)
2453 {
2454         enum event_type_t ctx_event_type;
2455         bool cpu_event = !!(event_type & EVENT_CPU);
2456
2457         /*
2458          * If pinned groups are involved, flexible groups also need to be
2459          * scheduled out.
2460          */
2461         if (event_type & EVENT_PINNED)
2462                 event_type |= EVENT_FLEXIBLE;
2463
2464         ctx_event_type = event_type & EVENT_ALL;
2465
2466         perf_pmu_disable(cpuctx->ctx.pmu);
2467         if (task_ctx)
2468                 task_ctx_sched_out(cpuctx, task_ctx, event_type);
2469
2470         /*
2471          * Decide which cpu ctx groups to schedule out based on the types
2472          * of events that caused rescheduling:
2473          *  - EVENT_CPU: schedule out corresponding groups;
2474          *  - EVENT_PINNED task events: schedule out EVENT_FLEXIBLE groups;
2475          *  - otherwise, do nothing more.
2476          */
2477         if (cpu_event)
2478                 cpu_ctx_sched_out(cpuctx, ctx_event_type);
2479         else if (ctx_event_type & EVENT_PINNED)
2480                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2481
2482         perf_event_sched_in(cpuctx, task_ctx, current);
2483         perf_pmu_enable(cpuctx->ctx.pmu);
2484 }
2485
2486 /*
2487  * Cross CPU call to install and enable a performance event
2488  *
2489  * Very similar to remote_function() + event_function() but cannot assume that
2490  * things like ctx->is_active and cpuctx->task_ctx are set.
2491  */
2492 static int  __perf_install_in_context(void *info)
2493 {
2494         struct perf_event *event = info;
2495         struct perf_event_context *ctx = event->ctx;
2496         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2497         struct perf_event_context *task_ctx = cpuctx->task_ctx;
2498         bool reprogram = true;
2499         int ret = 0;
2500
2501         raw_spin_lock(&cpuctx->ctx.lock);
2502         if (ctx->task) {
2503                 raw_spin_lock(&ctx->lock);
2504                 task_ctx = ctx;
2505
2506                 reprogram = (ctx->task == current);
2507
2508                 /*
2509                  * If the task is running, it must be running on this CPU,
2510                  * otherwise we cannot reprogram things.
2511                  *
2512                  * If its not running, we don't care, ctx->lock will
2513                  * serialize against it becoming runnable.
2514                  */
2515                 if (task_curr(ctx->task) && !reprogram) {
2516                         ret = -ESRCH;
2517                         goto unlock;
2518                 }
2519
2520                 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2521         } else if (task_ctx) {
2522                 raw_spin_lock(&task_ctx->lock);
2523         }
2524
2525 #ifdef CONFIG_CGROUP_PERF
2526         if (is_cgroup_event(event)) {
2527                 /*
2528                  * If the current cgroup doesn't match the event's
2529                  * cgroup, we should not try to schedule it.
2530                  */
2531                 struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
2532                 reprogram = cgroup_is_descendant(cgrp->css.cgroup,
2533                                         event->cgrp->css.cgroup);
2534         }
2535 #endif
2536
2537         if (reprogram) {
2538                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2539                 add_event_to_ctx(event, ctx);
2540                 ctx_resched(cpuctx, task_ctx, get_event_type(event));
2541         } else {
2542                 add_event_to_ctx(event, ctx);
2543         }
2544
2545 unlock:
2546         perf_ctx_unlock(cpuctx, task_ctx);
2547
2548         return ret;
2549 }
2550
2551 static bool exclusive_event_installable(struct perf_event *event,
2552                                         struct perf_event_context *ctx);
2553
2554 /*
2555  * Attach a performance event to a context.
2556  *
2557  * Very similar to event_function_call, see comment there.
2558  */
2559 static void
2560 perf_install_in_context(struct perf_event_context *ctx,
2561                         struct perf_event *event,
2562                         int cpu)
2563 {
2564         struct task_struct *task = READ_ONCE(ctx->task);
2565
2566         lockdep_assert_held(&ctx->mutex);
2567
2568         WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
2569
2570         if (event->cpu != -1)
2571                 event->cpu = cpu;
2572
2573         /*
2574          * Ensures that if we can observe event->ctx, both the event and ctx
2575          * will be 'complete'. See perf_iterate_sb_cpu().
2576          */
2577         smp_store_release(&event->ctx, ctx);
2578
2579         if (!task) {
2580                 cpu_function_call(cpu, __perf_install_in_context, event);
2581                 return;
2582         }
2583
2584         /*
2585          * Should not happen, we validate the ctx is still alive before calling.
2586          */
2587         if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2588                 return;
2589
2590         /*
2591          * Installing events is tricky because we cannot rely on ctx->is_active
2592          * to be set in case this is the nr_events 0 -> 1 transition.
2593          *
2594          * Instead we use task_curr(), which tells us if the task is running.
2595          * However, since we use task_curr() outside of rq::lock, we can race
2596          * against the actual state. This means the result can be wrong.
2597          *
2598          * If we get a false positive, we retry, this is harmless.
2599          *
2600          * If we get a false negative, things are complicated. If we are after
2601          * perf_event_context_sched_in() ctx::lock will serialize us, and the
2602          * value must be correct. If we're before, it doesn't matter since
2603          * perf_event_context_sched_in() will program the counter.
2604          *
2605          * However, this hinges on the remote context switch having observed
2606          * our task->perf_event_ctxp[] store, such that it will in fact take
2607          * ctx::lock in perf_event_context_sched_in().
2608          *
2609          * We do this by task_function_call(), if the IPI fails to hit the task
2610          * we know any future context switch of task must see the
2611          * perf_event_ctpx[] store.
2612          */
2613
2614         /*
2615          * This smp_mb() orders the task->perf_event_ctxp[] store with the
2616          * task_cpu() load, such that if the IPI then does not find the task
2617          * running, a future context switch of that task must observe the
2618          * store.
2619          */
2620         smp_mb();
2621 again:
2622         if (!task_function_call(task, __perf_install_in_context, event))
2623                 return;
2624
2625         raw_spin_lock_irq(&ctx->lock);
2626         task = ctx->task;
2627         if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2628                 /*
2629                  * Cannot happen because we already checked above (which also
2630                  * cannot happen), and we hold ctx->mutex, which serializes us
2631                  * against perf_event_exit_task_context().
2632                  */
2633                 raw_spin_unlock_irq(&ctx->lock);
2634                 return;
2635         }
2636         /*
2637          * If the task is not running, ctx->lock will avoid it becoming so,
2638          * thus we can safely install the event.
2639          */
2640         if (task_curr(task)) {
2641                 raw_spin_unlock_irq(&ctx->lock);
2642                 goto again;
2643         }
2644         add_event_to_ctx(event, ctx);
2645         raw_spin_unlock_irq(&ctx->lock);
2646 }
2647
2648 /*
2649  * Cross CPU call to enable a performance event
2650  */
2651 static void __perf_event_enable(struct perf_event *event,
2652                                 struct perf_cpu_context *cpuctx,
2653                                 struct perf_event_context *ctx,
2654                                 void *info)
2655 {
2656         struct perf_event *leader = event->group_leader;
2657         struct perf_event_context *task_ctx;
2658
2659         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2660             event->state <= PERF_EVENT_STATE_ERROR)
2661                 return;
2662
2663         if (ctx->is_active)
2664                 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2665
2666         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
2667
2668         if (!ctx->is_active)
2669                 return;
2670
2671         if (!event_filter_match(event)) {
2672                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2673                 return;
2674         }
2675
2676         /*
2677          * If the event is in a group and isn't the group leader,
2678          * then don't put it on unless the group is on.
2679          */
2680         if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2681                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2682                 return;
2683         }
2684
2685         task_ctx = cpuctx->task_ctx;
2686         if (ctx->task)
2687                 WARN_ON_ONCE(task_ctx != ctx);
2688
2689         ctx_resched(cpuctx, task_ctx, get_event_type(event));
2690 }
2691
2692 /*
2693  * Enable an event.
2694  *
2695  * If event->ctx is a cloned context, callers must make sure that
2696  * every task struct that event->ctx->task could possibly point to
2697  * remains valid.  This condition is satisfied when called through
2698  * perf_event_for_each_child or perf_event_for_each as described
2699  * for perf_event_disable.
2700  */
2701 static void _perf_event_enable(struct perf_event *event)
2702 {
2703         struct perf_event_context *ctx = event->ctx;
2704
2705         raw_spin_lock_irq(&ctx->lock);
2706         if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2707             event->state <  PERF_EVENT_STATE_ERROR) {
2708                 raw_spin_unlock_irq(&ctx->lock);
2709                 return;
2710         }
2711
2712         /*
2713          * If the event is in error state, clear that first.
2714          *
2715          * That way, if we see the event in error state below, we know that it
2716          * has gone back into error state, as distinct from the task having
2717          * been scheduled away before the cross-call arrived.
2718          */
2719         if (event->state == PERF_EVENT_STATE_ERROR)
2720                 event->state = PERF_EVENT_STATE_OFF;
2721         raw_spin_unlock_irq(&ctx->lock);
2722
2723         event_function_call(event, __perf_event_enable, NULL);
2724 }
2725
2726 /*
2727  * See perf_event_disable();
2728  */
2729 void perf_event_enable(struct perf_event *event)
2730 {
2731         struct perf_event_context *ctx;
2732
2733         ctx = perf_event_ctx_lock(event);
2734         _perf_event_enable(event);
2735         perf_event_ctx_unlock(event, ctx);
2736 }
2737 EXPORT_SYMBOL_GPL(perf_event_enable);
2738
2739 struct stop_event_data {
2740         struct perf_event       *event;
2741         unsigned int            restart;
2742 };
2743
2744 static int __perf_event_stop(void *info)
2745 {
2746         struct stop_event_data *sd = info;
2747         struct perf_event *event = sd->event;
2748
2749         /* if it's already INACTIVE, do nothing */
2750         if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2751                 return 0;
2752
2753         /* matches smp_wmb() in event_sched_in() */
2754         smp_rmb();
2755
2756         /*
2757          * There is a window with interrupts enabled before we get here,
2758          * so we need to check again lest we try to stop another CPU's event.
2759          */
2760         if (READ_ONCE(event->oncpu) != smp_processor_id())
2761                 return -EAGAIN;
2762
2763         event->pmu->stop(event, PERF_EF_UPDATE);
2764
2765         /*
2766          * May race with the actual stop (through perf_pmu_output_stop()),
2767          * but it is only used for events with AUX ring buffer, and such
2768          * events will refuse to restart because of rb::aux_mmap_count==0,
2769          * see comments in perf_aux_output_begin().
2770          *
2771          * Since this is happening on an event-local CPU, no trace is lost
2772          * while restarting.
2773          */
2774         if (sd->restart)
2775                 event->pmu->start(event, 0);
2776
2777         return 0;
2778 }
2779
2780 static int perf_event_stop(struct perf_event *event, int restart)
2781 {
2782         struct stop_event_data sd = {
2783                 .event          = event,
2784                 .restart        = restart,
2785         };
2786         int ret = 0;
2787
2788         do {
2789                 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2790                         return 0;
2791
2792                 /* matches smp_wmb() in event_sched_in() */
2793                 smp_rmb();
2794
2795                 /*
2796                  * We only want to restart ACTIVE events, so if the event goes
2797                  * inactive here (event->oncpu==-1), there's nothing more to do;
2798                  * fall through with ret==-ENXIO.
2799                  */
2800                 ret = cpu_function_call(READ_ONCE(event->oncpu),
2801                                         __perf_event_stop, &sd);
2802         } while (ret == -EAGAIN);
2803
2804         return ret;
2805 }
2806
2807 /*
2808  * In order to contain the amount of racy and tricky in the address filter
2809  * configuration management, it is a two part process:
2810  *
2811  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
2812  *      we update the addresses of corresponding vmas in
2813  *      event::addr_filter_ranges array and bump the event::addr_filters_gen;
2814  * (p2) when an event is scheduled in (pmu::add), it calls
2815  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
2816  *      if the generation has changed since the previous call.
2817  *
2818  * If (p1) happens while the event is active, we restart it to force (p2).
2819  *
2820  * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
2821  *     pre-existing mappings, called once when new filters arrive via SET_FILTER
2822  *     ioctl;
2823  * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
2824  *     registered mapping, called for every new mmap(), with mm::mmap_sem down
2825  *     for reading;
2826  * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
2827  *     of exec.
2828  */
2829 void perf_event_addr_filters_sync(struct perf_event *event)
2830 {
2831         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2832
2833         if (!has_addr_filter(event))
2834                 return;
2835
2836         raw_spin_lock(&ifh->lock);
2837         if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2838                 event->pmu->addr_filters_sync(event);
2839                 event->hw.addr_filters_gen = event->addr_filters_gen;
2840         }
2841         raw_spin_unlock(&ifh->lock);
2842 }
2843 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
2844
2845 static int _perf_event_refresh(struct perf_event *event, int refresh)
2846 {
2847         /*
2848          * not supported on inherited events
2849          */
2850         if (event->attr.inherit || !is_sampling_event(event))
2851                 return -EINVAL;
2852
2853         atomic_add(refresh, &event->event_limit);
2854         _perf_event_enable(event);
2855
2856         return 0;
2857 }
2858
2859 /*
2860  * See perf_event_disable()
2861  */
2862 int perf_event_refresh(struct perf_event *event, int refresh)
2863 {
2864         struct perf_event_context *ctx;
2865         int ret;
2866
2867         ctx = perf_event_ctx_lock(event);
2868         ret = _perf_event_refresh(event, refresh);
2869         perf_event_ctx_unlock(event, ctx);
2870
2871         return ret;
2872 }
2873 EXPORT_SYMBOL_GPL(perf_event_refresh);
2874
2875 static int perf_event_modify_breakpoint(struct perf_event *bp,
2876                                          struct perf_event_attr *attr)
2877 {
2878         int err;
2879
2880         _perf_event_disable(bp);
2881
2882         err = modify_user_hw_breakpoint_check(bp, attr, true);
2883
2884         if (!bp->attr.disabled)
2885                 _perf_event_enable(bp);
2886
2887         return err;
2888 }
2889
2890 static int perf_event_modify_attr(struct perf_event *event,
2891                                   struct perf_event_attr *attr)
2892 {
2893         if (event->attr.type != attr->type)
2894                 return -EINVAL;
2895
2896         switch (event->attr.type) {
2897         case PERF_TYPE_BREAKPOINT:
2898                 return perf_event_modify_breakpoint(event, attr);
2899         default:
2900                 /* Place holder for future additions. */
2901                 return -EOPNOTSUPP;
2902         }
2903 }
2904
2905 static void ctx_sched_out(struct perf_event_context *ctx,
2906                           struct perf_cpu_context *cpuctx,
2907                           enum event_type_t event_type)
2908 {
2909         struct perf_event *event, *tmp;
2910         int is_active = ctx->is_active;
2911
2912         lockdep_assert_held(&ctx->lock);
2913
2914         if (likely(!ctx->nr_events)) {
2915                 /*
2916                  * See __perf_remove_from_context().
2917                  */
2918                 WARN_ON_ONCE(ctx->is_active);
2919                 if (ctx->task)
2920                         WARN_ON_ONCE(cpuctx->task_ctx);
2921                 return;
2922         }
2923
2924         ctx->is_active &= ~event_type;
2925         if (!(ctx->is_active & EVENT_ALL))
2926                 ctx->is_active = 0;
2927
2928         if (ctx->task) {
2929                 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2930                 if (!ctx->is_active)
2931                         cpuctx->task_ctx = NULL;
2932         }
2933
2934         /*
2935          * Always update time if it was set; not only when it changes.
2936          * Otherwise we can 'forget' to update time for any but the last
2937          * context we sched out. For example:
2938          *
2939          *   ctx_sched_out(.event_type = EVENT_FLEXIBLE)
2940          *   ctx_sched_out(.event_type = EVENT_PINNED)
2941          *
2942          * would only update time for the pinned events.
2943          */
2944         if (is_active & EVENT_TIME) {
2945                 /* update (and stop) ctx time */
2946                 update_context_time(ctx);
2947                 update_cgrp_time_from_cpuctx(cpuctx);
2948         }
2949
2950         is_active ^= ctx->is_active; /* changed bits */
2951
2952         if (!ctx->nr_active || !(is_active & EVENT_ALL))
2953                 return;
2954
2955         /*
2956          * If we had been multiplexing, no rotations are necessary, now no events
2957          * are active.
2958          */
2959         ctx->rotate_necessary = 0;
2960
2961         perf_pmu_disable(ctx->pmu);
2962         if (is_active & EVENT_PINNED) {
2963                 list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
2964                         group_sched_out(event, cpuctx, ctx);
2965         }
2966
2967         if (is_active & EVENT_FLEXIBLE) {
2968                 list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
2969                         group_sched_out(event, cpuctx, ctx);
2970         }
2971         perf_pmu_enable(ctx->pmu);
2972 }
2973
2974 /*
2975  * Test whether two contexts are equivalent, i.e. whether they have both been
2976  * cloned from the same version of the same context.
2977  *
2978  * Equivalence is measured using a generation number in the context that is
2979  * incremented on each modification to it; see unclone_ctx(), list_add_event()
2980  * and list_del_event().
2981  */
2982 static int context_equiv(struct perf_event_context *ctx1,
2983                          struct perf_event_context *ctx2)
2984 {
2985         lockdep_assert_held(&ctx1->lock);
2986         lockdep_assert_held(&ctx2->lock);
2987
2988         /* Pinning disables the swap optimization */
2989         if (ctx1->pin_count || ctx2->pin_count)
2990                 return 0;
2991
2992         /* If ctx1 is the parent of ctx2 */
2993         if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2994                 return 1;
2995
2996         /* If ctx2 is the parent of ctx1 */
2997         if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2998                 return 1;
2999
3000         /*
3001          * If ctx1 and ctx2 have the same parent; we flatten the parent
3002          * hierarchy, see perf_event_init_context().
3003          */
3004         if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
3005                         ctx1->parent_gen == ctx2->parent_gen)
3006                 return 1;
3007
3008         /* Unmatched */
3009         return 0;
3010 }
3011
3012 static void __perf_event_sync_stat(struct perf_event *event,
3013                                      struct perf_event *next_event)
3014 {
3015         u64 value;
3016
3017         if (!event->attr.inherit_stat)
3018                 return;
3019
3020         /*
3021          * Update the event value, we cannot use perf_event_read()
3022          * because we're in the middle of a context switch and have IRQs
3023          * disabled, which upsets smp_call_function_single(), however
3024          * we know the event must be on the current CPU, therefore we
3025          * don't need to use it.
3026          */
3027         if (event->state == PERF_EVENT_STATE_ACTIVE)
3028                 event->pmu->read(event);
3029
3030         perf_event_update_time(event);
3031
3032         /*
3033          * In order to keep per-task stats reliable we need to flip the event
3034          * values when we flip the contexts.
3035          */
3036         value = local64_read(&next_event->count);
3037         value = local64_xchg(&event->count, value);
3038         local64_set(&next_event->count, value);
3039
3040         swap(event->total_time_enabled, next_event->total_time_enabled);
3041         swap(event->total_time_running, next_event->total_time_running);
3042
3043         /*
3044          * Since we swizzled the values, update the user visible data too.
3045          */
3046         perf_event_update_userpage(event);
3047         perf_event_update_userpage(next_event);
3048 }
3049
3050 static void perf_event_sync_stat(struct perf_event_context *ctx,
3051                                    struct perf_event_context *next_ctx)
3052 {
3053         struct perf_event *event, *next_event;
3054
3055         if (!ctx->nr_stat)
3056                 return;
3057
3058         update_context_time(ctx);
3059
3060         event = list_first_entry(&ctx->event_list,
3061                                    struct perf_event, event_entry);
3062
3063         next_event = list_first_entry(&next_ctx->event_list,
3064                                         struct perf_event, event_entry);
3065
3066         while (&event->event_entry != &ctx->event_list &&
3067                &next_event->event_entry != &next_ctx->event_list) {
3068
3069                 __perf_event_sync_stat(event, next_event);
3070
3071                 event = list_next_entry(event, event_entry);
3072                 next_event = list_next_entry(next_event, event_entry);
3073         }
3074 }
3075
3076 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
3077                                          struct task_struct *next)
3078 {
3079         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
3080         struct perf_event_context *next_ctx;
3081         struct perf_event_context *parent, *next_parent;
3082         struct perf_cpu_context *cpuctx;
3083         int do_switch = 1;
3084
3085         if (likely(!ctx))
3086                 return;
3087
3088         cpuctx = __get_cpu_context(ctx);
3089         if (!cpuctx->task_ctx)
3090                 return;
3091
3092         rcu_read_lock();
3093         next_ctx = next->perf_event_ctxp[ctxn];
3094         if (!next_ctx)
3095                 goto unlock;
3096
3097         parent = rcu_dereference(ctx->parent_ctx);
3098         next_parent = rcu_dereference(next_ctx->parent_ctx);
3099
3100         /* If neither context have a parent context; they cannot be clones. */
3101         if (!parent && !next_parent)
3102                 goto unlock;
3103
3104         if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
3105                 /*
3106                  * Looks like the two contexts are clones, so we might be
3107                  * able to optimize the context switch.  We lock both
3108                  * contexts and check that they are clones under the
3109                  * lock (including re-checking that neither has been
3110                  * uncloned in the meantime).  It doesn't matter which
3111                  * order we take the locks because no other cpu could
3112                  * be trying to lock both of these tasks.
3113                  */
3114                 raw_spin_lock(&ctx->lock);
3115                 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
3116                 if (context_equiv(ctx, next_ctx)) {
3117                         WRITE_ONCE(ctx->task, next);
3118                         WRITE_ONCE(next_ctx->task, task);
3119
3120                         swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
3121
3122                         /*
3123                          * RCU_INIT_POINTER here is safe because we've not
3124                          * modified the ctx and the above modification of
3125                          * ctx->task and ctx->task_ctx_data are immaterial
3126                          * since those values are always verified under
3127                          * ctx->lock which we're now holding.
3128                          */
3129                         RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
3130                         RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
3131
3132                         do_switch = 0;
3133
3134                         perf_event_sync_stat(ctx, next_ctx);
3135                 }
3136                 raw_spin_unlock(&next_ctx->lock);
3137                 raw_spin_unlock(&ctx->lock);
3138         }
3139 unlock:
3140         rcu_read_unlock();
3141
3142         if (do_switch) {
3143                 raw_spin_lock(&ctx->lock);
3144                 task_ctx_sched_out(cpuctx, ctx, EVENT_ALL);
3145                 raw_spin_unlock(&ctx->lock);
3146         }
3147 }
3148
3149 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
3150
3151 void perf_sched_cb_dec(struct pmu *pmu)
3152 {
3153         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3154
3155         this_cpu_dec(perf_sched_cb_usages);
3156
3157         if (!--cpuctx->sched_cb_usage)
3158                 list_del(&cpuctx->sched_cb_entry);
3159 }
3160
3161
3162 void perf_sched_cb_inc(struct pmu *pmu)
3163 {
3164         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
3165
3166         if (!cpuctx->sched_cb_usage++)
3167                 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
3168
3169         this_cpu_inc(perf_sched_cb_usages);
3170 }
3171
3172 /*
3173  * This function provides the context switch callback to the lower code
3174  * layer. It is invoked ONLY when the context switch callback is enabled.
3175  *
3176  * This callback is relevant even to per-cpu events; for example multi event
3177  * PEBS requires this to provide PID/TID information. This requires we flush
3178  * all queued PEBS records before we context switch to a new task.
3179  */
3180 static void perf_pmu_sched_task(struct task_struct *prev,
3181                                 struct task_struct *next,
3182                                 bool sched_in)
3183 {
3184         struct perf_cpu_context *cpuctx;
3185         struct pmu *pmu;
3186
3187         if (prev == next)
3188                 return;
3189
3190         list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
3191                 pmu = cpuctx->ctx.pmu; /* software PMUs will not have sched_task */
3192
3193                 if (WARN_ON_ONCE(!pmu->sched_task))
3194                         continue;
3195
3196                 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3197                 perf_pmu_disable(pmu);
3198
3199                 pmu->sched_task(cpuctx->task_ctx, sched_in);
3200
3201                 perf_pmu_enable(pmu);
3202                 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3203         }
3204 }
3205
3206 static void perf_event_switch(struct task_struct *task,
3207                               struct task_struct *next_prev, bool sched_in);
3208
3209 #define for_each_task_context_nr(ctxn)                                  \
3210         for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
3211
3212 /*
3213  * Called from scheduler to remove the events of the current task,
3214  * with interrupts disabled.
3215  *
3216  * We stop each event and update the event value in event->count.
3217  *
3218  * This does not protect us against NMI, but disable()
3219  * sets the disabled bit in the control field of event _before_
3220  * accessing the event control register. If a NMI hits, then it will
3221  * not restart the event.
3222  */
3223 void __perf_event_task_sched_out(struct task_struct *task,
3224                                  struct task_struct *next)
3225 {
3226         int ctxn;
3227
3228         if (__this_cpu_read(perf_sched_cb_usages))
3229                 perf_pmu_sched_task(task, next, false);
3230
3231         if (atomic_read(&nr_switch_events))
3232                 perf_event_switch(task, next, false);
3233
3234         for_each_task_context_nr(ctxn)
3235                 perf_event_context_sched_out(task, ctxn, next);
3236
3237         /*
3238          * if cgroup events exist on this CPU, then we need
3239          * to check if we have to switch out PMU state.
3240          * cgroup event are system-wide mode only
3241          */
3242         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3243                 perf_cgroup_sched_out(task, next);
3244 }
3245
3246 /*
3247  * Called with IRQs disabled
3248  */
3249 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3250                               enum event_type_t event_type)
3251 {
3252         ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3253 }
3254
3255 static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
3256                               int (*func)(struct perf_event *, void *), void *data)
3257 {
3258         struct perf_event **evt, *evt1, *evt2;
3259         int ret;
3260
3261         evt1 = perf_event_groups_first(groups, -1);
3262         evt2 = perf_event_groups_first(groups, cpu);
3263
3264         while (evt1 || evt2) {
3265                 if (evt1 && evt2) {
3266                         if (evt1->group_index < evt2->group_index)
3267                                 evt = &evt1;
3268                         else
3269                                 evt = &evt2;
3270                 } else if (evt1) {
3271                         evt = &evt1;
3272                 } else {
3273                         evt = &evt2;
3274                 }
3275
3276                 ret = func(*evt, data);
3277                 if (ret)
3278                         return ret;
3279
3280                 *evt = perf_event_groups_next(*evt);
3281         }
3282
3283         return 0;
3284 }
3285
3286 struct sched_in_data {
3287         struct perf_event_context *ctx;
3288         struct perf_cpu_context *cpuctx;
3289         int can_add_hw;
3290 };
3291
3292 static int pinned_sched_in(struct perf_event *event, void *data)
3293 {
3294         struct sched_in_data *sid = data;
3295
3296         if (event->state <= PERF_EVENT_STATE_OFF)
3297                 return 0;
3298
3299         if (!event_filter_match(event))
3300                 return 0;
3301
3302         if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3303                 if (!group_sched_in(event, sid->cpuctx, sid->ctx))
3304                         list_add_tail(&event->active_list, &sid->ctx->pinned_active);
3305         }
3306
3307         /*
3308          * If this pinned group hasn't been scheduled,
3309          * put it in error state.
3310          */
3311         if (event->state == PERF_EVENT_STATE_INACTIVE)
3312                 perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
3313
3314         return 0;
3315 }
3316
3317 static int flexible_sched_in(struct perf_event *event, void *data)
3318 {
3319         struct sched_in_data *sid = data;
3320
3321         if (event->state <= PERF_EVENT_STATE_OFF)
3322                 return 0;
3323
3324         if (!event_filter_match(event))
3325                 return 0;
3326
3327         if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
3328                 int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
3329                 if (ret) {
3330                         sid->can_add_hw = 0;
3331                         sid->ctx->rotate_necessary = 1;
3332                         return 0;
3333                 }
3334                 list_add_tail(&event->active_list, &sid->ctx->flexible_active);
3335         }
3336
3337         return 0;
3338 }
3339
3340 static void
3341 ctx_pinned_sched_in(struct perf_event_context *ctx,
3342                     struct perf_cpu_context *cpuctx)
3343 {
3344         struct sched_in_data sid = {
3345                 .ctx = ctx,
3346                 .cpuctx = cpuctx,
3347                 .can_add_hw = 1,
3348         };
3349
3350         visit_groups_merge(&ctx->pinned_groups,
3351                            smp_processor_id(),
3352                            pinned_sched_in, &sid);
3353 }
3354
3355 static void
3356 ctx_flexible_sched_in(struct perf_event_context *ctx,
3357                       struct perf_cpu_context *cpuctx)
3358 {
3359         struct sched_in_data sid = {
3360                 .ctx = ctx,
3361                 .cpuctx = cpuctx,
3362                 .can_add_hw = 1,
3363         };
3364
3365         visit_groups_merge(&ctx->flexible_groups,
3366                            smp_processor_id(),
3367                            flexible_sched_in, &sid);
3368 }
3369
3370 static void
3371 ctx_sched_in(struct perf_event_context *ctx,
3372              struct perf_cpu_context *cpuctx,
3373              enum event_type_t event_type,
3374              struct task_struct *task)
3375 {
3376         int is_active = ctx->is_active;
3377         u64 now;
3378
3379         lockdep_assert_held(&ctx->lock);
3380
3381         if (likely(!ctx->nr_events))
3382                 return;
3383
3384         ctx->is_active |= (event_type | EVENT_TIME);
3385         if (ctx->task) {
3386                 if (!is_active)
3387                         cpuctx->task_ctx = ctx;
3388                 else
3389                         WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3390         }
3391
3392         is_active ^= ctx->is_active; /* changed bits */
3393
3394         if (is_active & EVENT_TIME) {
3395                 /* start ctx time */
3396                 now = perf_clock();
3397                 ctx->timestamp = now;
3398                 perf_cgroup_set_timestamp(task, ctx);
3399         }
3400
3401         /*
3402          * First go through the list and put on any pinned groups
3403          * in order to give them the best chance of going on.
3404          */
3405         if (is_active & EVENT_PINNED)
3406                 ctx_pinned_sched_in(ctx, cpuctx);
3407
3408         /* Then walk through the lower prio flexible groups */
3409         if (is_active & EVENT_FLEXIBLE)
3410                 ctx_flexible_sched_in(ctx, cpuctx);
3411 }
3412
3413 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3414                              enum event_type_t event_type,
3415                              struct task_struct *task)
3416 {
3417         struct perf_event_context *ctx = &cpuctx->ctx;
3418
3419         ctx_sched_in(ctx, cpuctx, event_type, task);
3420 }
3421
3422 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3423                                         struct task_struct *task)
3424 {
3425         struct perf_cpu_context *cpuctx;
3426
3427         cpuctx = __get_cpu_context(ctx);
3428         if (cpuctx->task_ctx == ctx)
3429                 return;
3430
3431         perf_ctx_lock(cpuctx, ctx);
3432         /*
3433          * We must check ctx->nr_events while holding ctx->lock, such
3434          * that we serialize against perf_install_in_context().
3435          */
3436         if (!ctx->nr_events)
3437                 goto unlock;
3438
3439         perf_pmu_disable(ctx->pmu);
3440         /*
3441          * We want to keep the following priority order:
3442          * cpu pinned (that don't need to move), task pinned,
3443          * cpu flexible, task flexible.
3444          *
3445          * However, if task's ctx is not carrying any pinned
3446          * events, no need to flip the cpuctx's events around.
3447          */
3448         if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
3449                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3450         perf_event_sched_in(cpuctx, ctx, task);
3451         perf_pmu_enable(ctx->pmu);
3452
3453 unlock:
3454         perf_ctx_unlock(cpuctx, ctx);
3455 }
3456
3457 /*
3458  * Called from scheduler to add the events of the current task
3459  * with interrupts disabled.
3460  *
3461  * We restore the event value and then enable it.
3462  *
3463  * This does not protect us against NMI, but enable()
3464  * sets the enabled bit in the control field of event _before_
3465  * accessing the event control register. If a NMI hits, then it will
3466  * keep the event running.
3467  */
3468 void __perf_event_task_sched_in(struct task_struct *prev,
3469                                 struct task_struct *task)
3470 {
3471         struct perf_event_context *ctx;
3472         int ctxn;
3473
3474         /*
3475          * If cgroup events exist on this CPU, then we need to check if we have
3476          * to switch in PMU state; cgroup event are system-wide mode only.
3477          *
3478          * Since cgroup events are CPU events, we must schedule these in before
3479          * we schedule in the task events.
3480          */
3481         if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3482                 perf_cgroup_sched_in(prev, task);
3483
3484         for_each_task_context_nr(ctxn) {
3485                 ctx = task->perf_event_ctxp[ctxn];
3486                 if (likely(!ctx))
3487                         continue;
3488
3489                 perf_event_context_sched_in(ctx, task);
3490         }
3491
3492         if (atomic_read(&nr_switch_events))
3493                 perf_event_switch(task, prev, true);
3494
3495         if (__this_cpu_read(perf_sched_cb_usages))
3496                 perf_pmu_sched_task(prev, task, true);
3497 }
3498
3499 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3500 {
3501         u64 frequency = event->attr.sample_freq;
3502         u64 sec = NSEC_PER_SEC;
3503         u64 divisor, dividend;
3504
3505         int count_fls, nsec_fls, frequency_fls, sec_fls;
3506
3507         count_fls = fls64(count);
3508         nsec_fls = fls64(nsec);
3509         frequency_fls = fls64(frequency);
3510         sec_fls = 30;
3511
3512         /*
3513          * We got @count in @nsec, with a target of sample_freq HZ
3514          * the target period becomes:
3515          *
3516          *             @count * 10^9
3517          * period = -------------------
3518          *          @nsec * sample_freq
3519          *
3520          */
3521
3522         /*
3523          * Reduce accuracy by one bit such that @a and @b converge
3524          * to a similar magnitude.
3525          */
3526 #define REDUCE_FLS(a, b)                \
3527 do {                                    \
3528         if (a##_fls > b##_fls) {        \
3529                 a >>= 1;                \
3530                 a##_fls--;              \
3531         } else {                        \
3532                 b >>= 1;                \
3533                 b##_fls--;              \
3534         }                               \
3535 } while (0)
3536
3537         /*
3538          * Reduce accuracy until either term fits in a u64, then proceed with
3539          * the other, so that finally we can do a u64/u64 division.
3540          */
3541         while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3542                 REDUCE_FLS(nsec, frequency);
3543                 REDUCE_FLS(sec, count);
3544         }
3545
3546         if (count_fls + sec_fls > 64) {
3547                 divisor = nsec * frequency;
3548
3549                 while (count_fls + sec_fls > 64) {
3550                         REDUCE_FLS(count, sec);
3551                         divisor >>= 1;
3552                 }
3553
3554                 dividend = count * sec;
3555         } else {
3556                 dividend = count * sec;
3557
3558                 while (nsec_fls + frequency_fls > 64) {
3559                         REDUCE_FLS(nsec, frequency);
3560                         dividend >>= 1;
3561                 }
3562
3563                 divisor = nsec * frequency;
3564         }
3565
3566         if (!divisor)
3567                 return dividend;
3568
3569         return div64_u64(dividend, divisor);
3570 }
3571
3572 static DEFINE_PER_CPU(int, perf_throttled_count);
3573 static DEFINE_PER_CPU(u64, perf_throttled_seq);
3574
3575 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3576 {
3577         struct hw_perf_event *hwc = &event->hw;
3578         s64 period, sample_period;
3579         s64 delta;
3580
3581         period = perf_calculate_period(event, nsec, count);
3582
3583         delta = (s64)(period - hwc->sample_period);
3584         delta = (delta + 7) / 8; /* low pass filter */
3585
3586         sample_period = hwc->sample_period + delta;
3587
3588         if (!sample_period)
3589                 sample_period = 1;
3590
3591         hwc->sample_period = sample_period;
3592
3593         if (local64_read(&hwc->period_left) > 8*sample_period) {
3594                 if (disable)
3595                         event->pmu->stop(event, PERF_EF_UPDATE);
3596
3597                 local64_set(&hwc->period_left, 0);
3598
3599                 if (disable)
3600                         event->pmu->start(event, PERF_EF_RELOAD);
3601         }
3602 }
3603
3604 /*
3605  * combine freq adjustment with unthrottling to avoid two passes over the
3606  * events. At the same time, make sure, having freq events does not change
3607  * the rate of unthrottling as that would introduce bias.
3608  */
3609 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3610                                            int needs_unthr)
3611 {
3612         struct perf_event *event;
3613         struct hw_perf_event *hwc;
3614         u64 now, period = TICK_NSEC;
3615         s64 delta;
3616
3617         /*
3618          * only need to iterate over all events iff:
3619          * - context have events in frequency mode (needs freq adjust)
3620          * - there are events to unthrottle on this cpu
3621          */
3622         if (!(ctx->nr_freq || needs_unthr))
3623                 return;
3624
3625         raw_spin_lock(&ctx->lock);
3626         perf_pmu_disable(ctx->pmu);
3627
3628         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3629                 if (event->state != PERF_EVENT_STATE_ACTIVE)
3630                         continue;
3631
3632                 if (!event_filter_match(event))
3633                         continue;
3634
3635                 perf_pmu_disable(event->pmu);
3636
3637                 hwc = &event->hw;
3638
3639                 if (hwc->interrupts == MAX_INTERRUPTS) {
3640                         hwc->interrupts = 0;
3641                         perf_log_throttle(event, 1);
3642                         event->pmu->start(event, 0);
3643                 }
3644
3645                 if (!event->attr.freq || !event->attr.sample_freq)
3646                         goto next;
3647
3648                 /*
3649                  * stop the event and update event->count
3650                  */
3651                 event->pmu->stop(event, PERF_EF_UPDATE);
3652
3653                 now = local64_read(&event->count);
3654                 delta = now - hwc->freq_count_stamp;
3655                 hwc->freq_count_stamp = now;
3656
3657                 /*
3658                  * restart the event
3659                  * reload only if value has changed
3660                  * we have stopped the event so tell that
3661                  * to perf_adjust_period() to avoid stopping it
3662                  * twice.
3663                  */
3664                 if (delta > 0)
3665                         perf_adjust_period(event, period, delta, false);
3666
3667                 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3668         next:
3669                 perf_pmu_enable(event->pmu);
3670         }
3671
3672         perf_pmu_enable(ctx->pmu);
3673         raw_spin_unlock(&ctx->lock);
3674 }
3675
3676 /*
3677  * Move @event to the tail of the @ctx's elegible events.
3678  */
3679 static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
3680 {
3681         /*
3682          * Rotate the first entry last of non-pinned groups. Rotation might be
3683          * disabled by the inheritance code.
3684          */
3685         if (ctx->rotate_disable)
3686                 return;
3687
3688         perf_event_groups_delete(&ctx->flexible_groups, event);
3689         perf_event_groups_insert(&ctx->flexible_groups, event);
3690 }
3691
3692 /* pick an event from the flexible_groups to rotate */
3693 static inline struct perf_event *
3694 ctx_event_to_rotate(struct perf_event_context *ctx)
3695 {
3696         struct perf_event *event;
3697
3698         /* pick the first active flexible event */
3699         event = list_first_entry_or_null(&ctx->flexible_active,
3700                                          struct perf_event, active_list);
3701
3702         /* if no active flexible event, pick the first event */
3703         if (!event) {
3704                 event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
3705                                       typeof(*event), group_node);
3706         }
3707
3708         return event;
3709 }
3710
3711 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
3712 {
3713         struct perf_event *cpu_event = NULL, *task_event = NULL;
3714         struct perf_event_context *task_ctx = NULL;
3715         int cpu_rotate, task_rotate;
3716
3717         /*
3718          * Since we run this from IRQ context, nobody can install new
3719          * events, thus the event count values are stable.
3720          */
3721
3722         cpu_rotate = cpuctx->ctx.rotate_necessary;
3723         task_ctx = cpuctx->task_ctx;
3724         task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
3725
3726         if (!(cpu_rotate || task_rotate))
3727                 return false;
3728
3729         perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3730         perf_pmu_disable(cpuctx->ctx.pmu);
3731
3732         if (task_rotate)
3733                 task_event = ctx_event_to_rotate(task_ctx);
3734         if (cpu_rotate)
3735                 cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
3736
3737         /*
3738          * As per the order given at ctx_resched() first 'pop' task flexible
3739          * and then, if needed CPU flexible.
3740          */
3741         if (task_event || (task_ctx && cpu_event))
3742                 ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
3743         if (cpu_event)
3744                 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3745
3746         if (task_event)
3747                 rotate_ctx(task_ctx, task_event);
3748         if (cpu_event)
3749                 rotate_ctx(&cpuctx->ctx, cpu_event);
3750
3751         perf_event_sched_in(cpuctx, task_ctx, current);
3752
3753         perf_pmu_enable(cpuctx->ctx.pmu);
3754         perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3755
3756         return true;
3757 }
3758
3759 void perf_event_task_tick(void)
3760 {
3761         struct list_head *head = this_cpu_ptr(&active_ctx_list);
3762         struct perf_event_context *ctx, *tmp;
3763         int throttled;
3764
3765         lockdep_assert_irqs_disabled();
3766
3767         __this_cpu_inc(perf_throttled_seq);
3768         throttled = __this_cpu_xchg(perf_throttled_count, 0);
3769         tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3770
3771         list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3772                 perf_adjust_freq_unthr_context(ctx, throttled);
3773 }
3774
3775 static int event_enable_on_exec(struct perf_event *event,
3776                                 struct perf_event_context *ctx)
3777 {
3778         if (!event->attr.enable_on_exec)
3779                 return 0;
3780
3781         event->attr.enable_on_exec = 0;
3782         if (event->state >= PERF_EVENT_STATE_INACTIVE)
3783                 return 0;
3784
3785         perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
3786
3787         return 1;
3788 }
3789
3790 /*
3791  * Enable all of a task's events that have been marked enable-on-exec.
3792  * This expects task == current.
3793  */
3794 static void perf_event_enable_on_exec(int ctxn)
3795 {
3796         struct perf_event_context *ctx, *clone_ctx = NULL;
3797         enum event_type_t event_type = 0;
3798         struct perf_cpu_context *cpuctx;
3799         struct perf_event *event;
3800         unsigned long flags;
3801         int enabled = 0;
3802
3803         local_irq_save(flags);
3804         ctx = current->perf_event_ctxp[ctxn];
3805         if (!ctx || !ctx->nr_events)
3806                 goto out;
3807
3808         cpuctx = __get_cpu_context(ctx);
3809         perf_ctx_lock(cpuctx, ctx);
3810         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3811         list_for_each_entry(event, &ctx->event_list, event_entry) {
3812                 enabled |= event_enable_on_exec(event, ctx);
3813                 event_type |= get_event_type(event);
3814         }
3815
3816         /*
3817          * Unclone and reschedule this context if we enabled any event.
3818          */
3819         if (enabled) {
3820                 clone_ctx = unclone_ctx(ctx);
3821                 ctx_resched(cpuctx, ctx, event_type);
3822         } else {
3823                 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
3824         }
3825         perf_ctx_unlock(cpuctx, ctx);
3826
3827 out:
3828         local_irq_restore(flags);
3829
3830         if (clone_ctx)
3831                 put_ctx(clone_ctx);
3832 }
3833
3834 struct perf_read_data {
3835         struct perf_event *event;
3836         bool group;
3837         int ret;
3838 };
3839
3840 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
3841 {
3842         u16 local_pkg, event_pkg;
3843
3844         if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3845                 int local_cpu = smp_processor_id();
3846
3847                 event_pkg = topology_physical_package_id(event_cpu);
3848                 local_pkg = topology_physical_package_id(local_cpu);
3849
3850                 if (event_pkg == local_pkg)
3851                         return local_cpu;
3852         }
3853
3854         return event_cpu;
3855 }
3856
3857 /*
3858  * Cross CPU call to read the hardware event
3859  */
3860 static void __perf_event_read(void *info)
3861 {
3862         struct perf_read_data *data = info;
3863         struct perf_event *sub, *event = data->event;
3864         struct perf_event_context *ctx = event->ctx;
3865         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3866         struct pmu *pmu = event->pmu;
3867
3868         /*
3869          * If this is a task context, we need to check whether it is
3870          * the current task context of this cpu.  If not it has been
3871          * scheduled out before the smp call arrived.  In that case
3872          * event->count would have been updated to a recent sample
3873          * when the event was scheduled out.
3874          */
3875         if (ctx->task && cpuctx->task_ctx != ctx)
3876                 return;
3877
3878         raw_spin_lock(&ctx->lock);
3879         if (ctx->is_active & EVENT_TIME) {
3880                 update_context_time(ctx);
3881                 update_cgrp_time_from_event(event);
3882         }
3883
3884         perf_event_update_time(event);
3885         if (data->group)
3886                 perf_event_update_sibling_time(event);
3887
3888         if (event->state != PERF_EVENT_STATE_ACTIVE)
3889                 goto unlock;
3890
3891         if (!data->group) {
3892                 pmu->read(event);
3893                 data->ret = 0;
3894                 goto unlock;
3895         }
3896
3897         pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3898
3899         pmu->read(event);
3900
3901         for_each_sibling_event(sub, event) {
3902                 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3903                         /*
3904                          * Use sibling's PMU rather than @event's since
3905                          * sibling could be on different (eg: software) PMU.
3906                          */
3907                         sub->pmu->read(sub);
3908                 }
3909         }
3910
3911         data->ret = pmu->commit_txn(pmu);
3912
3913 unlock:
3914         raw_spin_unlock(&ctx->lock);
3915 }
3916
3917 static inline u64 perf_event_count(struct perf_event *event)
3918 {
3919         return local64_read(&event->count) + atomic64_read(&event->child_count);
3920 }
3921
3922 /*
3923  * NMI-safe method to read a local event, that is an event that
3924  * is:
3925  *   - either for the current task, or for this CPU
3926  *   - does not have inherit set, for inherited task events
3927  *     will not be local and we cannot read them atomically
3928  *   - must not have a pmu::count method
3929  */
3930 int perf_event_read_local(struct perf_event *event, u64 *value,
3931                           u64 *enabled, u64 *running)
3932 {
3933         unsigned long flags;
3934         int ret = 0;
3935
3936         /*
3937          * Disabling interrupts avoids all counter scheduling (context
3938          * switches, timer based rotation and IPIs).
3939          */
3940         local_irq_save(flags);
3941
3942         /*
3943          * It must not be an event with inherit set, we cannot read
3944          * all child counters from atomic context.
3945          */
3946         if (event->attr.inherit) {
3947                 ret = -EOPNOTSUPP;
3948                 goto out;
3949         }
3950
3951         /* If this is a per-task event, it must be for current */
3952         if ((event->attach_state & PERF_ATTACH_TASK) &&
3953             event->hw.target != current) {
3954                 ret = -EINVAL;
3955                 goto out;
3956         }
3957
3958         /* If this is a per-CPU event, it must be for this CPU */
3959         if (!(event->attach_state & PERF_ATTACH_TASK) &&
3960             event->cpu != smp_processor_id()) {
3961                 ret = -EINVAL;
3962                 goto out;
3963         }
3964
3965         /* If this is a pinned event it must be running on this CPU */
3966         if (event->attr.pinned && event->oncpu != smp_processor_id()) {
3967                 ret = -EBUSY;
3968                 goto out;
3969         }
3970
3971         /*
3972          * If the event is currently on this CPU, its either a per-task event,
3973          * or local to this CPU. Furthermore it means its ACTIVE (otherwise
3974          * oncpu == -1).
3975          */
3976         if (event->oncpu == smp_processor_id())
3977                 event->pmu->read(event);
3978
3979         *value = local64_read(&event->count);
3980         if (enabled || running) {
3981                 u64 now = event->shadow_ctx_time + perf_clock();
3982                 u64 __enabled, __running;
3983
3984                 __perf_update_times(event, now, &__enabled, &__running);
3985                 if (enabled)
3986                         *enabled = __enabled;
3987                 if (running)
3988                         *running = __running;
3989         }
3990 out:
3991         local_irq_restore(flags);
3992
3993         return ret;
3994 }
3995
3996 static int perf_event_read(struct perf_event *event, bool group)
3997 {
3998         enum perf_event_state state = READ_ONCE(event->state);
3999         int event_cpu, ret = 0;
4000
4001         /*
4002          * If event is enabled and currently active on a CPU, update the
4003          * value in the event structure:
4004          */
4005 again:
4006         if (state == PERF_EVENT_STATE_ACTIVE) {
4007                 struct perf_read_data data;
4008
4009                 /*
4010                  * Orders the ->state and ->oncpu loads such that if we see
4011                  * ACTIVE we must also see the right ->oncpu.
4012                  *
4013                  * Matches the smp_wmb() from event_sched_in().
4014                  */
4015                 smp_rmb();
4016
4017                 event_cpu = READ_ONCE(event->oncpu);
4018                 if ((unsigned)event_cpu >= nr_cpu_ids)
4019                         return 0;
4020
4021                 data = (struct perf_read_data){
4022                         .event = event,
4023                         .group = group,
4024                         .ret = 0,
4025                 };
4026
4027                 preempt_disable();
4028                 event_cpu = __perf_event_read_cpu(event, event_cpu);
4029
4030                 /*
4031                  * Purposely ignore the smp_call_function_single() return
4032                  * value.
4033                  *
4034                  * If event_cpu isn't a valid CPU it means the event got
4035                  * scheduled out and that will have updated the event count.
4036                  *
4037                  * Therefore, either way, we'll have an up-to-date event count
4038                  * after this.
4039                  */
4040                 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
4041                 preempt_enable();
4042                 ret = data.ret;
4043
4044         } else if (state == PERF_EVENT_STATE_INACTIVE) {
4045                 struct perf_event_context *ctx = event->ctx;
4046                 unsigned long flags;
4047
4048                 raw_spin_lock_irqsave(&ctx->lock, flags);
4049                 state = event->state;
4050                 if (state != PERF_EVENT_STATE_INACTIVE) {
4051                         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4052                         goto again;
4053                 }
4054
4055                 /*
4056                  * May read while context is not active (e.g., thread is
4057                  * blocked), in that case we cannot update context time
4058                  */
4059                 if (ctx->is_active & EVENT_TIME) {
4060                         update_context_time(ctx);
4061                         update_cgrp_time_from_event(event);
4062                 }
4063
4064                 perf_event_update_time(event);
4065                 if (group)
4066                         perf_event_update_sibling_time(event);
4067                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4068         }
4069
4070         return ret;
4071 }
4072
4073 /*
4074  * Initialize the perf_event context in a task_struct:
4075  */
4076 static void __perf_event_init_context(struct perf_event_context *ctx)
4077 {
4078         raw_spin_lock_init(&ctx->lock);
4079         mutex_init(&ctx->mutex);
4080         INIT_LIST_HEAD(&ctx->active_ctx_list);
4081         perf_event_groups_init(&ctx->pinned_groups);
4082         perf_event_groups_init(&ctx->flexible_groups);
4083         INIT_LIST_HEAD(&ctx->event_list);
4084         INIT_LIST_HEAD(&ctx->pinned_active);
4085         INIT_LIST_HEAD(&ctx->flexible_active);
4086         atomic_set(&ctx->refcount, 1);
4087 }
4088
4089 static struct perf_event_context *
4090 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4091 {
4092         struct perf_event_context *ctx;
4093
4094         ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
4095         if (!ctx)
4096                 return NULL;
4097
4098         __perf_event_init_context(ctx);
4099         if (task) {
4100                 ctx->task = task;
4101                 get_task_struct(task);
4102         }
4103         ctx->pmu = pmu;
4104
4105         return ctx;
4106 }
4107
4108 static struct task_struct *
4109 find_lively_task_by_vpid(pid_t vpid)
4110 {
4111         struct task_struct *task;
4112
4113         rcu_read_lock();
4114         if (!vpid)
4115                 task = current;
4116         else
4117                 task = find_task_by_vpid(vpid);
4118         if (task)
4119                 get_task_struct(task);
4120         rcu_read_unlock();
4121
4122         if (!task)
4123                 return ERR_PTR(-ESRCH);
4124
4125         return task;
4126 }
4127
4128 /*
4129  * Returns a matching context with refcount and pincount.
4130  */
4131 static struct perf_event_context *
4132 find_get_context(struct pmu *pmu, struct task_struct *task,
4133                 struct perf_event *event)
4134 {
4135         struct perf_event_context *ctx, *clone_ctx = NULL;
4136         struct perf_cpu_context *cpuctx;
4137         void *task_ctx_data = NULL;
4138         unsigned long flags;
4139         int ctxn, err;
4140         int cpu = event->cpu;
4141
4142         if (!task) {
4143                 /* Must be root to operate on a CPU event: */
4144                 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
4145                         return ERR_PTR(-EACCES);
4146
4147                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
4148                 ctx = &cpuctx->ctx;
4149                 get_ctx(ctx);
4150                 ++ctx->pin_count;
4151
4152                 return ctx;
4153         }
4154
4155         err = -EINVAL;
4156         ctxn = pmu->task_ctx_nr;
4157         if (ctxn < 0)
4158                 goto errout;
4159
4160         if (event->attach_state & PERF_ATTACH_TASK_DATA) {
4161                 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
4162                 if (!task_ctx_data) {
4163                         err = -ENOMEM;
4164                         goto errout;
4165                 }
4166         }
4167
4168 retry:
4169         ctx = perf_lock_task_context(task, ctxn, &flags);
4170         if (ctx) {
4171                 clone_ctx = unclone_ctx(ctx);
4172                 ++ctx->pin_count;
4173
4174                 if (task_ctx_data && !ctx->task_ctx_data) {
4175                         ctx->task_ctx_data = task_ctx_data;
4176                         task_ctx_data = NULL;
4177                 }
4178                 raw_spin_unlock_irqrestore(&ctx->lock, flags);
4179
4180                 if (clone_ctx)
4181                         put_ctx(clone_ctx);
4182         } else {
4183                 ctx = alloc_perf_context(pmu, task);
4184                 err = -ENOMEM;
4185                 if (!ctx)
4186                         goto errout;
4187
4188                 if (task_ctx_data) {
4189                         ctx->task_ctx_data = task_ctx_data;
4190                         task_ctx_data = NULL;
4191                 }
4192
4193                 err = 0;
4194                 mutex_lock(&task->perf_event_mutex);
4195                 /*
4196                  * If it has already passed perf_event_exit_task().
4197                  * we must see PF_EXITING, it takes this mutex too.
4198                  */
4199                 if (task->flags & PF_EXITING)
4200                         err = -ESRCH;
4201                 else if (task->perf_event_ctxp[ctxn])
4202                         err = -EAGAIN;
4203                 else {
4204                         get_ctx(ctx);
4205                         ++ctx->pin_count;
4206                         rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
4207                 }
4208                 mutex_unlock(&task->perf_event_mutex);
4209
4210                 if (unlikely(err)) {
4211                         put_ctx(ctx);
4212
4213                         if (err == -EAGAIN)
4214                                 goto retry;
4215                         goto errout;
4216                 }
4217         }
4218
4219         kfree(task_ctx_data);
4220         return ctx;
4221
4222 errout:
4223         kfree(task_ctx_data);
4224         return ERR_PTR(err);
4225 }
4226
4227 static void perf_event_free_filter(struct perf_event *event);
4228 static void perf_event_free_bpf_prog(struct perf_event *event);
4229
4230 static void free_event_rcu(struct rcu_head *head)
4231 {
4232         struct perf_event *event;
4233
4234         event = container_of(head, struct perf_event, rcu_head);
4235         if (event->ns)
4236                 put_pid_ns(event->ns);
4237         perf_event_free_filter(event);
4238         kfree(event);
4239 }
4240
4241 static void ring_buffer_attach(struct perf_event *event,
4242                                struct ring_buffer *rb);
4243
4244 static void detach_sb_event(struct perf_event *event)
4245 {
4246         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
4247
4248         raw_spin_lock(&pel->lock);
4249         list_del_rcu(&event->sb_list);
4250         raw_spin_unlock(&pel->lock);
4251 }
4252
4253 static bool is_sb_event(struct perf_event *event)
4254 {
4255         struct perf_event_attr *attr = &event->attr;
4256
4257         if (event->parent)
4258                 return false;
4259
4260         if (event->attach_state & PERF_ATTACH_TASK)
4261                 return false;
4262
4263         if (attr->mmap || attr->mmap_data || attr->mmap2 ||
4264             attr->comm || attr->comm_exec ||
4265             attr->task ||
4266             attr->context_switch)
4267                 return true;
4268         return false;
4269 }
4270
4271 static void unaccount_pmu_sb_event(struct perf_event *event)
4272 {
4273         if (is_sb_event(event))
4274                 detach_sb_event(event);
4275 }
4276
4277 static void unaccount_event_cpu(struct perf_event *event, int cpu)
4278 {
4279         if (event->parent)
4280                 return;
4281
4282         if (is_cgroup_event(event))
4283                 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
4284 }
4285
4286 #ifdef CONFIG_NO_HZ_FULL
4287 static DEFINE_SPINLOCK(nr_freq_lock);
4288 #endif
4289
4290 static void unaccount_freq_event_nohz(void)
4291 {
4292 #ifdef CONFIG_NO_HZ_FULL
4293         spin_lock(&nr_freq_lock);
4294         if (atomic_dec_and_test(&nr_freq_events))
4295                 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
4296         spin_unlock(&nr_freq_lock);
4297 #endif
4298 }
4299
4300 static void unaccount_freq_event(void)
4301 {
4302         if (tick_nohz_full_enabled())
4303                 unaccount_freq_event_nohz();
4304         else
4305                 atomic_dec(&nr_freq_events);
4306 }
4307
4308 static void unaccount_event(struct perf_event *event)
4309 {
4310         bool dec = false;
4311
4312         if (event->parent)
4313                 return;
4314
4315         if (event->attach_state & PERF_ATTACH_TASK)
4316                 dec = true;
4317         if (event->attr.mmap || event->attr.mmap_data)
4318                 atomic_dec(&nr_mmap_events);
4319         if (event->attr.comm)
4320                 atomic_dec(&nr_comm_events);
4321         if (event->attr.namespaces)
4322                 atomic_dec(&nr_namespaces_events);
4323         if (event->attr.task)
4324                 atomic_dec(&nr_task_events);
4325         if (event->attr.freq)
4326                 unaccount_freq_event();
4327         if (event->attr.context_switch) {
4328                 dec = true;
4329                 atomic_dec(&nr_switch_events);
4330         }
4331         if (is_cgroup_event(event))
4332                 dec = true;
4333         if (has_branch_stack(event))
4334                 dec = true;
4335
4336         if (dec) {
4337                 if (!atomic_add_unless(&perf_sched_count, -1, 1))
4338                         schedule_delayed_work(&perf_sched_work, HZ);
4339         }
4340
4341         unaccount_event_cpu(event, event->cpu);
4342
4343         unaccount_pmu_sb_event(event);
4344 }
4345
4346 static void perf_sched_delayed(struct work_struct *work)
4347 {
4348         mutex_lock(&perf_sched_mutex);
4349         if (atomic_dec_and_test(&perf_sched_count))
4350                 static_branch_disable(&perf_sched_events);
4351         mutex_unlock(&perf_sched_mutex);
4352 }
4353
4354 /*
4355  * The following implement mutual exclusion of events on "exclusive" pmus
4356  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
4357  * at a time, so we disallow creating events that might conflict, namely:
4358  *
4359  *  1) cpu-wide events in the presence of per-task events,
4360  *  2) per-task events in the presence of cpu-wide events,
4361  *  3) two matching events on the same context.
4362  *
4363  * The former two cases are handled in the allocation path (perf_event_alloc(),
4364  * _free_event()), the latter -- before the first perf_install_in_context().
4365  */
4366 static int exclusive_event_init(struct perf_event *event)
4367 {
4368         struct pmu *pmu = event->pmu;
4369
4370         if (!is_exclusive_pmu(pmu))
4371                 return 0;
4372
4373         /*
4374          * Prevent co-existence of per-task and cpu-wide events on the
4375          * same exclusive pmu.
4376          *
4377          * Negative pmu::exclusive_cnt means there are cpu-wide
4378          * events on this "exclusive" pmu, positive means there are
4379          * per-task events.
4380          *
4381          * Since this is called in perf_event_alloc() path, event::ctx
4382          * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4383          * to mean "per-task event", because unlike other attach states it
4384          * never gets cleared.
4385          */
4386         if (event->attach_state & PERF_ATTACH_TASK) {
4387                 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4388                         return -EBUSY;
4389         } else {
4390                 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4391                         return -EBUSY;
4392         }
4393
4394         return 0;
4395 }
4396
4397 static void exclusive_event_destroy(struct perf_event *event)
4398 {
4399         struct pmu *pmu = event->pmu;
4400
4401         if (!is_exclusive_pmu(pmu))
4402                 return;
4403
4404         /* see comment in exclusive_event_init() */
4405         if (event->attach_state & PERF_ATTACH_TASK)
4406                 atomic_dec(&pmu->exclusive_cnt);
4407         else
4408                 atomic_inc(&pmu->exclusive_cnt);
4409 }
4410
4411 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4412 {
4413         if ((e1->pmu == e2->pmu) &&
4414             (e1->cpu == e2->cpu ||
4415              e1->cpu == -1 ||
4416              e2->cpu == -1))
4417                 return true;
4418         return false;
4419 }
4420
4421 static bool exclusive_event_installable(struct perf_event *event,
4422                                         struct perf_event_context *ctx)
4423 {
4424         struct perf_event *iter_event;
4425         struct pmu *pmu = event->pmu;
4426
4427         lockdep_assert_held(&ctx->mutex);
4428
4429         if (!is_exclusive_pmu(pmu))
4430                 return true;
4431
4432         list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4433                 if (exclusive_event_match(iter_event, event))
4434                         return false;
4435         }
4436
4437         return true;
4438 }
4439
4440 static void perf_addr_filters_splice(struct perf_event *event,
4441                                        struct list_head *head);
4442
4443 static void _free_event(struct perf_event *event)
4444 {
4445         irq_work_sync(&event->pending);
4446
4447         unaccount_event(event);
4448
4449         if (event->rb) {
4450                 /*
4451                  * Can happen when we close an event with re-directed output.
4452                  *
4453                  * Since we have a 0 refcount, perf_mmap_close() will skip
4454                  * over us; possibly making our ring_buffer_put() the last.
4455                  */
4456                 mutex_lock(&event->mmap_mutex);
4457                 ring_buffer_attach(event, NULL);
4458                 mutex_unlock(&event->mmap_mutex);
4459         }
4460
4461         if (is_cgroup_event(event))
4462                 perf_detach_cgroup(event);
4463
4464         if (!event->parent) {
4465                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4466                         put_callchain_buffers();
4467         }
4468
4469         perf_event_free_bpf_prog(event);
4470         perf_addr_filters_splice(event, NULL);
4471         kfree(event->addr_filter_ranges);
4472
4473         if (event->destroy)
4474                 event->destroy(event);
4475
4476         /*
4477          * Must be after ->destroy(), due to uprobe_perf_close() using
4478          * hw.target.
4479          */
4480         if (event->hw.target)
4481                 put_task_struct(event->hw.target);
4482
4483         /*
4484          * perf_event_free_task() relies on put_ctx() being 'last', in particular
4485          * all task references must be cleaned up.
4486          */
4487         if (event->ctx)
4488                 put_ctx(event->ctx);
4489
4490         exclusive_event_destroy(event);
4491         module_put(event->pmu->module);
4492
4493         call_rcu(&event->rcu_head, free_event_rcu);
4494 }
4495
4496 /*
4497  * Used to free events which have a known refcount of 1, such as in error paths
4498  * where the event isn't exposed yet and inherited events.
4499  */
4500 static void free_event(struct perf_event *event)
4501 {
4502         if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4503                                 "unexpected event refcount: %ld; ptr=%p\n",
4504                                 atomic_long_read(&event->refcount), event)) {
4505                 /* leak to avoid use-after-free */
4506                 return;
4507         }
4508
4509         _free_event(event);
4510 }
4511
4512 /*
4513  * Remove user event from the owner task.
4514  */
4515 static void perf_remove_from_owner(struct perf_event *event)
4516 {
4517         struct task_struct *owner;
4518
4519         rcu_read_lock();
4520         /*
4521          * Matches the smp_store_release() in perf_event_exit_task(). If we
4522          * observe !owner it means the list deletion is complete and we can
4523          * indeed free this event, otherwise we need to serialize on
4524          * owner->perf_event_mutex.
4525          */
4526         owner = READ_ONCE(event->owner);
4527         if (owner) {
4528                 /*
4529                  * Since delayed_put_task_struct() also drops the last
4530                  * task reference we can safely take a new reference
4531                  * while holding the rcu_read_lock().
4532                  */
4533                 get_task_struct(owner);
4534         }
4535         rcu_read_unlock();
4536
4537         if (owner) {
4538                 /*
4539                  * If we're here through perf_event_exit_task() we're already
4540                  * holding ctx->mutex which would be an inversion wrt. the
4541                  * normal lock order.
4542                  *
4543                  * However we can safely take this lock because its the child
4544                  * ctx->mutex.
4545                  */
4546                 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4547
4548                 /*
4549                  * We have to re-check the event->owner field, if it is cleared
4550                  * we raced with perf_event_exit_task(), acquiring the mutex
4551                  * ensured they're done, and we can proceed with freeing the
4552                  * event.
4553                  */
4554                 if (event->owner) {
4555                         list_del_init(&event->owner_entry);
4556                         smp_store_release(&event->owner, NULL);
4557                 }
4558                 mutex_unlock(&owner->perf_event_mutex);
4559                 put_task_struct(owner);
4560         }
4561 }
4562
4563 static void put_event(struct perf_event *event)
4564 {
4565         if (!atomic_long_dec_and_test(&event->refcount))
4566                 return;
4567
4568         _free_event(event);
4569 }
4570
4571 /*
4572  * Kill an event dead; while event:refcount will preserve the event
4573  * object, it will not preserve its functionality. Once the last 'user'
4574  * gives up the object, we'll destroy the thing.
4575  */
4576 int perf_event_release_kernel(struct perf_event *event)
4577 {
4578         struct perf_event_context *ctx = event->ctx;
4579         struct perf_event *child, *tmp;
4580         LIST_HEAD(free_list);
4581
4582         /*
4583          * If we got here through err_file: fput(event_file); we will not have
4584          * attached to a context yet.
4585          */
4586         if (!ctx) {
4587                 WARN_ON_ONCE(event->attach_state &
4588                                 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4589                 goto no_ctx;
4590         }
4591
4592         if (!is_kernel_event(event))
4593                 perf_remove_from_owner(event);
4594
4595         ctx = perf_event_ctx_lock(event);
4596         WARN_ON_ONCE(ctx->parent_ctx);
4597         perf_remove_from_context(event, DETACH_GROUP);
4598
4599         raw_spin_lock_irq(&ctx->lock);
4600         /*
4601          * Mark this event as STATE_DEAD, there is no external reference to it
4602          * anymore.
4603          *
4604          * Anybody acquiring event->child_mutex after the below loop _must_
4605          * also see this, most importantly inherit_event() which will avoid
4606          * placing more children on the list.
4607          *
4608          * Thus this guarantees that we will in fact observe and kill _ALL_
4609          * child events.
4610          */
4611         event->state = PERF_EVENT_STATE_DEAD;
4612         raw_spin_unlock_irq(&ctx->lock);
4613
4614         perf_event_ctx_unlock(event, ctx);
4615
4616 again:
4617         mutex_lock(&event->child_mutex);
4618         list_for_each_entry(child, &event->child_list, child_list) {
4619
4620                 /*
4621                  * Cannot change, child events are not migrated, see the
4622                  * comment with perf_event_ctx_lock_nested().
4623                  */
4624                 ctx = READ_ONCE(child->ctx);
4625                 /*
4626                  * Since child_mutex nests inside ctx::mutex, we must jump
4627                  * through hoops. We start by grabbing a reference on the ctx.
4628                  *
4629                  * Since the event cannot get freed while we hold the
4630                  * child_mutex, the context must also exist and have a !0
4631                  * reference count.
4632                  */
4633                 get_ctx(ctx);
4634
4635                 /*
4636                  * Now that we have a ctx ref, we can drop child_mutex, and
4637                  * acquire ctx::mutex without fear of it going away. Then we
4638                  * can re-acquire child_mutex.
4639                  */
4640                 mutex_unlock(&event->child_mutex);
4641                 mutex_lock(&ctx->mutex);
4642                 mutex_lock(&event->child_mutex);
4643
4644                 /*
4645                  * Now that we hold ctx::mutex and child_mutex, revalidate our
4646                  * state, if child is still the first entry, it didn't get freed
4647                  * and we can continue doing so.
4648                  */
4649                 tmp = list_first_entry_or_null(&event->child_list,
4650                                                struct perf_event, child_list);
4651                 if (tmp == child) {
4652                         perf_remove_from_context(child, DETACH_GROUP);
4653                         list_move(&child->child_list, &free_list);
4654                         /*
4655                          * This matches the refcount bump in inherit_event();
4656                          * this can't be the last reference.
4657                          */
4658                         put_event(event);
4659                 }
4660
4661                 mutex_unlock(&event->child_mutex);
4662                 mutex_unlock(&ctx->mutex);
4663                 put_ctx(ctx);
4664                 goto again;
4665         }
4666         mutex_unlock(&event->child_mutex);
4667
4668         list_for_each_entry_safe(child, tmp, &free_list, child_list) {
4669                 void *var = &child->ctx->refcount;
4670
4671                 list_del(&child->child_list);
4672                 free_event(child);
4673
4674                 /*
4675                  * Wake any perf_event_free_task() waiting for this event to be
4676                  * freed.
4677                  */
4678                 smp_mb(); /* pairs with wait_var_event() */
4679                 wake_up_var(var);
4680         }
4681
4682 no_ctx:
4683         put_event(event); /* Must be the 'last' reference */
4684         return 0;
4685 }
4686 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
4687
4688 /*
4689  * Called when the last reference to the file is gone.
4690  */
4691 static int perf_release(struct inode *inode, struct file *file)
4692 {
4693         perf_event_release_kernel(file->private_data);
4694         return 0;
4695 }
4696
4697 static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4698 {
4699         struct perf_event *child;
4700         u64 total = 0;
4701
4702         *enabled = 0;
4703         *running = 0;
4704
4705         mutex_lock(&event->child_mutex);
4706
4707         (void)perf_event_read(event, false);
4708         total += perf_event_count(event);
4709
4710         *enabled += event->total_time_enabled +
4711                         atomic64_read(&event->child_total_time_enabled);
4712         *running += event->total_time_running +
4713                         atomic64_read(&event->child_total_time_running);
4714
4715         list_for_each_entry(child, &event->child_list, child_list) {
4716                 (void)perf_event_read(child, false);
4717                 total += perf_event_count(child);
4718                 *enabled += child->total_time_enabled;
4719                 *running += child->total_time_running;
4720         }
4721         mutex_unlock(&event->child_mutex);
4722
4723         return total;
4724 }
4725
4726 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4727 {
4728         struct perf_event_context *ctx;
4729         u64 count;
4730
4731         ctx = perf_event_ctx_lock(event);
4732         count = __perf_event_read_value(event, enabled, running);
4733         perf_event_ctx_unlock(event, ctx);
4734
4735         return count;
4736 }
4737 EXPORT_SYMBOL_GPL(perf_event_read_value);
4738
4739 static int __perf_read_group_add(struct perf_event *leader,
4740                                         u64 read_format, u64 *values)
4741 {
4742         struct perf_event_context *ctx = leader->ctx;
4743         struct perf_event *sub;
4744         unsigned long flags;
4745         int n = 1; /* skip @nr */
4746         int ret;
4747
4748         ret = perf_event_read(leader, true);
4749         if (ret)
4750                 return ret;
4751
4752         raw_spin_lock_irqsave(&ctx->lock, flags);
4753
4754         /*
4755          * Since we co-schedule groups, {enabled,running} times of siblings
4756          * will be identical to those of the leader, so we only publish one
4757          * set.
4758          */
4759         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4760                 values[n++] += leader->total_time_enabled +
4761                         atomic64_read(&leader->child_total_time_enabled);
4762         }
4763
4764         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4765                 values[n++] += leader->total_time_running +
4766                         atomic64_read(&leader->child_total_time_running);
4767         }
4768
4769         /*
4770          * Write {count,id} tuples for every sibling.
4771          */
4772         values[n++] += perf_event_count(leader);
4773         if (read_format & PERF_FORMAT_ID)
4774                 values[n++] = primary_event_id(leader);
4775
4776         for_each_sibling_event(sub, leader) {
4777                 values[n++] += perf_event_count(sub);
4778                 if (read_format & PERF_FORMAT_ID)
4779                         values[n++] = primary_event_id(sub);
4780         }
4781
4782         raw_spin_unlock_irqrestore(&ctx->lock, flags);
4783         return 0;
4784 }
4785
4786 static int perf_read_group(struct perf_event *event,
4787                                    u64 read_format, char __user *buf)
4788 {
4789         struct perf_event *leader = event->group_leader, *child;
4790         struct perf_event_context *ctx = leader->ctx;
4791         int ret;
4792         u64 *values;
4793
4794         lockdep_assert_held(&ctx->mutex);
4795
4796         values = kzalloc(event->read_size, GFP_KERNEL);
4797         if (!values)
4798                 return -ENOMEM;
4799
4800         values[0] = 1 + leader->nr_siblings;
4801
4802         /*
4803          * By locking the child_mutex of the leader we effectively
4804          * lock the child list of all siblings.. XXX explain how.
4805          */
4806         mutex_lock(&leader->child_mutex);
4807
4808         ret = __perf_read_group_add(leader, read_format, values);
4809         if (ret)
4810                 goto unlock;
4811
4812         list_for_each_entry(child, &leader->child_list, child_list) {
4813                 ret = __perf_read_group_add(child, read_format, values);
4814                 if (ret)
4815                         goto unlock;
4816         }
4817
4818         mutex_unlock(&leader->child_mutex);
4819
4820         ret = event->read_size;
4821         if (copy_to_user(buf, values, event->read_size))
4822                 ret = -EFAULT;
4823         goto out;
4824
4825 unlock:
4826         mutex_unlock(&leader->child_mutex);
4827 out:
4828         kfree(values);
4829         return ret;
4830 }
4831
4832 static int perf_read_one(struct perf_event *event,
4833                                  u64 read_format, char __user *buf)
4834 {
4835         u64 enabled, running;
4836         u64 values[4];
4837         int n = 0;
4838
4839         values[n++] = __perf_event_read_value(event, &enabled, &running);
4840         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4841                 values[n++] = enabled;
4842         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4843                 values[n++] = running;
4844         if (read_format & PERF_FORMAT_ID)
4845                 values[n++] = primary_event_id(event);
4846
4847         if (copy_to_user(buf, values, n * sizeof(u64)))
4848                 return -EFAULT;
4849
4850         return n * sizeof(u64);
4851 }
4852
4853 static bool is_event_hup(struct perf_event *event)
4854 {
4855         bool no_children;
4856
4857         if (event->state > PERF_EVENT_STATE_EXIT)
4858                 return false;
4859
4860         mutex_lock(&event->child_mutex);
4861         no_children = list_empty(&event->child_list);
4862         mutex_unlock(&event->child_mutex);
4863         return no_children;
4864 }
4865
4866 /*
4867  * Read the performance event - simple non blocking version for now
4868  */
4869 static ssize_t
4870 __perf_read(struct perf_event *event, char __user *buf, size_t count)
4871 {
4872         u64 read_format = event->attr.read_format;
4873         int ret;
4874
4875         /*
4876          * Return end-of-file for a read on an event that is in
4877          * error state (i.e. because it was pinned but it couldn't be
4878          * scheduled on to the CPU at some point).
4879          */
4880         if (event->state == PERF_EVENT_STATE_ERROR)
4881                 return 0;
4882
4883         if (count < event->read_size)
4884                 return -ENOSPC;
4885
4886         WARN_ON_ONCE(event->ctx->parent_ctx);
4887         if (read_format & PERF_FORMAT_GROUP)
4888                 ret = perf_read_group(event, read_format, buf);
4889         else
4890                 ret = perf_read_one(event, read_format, buf);
4891
4892         return ret;
4893 }
4894
4895 static ssize_t
4896 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4897 {
4898         struct perf_event *event = file->private_data;
4899         struct perf_event_context *ctx;
4900         int ret;
4901
4902         ctx = perf_event_ctx_lock(event);
4903         ret = __perf_read(event, buf, count);
4904         perf_event_ctx_unlock(event, ctx);
4905
4906         return ret;
4907 }
4908
4909 static __poll_t perf_poll(struct file *file, poll_table *wait)
4910 {
4911         struct perf_event *event = file->private_data;
4912         struct ring_buffer *rb;
4913         __poll_t events = EPOLLHUP;
4914
4915         poll_wait(file, &event->waitq, wait);
4916
4917         if (is_event_hup(event))
4918                 return events;
4919
4920         /*
4921          * Pin the event->rb by taking event->mmap_mutex; otherwise
4922          * perf_event_set_output() can swizzle our rb and make us miss wakeups.
4923          */
4924         mutex_lock(&event->mmap_mutex);
4925         rb = event->rb;
4926         if (rb)
4927                 events = atomic_xchg(&rb->poll, 0);
4928         mutex_unlock(&event->mmap_mutex);
4929         return events;
4930 }
4931
4932 static void _perf_event_reset(struct perf_event *event)
4933 {
4934         (void)perf_event_read(event, false);
4935         local64_set(&event->count, 0);
4936         perf_event_update_userpage(event);
4937 }
4938
4939 /*
4940  * Holding the top-level event's child_mutex means that any
4941  * descendant process that has inherited this event will block
4942  * in perf_event_exit_event() if it goes to exit, thus satisfying the
4943  * task existence requirements of perf_event_enable/disable.
4944  */
4945 static void perf_event_for_each_child(struct perf_event *event,
4946                                         void (*func)(struct perf_event *))
4947 {
4948         struct perf_event *child;
4949
4950         WARN_ON_ONCE(event->ctx->parent_ctx);
4951
4952         mutex_lock(&event->child_mutex);
4953         func(event);
4954         list_for_each_entry(child, &event->child_list, child_list)
4955                 func(child);
4956         mutex_unlock(&event->child_mutex);
4957 }
4958
4959 static void perf_event_for_each(struct perf_event *event,
4960                                   void (*func)(struct perf_event *))
4961 {
4962         struct perf_event_context *ctx = event->ctx;
4963         struct perf_event *sibling;
4964
4965         lockdep_assert_held(&ctx->mutex);
4966
4967         event = event->group_leader;
4968
4969         perf_event_for_each_child(event, func);
4970         for_each_sibling_event(sibling, event)
4971                 perf_event_for_each_child(sibling, func);
4972 }
4973
4974 static void __perf_event_period(struct perf_event *event,
4975                                 struct perf_cpu_context *cpuctx,
4976                                 struct perf_event_context *ctx,
4977                                 void *info)
4978 {
4979         u64 value = *((u64 *)info);
4980         bool active;
4981
4982         if (event->attr.freq) {
4983                 event->attr.sample_freq = value;
4984         } else {
4985                 event->attr.sample_period = value;
4986                 event->hw.sample_period = value;
4987         }
4988
4989         active = (event->state == PERF_EVENT_STATE_ACTIVE);
4990         if (active) {
4991                 perf_pmu_disable(ctx->pmu);
4992                 /*
4993                  * We could be throttled; unthrottle now to avoid the tick
4994                  * trying to unthrottle while we already re-started the event.
4995                  */
4996                 if (event->hw.interrupts == MAX_INTERRUPTS) {
4997                         event->hw.interrupts = 0;
4998                         perf_log_throttle(event, 1);
4999                 }
5000                 event->pmu->stop(event, PERF_EF_UPDATE);
5001         }
5002
5003         local64_set(&event->hw.period_left, 0);
5004
5005         if (active) {
5006                 event->pmu->start(event, PERF_EF_RELOAD);
5007                 perf_pmu_enable(ctx->pmu);
5008         }
5009 }
5010
5011 static int perf_event_check_period(struct perf_event *event, u64 value)
5012 {
5013         return event->pmu->check_period(event, value);
5014 }
5015
5016 static int perf_event_period(struct perf_event *event, u64 __user *arg)
5017 {
5018         u64 value;
5019
5020         if (!is_sampling_event(event))
5021                 return -EINVAL;
5022
5023         if (copy_from_user(&value, arg, sizeof(value)))
5024                 return -EFAULT;
5025
5026         if (!value)
5027                 return -EINVAL;
5028
5029         if (event->attr.freq && value > sysctl_perf_event_sample_rate)
5030                 return -EINVAL;
5031
5032         if (perf_event_check_period(event, value))
5033                 return -EINVAL;
5034
5035         if (!event->attr.freq && (value & (1ULL << 63)))
5036                 return -EINVAL;
5037
5038         event_function_call(event, __perf_event_period, &value);
5039
5040         return 0;
5041 }
5042
5043 static const struct file_operations perf_fops;
5044
5045 static inline int perf_fget_light(int fd, struct fd *p)
5046 {
5047         struct fd f = fdget(fd);
5048         if (!f.file)
5049                 return -EBADF;
5050
5051         if (f.file->f_op != &perf_fops) {
5052                 fdput(f);
5053                 return -EBADF;
5054         }
5055         *p = f;
5056         return 0;
5057 }
5058
5059 static int perf_event_set_output(struct perf_event *event,
5060                                  struct perf_event *output_event);
5061 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
5062 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
5063 static int perf_copy_attr(struct perf_event_attr __user *uattr,
5064                           struct perf_event_attr *attr);
5065
5066 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
5067 {
5068         void (*func)(struct perf_event *);
5069         u32 flags = arg;
5070
5071         switch (cmd) {
5072         case PERF_EVENT_IOC_ENABLE:
5073                 func = _perf_event_enable;
5074                 break;
5075         case PERF_EVENT_IOC_DISABLE:
5076                 func = _perf_event_disable;
5077                 break;
5078         case PERF_EVENT_IOC_RESET:
5079                 func = _perf_event_reset;
5080                 break;
5081
5082         case PERF_EVENT_IOC_REFRESH:
5083                 return _perf_event_refresh(event, arg);
5084
5085         case PERF_EVENT_IOC_PERIOD:
5086                 return perf_event_period(event, (u64 __user *)arg);
5087
5088         case PERF_EVENT_IOC_ID:
5089         {
5090                 u64 id = primary_event_id(event);
5091
5092                 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
5093                         return -EFAULT;
5094                 return 0;
5095         }
5096
5097         case PERF_EVENT_IOC_SET_OUTPUT:
5098         {
5099                 int ret;
5100                 if (arg != -1) {
5101                         struct perf_event *output_event;
5102                         struct fd output;
5103                         ret = perf_fget_light(arg, &output);
5104                         if (ret)
5105                                 return ret;
5106                         output_event = output.file->private_data;
5107                         ret = perf_event_set_output(event, output_event);
5108                         fdput(output);
5109                 } else {
5110                         ret = perf_event_set_output(event, NULL);
5111                 }
5112                 return ret;
5113         }
5114
5115         case PERF_EVENT_IOC_SET_FILTER:
5116                 return perf_event_set_filter(event, (void __user *)arg);
5117
5118         case PERF_EVENT_IOC_SET_BPF:
5119                 return perf_event_set_bpf_prog(event, arg);
5120
5121         case PERF_EVENT_IOC_PAUSE_OUTPUT: {
5122                 struct ring_buffer *rb;
5123
5124                 rcu_read_lock();
5125                 rb = rcu_dereference(event->rb);
5126                 if (!rb || !rb->nr_pages) {
5127                         rcu_read_unlock();
5128                         return -EINVAL;
5129                 }
5130                 rb_toggle_paused(rb, !!arg);
5131                 rcu_read_unlock();
5132                 return 0;
5133         }
5134
5135         case PERF_EVENT_IOC_QUERY_BPF:
5136                 return perf_event_query_prog_array(event, (void __user *)arg);
5137
5138         case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
5139                 struct perf_event_attr new_attr;
5140                 int err = perf_copy_attr((struct perf_event_attr __user *)arg,
5141                                          &new_attr);
5142
5143                 if (err)
5144                         return err;
5145
5146                 return perf_event_modify_attr(event,  &new_attr);
5147         }
5148         default:
5149                 return -ENOTTY;
5150         }
5151
5152         if (flags & PERF_IOC_FLAG_GROUP)
5153                 perf_event_for_each(event, func);
5154         else
5155                 perf_event_for_each_child(event, func);
5156
5157         return 0;
5158 }
5159
5160 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5161 {
5162         struct perf_event *event = file->private_data;
5163         struct perf_event_context *ctx;
5164         long ret;
5165
5166         ctx = perf_event_ctx_lock(event);
5167         ret = _perf_ioctl(event, cmd, arg);
5168         perf_event_ctx_unlock(event, ctx);
5169
5170         return ret;
5171 }
5172
5173 #ifdef CONFIG_COMPAT
5174 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
5175                                 unsigned long arg)
5176 {
5177         switch (_IOC_NR(cmd)) {
5178         case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
5179         case _IOC_NR(PERF_EVENT_IOC_ID):
5180         case _IOC_NR(PERF_EVENT_IOC_QUERY_BPF):
5181         case _IOC_NR(PERF_EVENT_IOC_MODIFY_ATTRIBUTES):
5182                 /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case */
5183                 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
5184                         cmd &= ~IOCSIZE_MASK;
5185                         cmd |= sizeof(void *) << IOCSIZE_SHIFT;
5186                 }
5187                 break;
5188         }
5189         return perf_ioctl(file, cmd, arg);
5190 }
5191 #else
5192 # define perf_compat_ioctl NULL
5193 #endif
5194
5195 int perf_event_task_enable(void)
5196 {
5197         struct perf_event_context *ctx;
5198         struct perf_event *event;
5199
5200         mutex_lock(&current->perf_event_mutex);
5201         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5202                 ctx = perf_event_ctx_lock(event);
5203                 perf_event_for_each_child(event, _perf_event_enable);
5204                 perf_event_ctx_unlock(event, ctx);
5205         }
5206         mutex_unlock(&current->perf_event_mutex);
5207
5208         return 0;
5209 }
5210
5211 int perf_event_task_disable(void)
5212 {
5213         struct perf_event_context *ctx;
5214         struct perf_event *event;
5215
5216         mutex_lock(&current->perf_event_mutex);
5217         list_for_each_entry(event, &current->perf_event_list, owner_entry) {
5218                 ctx = perf_event_ctx_lock(event);
5219                 perf_event_for_each_child(event, _perf_event_disable);
5220                 perf_event_ctx_unlock(event, ctx);
5221         }
5222         mutex_unlock(&current->perf_event_mutex);
5223
5224         return 0;
5225 }
5226
5227 static int perf_event_index(struct perf_event *event)
5228 {
5229         if (event->hw.state & PERF_HES_STOPPED)
5230                 return 0;
5231
5232         if (event->state != PERF_EVENT_STATE_ACTIVE)
5233                 return 0;
5234
5235         return event->pmu->event_idx(event);
5236 }
5237
5238 static void calc_timer_values(struct perf_event *event,
5239                                 u64 *now,
5240                                 u64 *enabled,
5241                                 u64 *running)
5242 {
5243         u64 ctx_time;
5244
5245         *now = perf_clock();
5246         ctx_time = event->shadow_ctx_time + *now;
5247         __perf_update_times(event, ctx_time, enabled, running);
5248 }
5249
5250 static void perf_event_init_userpage(struct perf_event *event)
5251 {
5252         struct perf_event_mmap_page *userpg;
5253         struct ring_buffer *rb;
5254
5255         rcu_read_lock();
5256         rb = rcu_dereference(event->rb);
5257         if (!rb)
5258                 goto unlock;
5259
5260         userpg = rb->user_page;
5261
5262         /* Allow new userspace to detect that bit 0 is deprecated */
5263         userpg->cap_bit0_is_deprecated = 1;
5264         userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
5265         userpg->data_offset = PAGE_SIZE;
5266         userpg->data_size = perf_data_size(rb);
5267
5268 unlock:
5269         rcu_read_unlock();
5270 }
5271
5272 void __weak arch_perf_update_userpage(
5273         struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
5274 {
5275 }
5276
5277 /*
5278  * Callers need to ensure there can be no nesting of this function, otherwise
5279  * the seqlock logic goes bad. We can not serialize this because the arch
5280  * code calls this from NMI context.
5281  */
5282 void perf_event_update_userpage(struct perf_event *event)
5283 {
5284         struct perf_event_mmap_page *userpg;
5285         struct ring_buffer *rb;
5286         u64 enabled, running, now;
5287
5288         rcu_read_lock();
5289         rb = rcu_dereference(event->rb);
5290         if (!rb)
5291                 goto unlock;
5292
5293         /*
5294          * compute total_time_enabled, total_time_running
5295          * based on snapshot values taken when the event
5296          * was last scheduled in.
5297          *
5298          * we cannot simply called update_context_time()
5299          * because of locking issue as we can be called in
5300          * NMI context
5301          */
5302         calc_timer_values(event, &now, &enabled, &running);
5303
5304         userpg = rb->user_page;
5305         /*
5306          * Disable preemption to guarantee consistent time stamps are stored to
5307          * the user page.
5308          */
5309         preempt_disable();
5310         ++userpg->lock;
5311         barrier();
5312         userpg->index = perf_event_index(event);
5313         userpg->offset = perf_event_count(event);
5314         if (userpg->index)
5315                 userpg->offset -= local64_read(&event->hw.prev_count);
5316
5317         userpg->time_enabled = enabled +
5318                         atomic64_read(&event->child_total_time_enabled);
5319
5320         userpg->time_running = running +
5321                         atomic64_read(&event->child_total_time_running);
5322
5323         arch_perf_update_userpage(event, userpg, now);
5324
5325         barrier();
5326         ++userpg->lock;
5327         preempt_enable();
5328 unlock:
5329         rcu_read_unlock();
5330 }
5331 EXPORT_SYMBOL_GPL(perf_event_update_userpage);
5332
5333 static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
5334 {
5335         struct perf_event *event = vmf->vma->vm_file->private_data;
5336         struct ring_buffer *rb;
5337         vm_fault_t ret = VM_FAULT_SIGBUS;
5338
5339         if (vmf->flags & FAULT_FLAG_MKWRITE) {
5340                 if (vmf->pgoff == 0)
5341                         ret = 0;
5342                 return ret;
5343         }
5344
5345         rcu_read_lock();
5346         rb = rcu_dereference(event->rb);
5347         if (!rb)
5348                 goto unlock;
5349
5350         if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
5351                 goto unlock;
5352
5353         vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
5354         if (!vmf->page)
5355                 goto unlock;
5356
5357         get_page(vmf->page);
5358         vmf->page->mapping = vmf->vma->vm_file->f_mapping;
5359         vmf->page->index   = vmf->pgoff;
5360
5361         ret = 0;
5362 unlock:
5363         rcu_read_unlock();
5364
5365         return ret;
5366 }
5367
5368 static void ring_buffer_attach(struct perf_event *event,
5369                                struct ring_buffer *rb)
5370 {
5371         struct ring_buffer *old_rb = NULL;
5372         unsigned long flags;
5373
5374         if (event->rb) {
5375                 /*
5376                  * Should be impossible, we set this when removing
5377                  * event->rb_entry and wait/clear when adding event->rb_entry.
5378                  */
5379                 WARN_ON_ONCE(event->rcu_pending);
5380
5381                 old_rb = event->rb;
5382                 spin_lock_irqsave(&old_rb->event_lock, flags);
5383                 list_del_rcu(&event->rb_entry);
5384                 spin_unlock_irqrestore(&old_rb->event_lock, flags);
5385
5386                 event->rcu_batches = get_state_synchronize_rcu();
5387                 event->rcu_pending = 1;
5388         }
5389
5390         if (rb) {
5391                 if (event->rcu_pending) {
5392                         cond_synchronize_rcu(event->rcu_batches);
5393                         event->rcu_pending = 0;
5394                 }
5395
5396                 spin_lock_irqsave(&rb->event_lock, flags);
5397                 list_add_rcu(&event->rb_entry, &rb->event_list);
5398                 spin_unlock_irqrestore(&rb->event_lock, flags);
5399         }
5400
5401         /*
5402          * Avoid racing with perf_mmap_close(AUX): stop the event
5403          * before swizzling the event::rb pointer; if it's getting
5404          * unmapped, its aux_mmap_count will be 0 and it won't
5405          * restart. See the comment in __perf_pmu_output_stop().
5406          *
5407          * Data will inevitably be lost when set_output is done in
5408          * mid-air, but then again, whoever does it like this is
5409          * not in for the data anyway.
5410          */
5411         if (has_aux(event))
5412                 perf_event_stop(event, 0);
5413
5414         rcu_assign_pointer(event->rb, rb);
5415
5416         if (old_rb) {
5417                 ring_buffer_put(old_rb);
5418                 /*
5419                  * Since we detached before setting the new rb, so that we
5420                  * could attach the new rb, we could have missed a wakeup.
5421                  * Provide it now.
5422                  */
5423                 wake_up_all(&event->waitq);
5424         }
5425 }
5426
5427 static void ring_buffer_wakeup(struct perf_event *event)
5428 {
5429         struct ring_buffer *rb;
5430
5431         rcu_read_lock();
5432         rb = rcu_dereference(event->rb);
5433         if (rb) {
5434                 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
5435                         wake_up_all(&event->waitq);
5436         }
5437         rcu_read_unlock();
5438 }
5439
5440 struct ring_buffer *ring_buffer_get(struct perf_event *event)
5441 {
5442         struct ring_buffer *rb;
5443
5444         rcu_read_lock();
5445         rb = rcu_dereference(event->rb);
5446         if (rb) {
5447                 if (!atomic_inc_not_zero(&rb->refcount))
5448                         rb = NULL;
5449         }
5450         rcu_read_unlock();
5451
5452         return rb;
5453 }
5454
5455 void ring_buffer_put(struct ring_buffer *rb)
5456 {
5457         if (!atomic_dec_and_test(&rb->refcount))
5458                 return;
5459
5460         WARN_ON_ONCE(!list_empty(&rb->event_list));
5461
5462         call_rcu(&rb->rcu_head, rb_free_rcu);
5463 }
5464
5465 static void perf_mmap_open(struct vm_area_struct *vma)
5466 {
5467         struct perf_event *event = vma->vm_file->private_data;
5468
5469         atomic_inc(&event->mmap_count);
5470         atomic_inc(&event->rb->mmap_count);
5471
5472         if (vma->vm_pgoff)
5473                 atomic_inc(&event->rb->aux_mmap_count);
5474
5475         if (event->pmu->event_mapped)
5476                 event->pmu->event_mapped(event, vma->vm_mm);
5477 }
5478
5479 static void perf_pmu_output_stop(struct perf_event *event);
5480
5481 /*
5482  * A buffer can be mmap()ed multiple times; either directly through the same
5483  * event, or through other events by use of perf_event_set_output().
5484  *
5485  * In order to undo the VM accounting done by perf_mmap() we need to destroy
5486  * the buffer here, where we still have a VM context. This means we need
5487  * to detach all events redirecting to us.
5488  */
5489 static void perf_mmap_close(struct vm_area_struct *vma)
5490 {
5491         struct perf_event *event = vma->vm_file->private_data;
5492         struct ring_buffer *rb = ring_buffer_get(event);
5493         struct user_struct *mmap_user = rb->mmap_user;
5494         int mmap_locked = rb->mmap_locked;
5495         unsigned long size = perf_data_size(rb);
5496         bool detach_rest = false;
5497
5498         if (event->pmu->event_unmapped)
5499                 event->pmu->event_unmapped(event, vma->vm_mm);
5500
5501         /*
5502          * rb->aux_mmap_count will always drop before rb->mmap_count and
5503          * event->mmap_count, so it is ok to use event->mmap_mutex to
5504          * serialize with perf_mmap here.
5505          */
5506         if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5507             atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5508                 /*
5509                  * Stop all AUX events that are writing to this buffer,
5510                  * so that we can free its AUX pages and corresponding PMU
5511                  * data. Note that after rb::aux_mmap_count dropped to zero,
5512                  * they won't start any more (see perf_aux_output_begin()).
5513                  */
5514                 perf_pmu_output_stop(event);
5515
5516                 /* now it's safe to free the pages */
5517                 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5518                 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5519
5520                 /* this has to be the last one */
5521                 rb_free_aux(rb);
5522                 WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5523
5524                 mutex_unlock(&event->mmap_mutex);
5525         }
5526
5527         if (atomic_dec_and_test(&rb->mmap_count))
5528                 detach_rest = true;
5529
5530         if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5531                 goto out_put;
5532
5533         ring_buffer_attach(event, NULL);
5534         mutex_unlock(&event->mmap_mutex);
5535
5536         /* If there's still other mmap()s of this buffer, we're done. */
5537         if (!detach_rest)
5538                 goto out_put;
5539
5540         /*
5541          * No other mmap()s, detach from all other events that might redirect
5542          * into the now unreachable buffer. Somewhat complicated by the
5543          * fact that rb::event_lock otherwise nests inside mmap_mutex.
5544          */
5545 again:
5546         rcu_read_lock();
5547         list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5548                 if (!atomic_long_inc_not_zero(&event->refcount)) {
5549                         /*
5550                          * This event is en-route to free_event() which will
5551                          * detach it and remove it from the list.
5552                          */
5553                         continue;
5554                 }
5555                 rcu_read_unlock();
5556
5557                 mutex_lock(&event->mmap_mutex);
5558                 /*
5559                  * Check we didn't race with perf_event_set_output() which can
5560                  * swizzle the rb from under us while we were waiting to
5561                  * acquire mmap_mutex.
5562                  *
5563                  * If we find a different rb; ignore this event, a next
5564                  * iteration will no longer find it on the list. We have to
5565                  * still restart the iteration to make sure we're not now
5566                  * iterating the wrong list.
5567                  */
5568                 if (event->rb == rb)
5569                         ring_buffer_attach(event, NULL);
5570
5571                 mutex_unlock(&event->mmap_mutex);
5572                 put_event(event);
5573
5574                 /*
5575                  * Restart the iteration; either we're on the wrong list or
5576                  * destroyed its integrity by doing a deletion.
5577                  */
5578                 goto again;
5579         }
5580         rcu_read_unlock();
5581
5582         /*
5583          * It could be there's still a few 0-ref events on the list; they'll
5584          * get cleaned up by free_event() -- they'll also still have their
5585          * ref on the rb and will free it whenever they are done with it.
5586          *
5587          * Aside from that, this buffer is 'fully' detached and unmapped,
5588          * undo the VM accounting.
5589          */
5590
5591         atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5592         vma->vm_mm->pinned_vm -= mmap_locked;
5593         free_uid(mmap_user);
5594
5595 out_put:
5596         ring_buffer_put(rb); /* could be last */
5597 }
5598
5599 static const struct vm_operations_struct perf_mmap_vmops = {
5600         .open           = perf_mmap_open,
5601         .close          = perf_mmap_close, /* non mergable */
5602         .fault          = perf_mmap_fault,
5603         .page_mkwrite   = perf_mmap_fault,
5604 };
5605
5606 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5607 {
5608         struct perf_event *event = file->private_data;
5609         unsigned long user_locked, user_lock_limit;
5610         struct user_struct *user = current_user();
5611         unsigned long locked, lock_limit;
5612         struct ring_buffer *rb = NULL;
5613         unsigned long vma_size;
5614         unsigned long nr_pages;
5615         long user_extra = 0, extra = 0;
5616         int ret = 0, flags = 0;
5617
5618         /*
5619          * Don't allow mmap() of inherited per-task counters. This would
5620          * create a performance issue due to all children writing to the
5621          * same rb.
5622          */
5623         if (event->cpu == -1 && event->attr.inherit)
5624                 return -EINVAL;
5625
5626         if (!(vma->vm_flags & VM_SHARED))
5627                 return -EINVAL;
5628
5629         vma_size = vma->vm_end - vma->vm_start;
5630
5631         if (vma->vm_pgoff == 0) {
5632                 nr_pages = (vma_size / PAGE_SIZE) - 1;
5633         } else {
5634                 /*
5635                  * AUX area mapping: if rb->aux_nr_pages != 0, it's already
5636                  * mapped, all subsequent mappings should have the same size
5637                  * and offset. Must be above the normal perf buffer.
5638                  */
5639                 u64 aux_offset, aux_size;
5640
5641                 if (!event->rb)
5642                         return -EINVAL;
5643
5644                 nr_pages = vma_size / PAGE_SIZE;
5645
5646                 mutex_lock(&event->mmap_mutex);
5647                 ret = -EINVAL;
5648
5649                 rb = event->rb;
5650                 if (!rb)
5651                         goto aux_unlock;
5652
5653                 aux_offset = READ_ONCE(rb->user_page->aux_offset);
5654                 aux_size = READ_ONCE(rb->user_page->aux_size);
5655
5656                 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5657                         goto aux_unlock;
5658
5659                 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5660                         goto aux_unlock;
5661
5662                 /* already mapped with a different offset */
5663                 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5664                         goto aux_unlock;
5665
5666                 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5667                         goto aux_unlock;
5668
5669                 /* already mapped with a different size */
5670                 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5671                         goto aux_unlock;
5672
5673                 if (!is_power_of_2(nr_pages))
5674                         goto aux_unlock;
5675
5676                 if (!atomic_inc_not_zero(&rb->mmap_count))
5677                         goto aux_unlock;
5678
5679                 if (rb_has_aux(rb)) {
5680                         atomic_inc(&rb->aux_mmap_count);
5681                         ret = 0;
5682                         goto unlock;
5683                 }
5684
5685                 atomic_set(&rb->aux_mmap_count, 1);
5686                 user_extra = nr_pages;
5687
5688                 goto accounting;
5689         }
5690
5691         /*
5692          * If we have rb pages ensure they're a power-of-two number, so we
5693          * can do bitmasks instead of modulo.
5694          */
5695         if (nr_pages != 0 && !is_power_of_2(nr_pages))
5696                 return -EINVAL;
5697
5698         if (vma_size != PAGE_SIZE * (1 + nr_pages))
5699                 return -EINVAL;
5700
5701         WARN_ON_ONCE(event->ctx->parent_ctx);
5702 again:
5703         mutex_lock(&event->mmap_mutex);
5704         if (event->rb) {
5705                 if (event->rb->nr_pages != nr_pages) {
5706                         ret = -EINVAL;
5707                         goto unlock;
5708                 }
5709
5710                 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5711                         /*
5712                          * Raced against perf_mmap_close() through
5713                          * perf_event_set_output(). Try again, hope for better
5714                          * luck.
5715                          */
5716                         mutex_unlock(&event->mmap_mutex);
5717                         goto again;
5718                 }
5719
5720                 goto unlock;
5721         }
5722
5723         user_extra = nr_pages + 1;
5724
5725 accounting:
5726         user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
5727
5728         /*
5729          * Increase the limit linearly with more CPUs:
5730          */
5731         user_lock_limit *= num_online_cpus();
5732
5733         user_locked = atomic_long_read(&user->locked_vm);
5734
5735         /*
5736          * sysctl_perf_event_mlock may have changed, so that
5737          *     user->locked_vm > user_lock_limit
5738          */
5739         if (user_locked > user_lock_limit)
5740                 user_locked = user_lock_limit;
5741         user_locked += user_extra;
5742
5743         if (user_locked > user_lock_limit)
5744                 extra = user_locked - user_lock_limit;
5745
5746         lock_limit = rlimit(RLIMIT_MEMLOCK);
5747         lock_limit >>= PAGE_SHIFT;
5748         locked = vma->vm_mm->pinned_vm + extra;
5749
5750         if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5751                 !capable(CAP_IPC_LOCK)) {
5752                 ret = -EPERM;
5753                 goto unlock;
5754         }
5755
5756         WARN_ON(!rb && event->rb);
5757
5758         if (vma->vm_flags & VM_WRITE)
5759                 flags |= RING_BUFFER_WRITABLE;
5760
5761         if (!rb) {
5762                 rb = rb_alloc(nr_pages,
5763                               event->attr.watermark ? event->attr.wakeup_watermark : 0,
5764                               event->cpu, flags);
5765
5766                 if (!rb) {
5767                         ret = -ENOMEM;
5768                         goto unlock;
5769                 }
5770
5771                 atomic_set(&rb->mmap_count, 1);
5772                 rb->mmap_user = get_current_user();
5773                 rb->mmap_locked = extra;
5774
5775                 ring_buffer_attach(event, rb);
5776
5777                 perf_event_init_userpage(event);
5778                 perf_event_update_userpage(event);
5779         } else {
5780                 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5781                                    event->attr.aux_watermark, flags);
5782                 if (!ret)
5783                         rb->aux_mmap_locked = extra;
5784         }
5785
5786 unlock:
5787         if (!ret) {
5788                 atomic_long_add(user_extra, &user->locked_vm);
5789                 vma->vm_mm->pinned_vm += extra;
5790
5791                 atomic_inc(&event->mmap_count);
5792         } else if (rb) {
5793                 atomic_dec(&rb->mmap_count);
5794         }
5795 aux_unlock:
5796         mutex_unlock(&event->mmap_mutex);
5797
5798         /*
5799          * Since pinned accounting is per vm we cannot allow fork() to copy our
5800          * vma.
5801          */
5802         vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
5803         vma->vm_ops = &perf_mmap_vmops;
5804
5805         if (event->pmu->event_mapped)
5806                 event->pmu->event_mapped(event, vma->vm_mm);
5807
5808         return ret;
5809 }
5810
5811 static int perf_fasync(int fd, struct file *filp, int on)
5812 {
5813         struct inode *inode = file_inode(filp);
5814         struct perf_event *event = filp->private_data;
5815         int retval;
5816
5817         inode_lock(inode);
5818         retval = fasync_helper(fd, filp, on, &event->fasync);
5819         inode_unlock(inode);
5820
5821         if (retval < 0)
5822                 return retval;
5823
5824         return 0;
5825 }
5826
5827 static const struct file_operations perf_fops = {
5828         .llseek                 = no_llseek,
5829         .release                = perf_release,
5830         .read                   = perf_read,
5831         .poll                   = perf_poll,
5832         .unlocked_ioctl         = perf_ioctl,
5833         .compat_ioctl           = perf_compat_ioctl,
5834         .mmap                   = perf_mmap,
5835         .fasync                 = perf_fasync,
5836 };
5837
5838 /*
5839  * Perf event wakeup
5840  *
5841  * If there's data, ensure we set the poll() state and publish everything
5842  * to user-space before waking everybody up.
5843  */
5844
5845 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5846 {
5847         /* only the parent has fasync state */
5848         if (event->parent)
5849                 event = event->parent;
5850         return &event->fasync;
5851 }
5852
5853 void perf_event_wakeup(struct perf_event *event)
5854 {
5855         ring_buffer_wakeup(event);
5856
5857         if (event->pending_kill) {
5858                 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5859                 event->pending_kill = 0;
5860         }
5861 }
5862
5863 static void perf_pending_event_disable(struct perf_event *event)
5864 {
5865         int cpu = READ_ONCE(event->pending_disable);
5866
5867         if (cpu < 0)
5868                 return;
5869
5870         if (cpu == smp_processor_id()) {
5871                 WRITE_ONCE(event->pending_disable, -1);
5872                 perf_event_disable_local(event);
5873                 return;
5874         }
5875
5876         /*
5877          *  CPU-A                       CPU-B
5878          *
5879          *  perf_event_disable_inatomic()
5880          *    @pending_disable = CPU-A;
5881          *    irq_work_queue();
5882          *
5883          *  sched-out
5884          *    @pending_disable = -1;
5885          *
5886          *                              sched-in
5887          *                              perf_event_disable_inatomic()
5888          *                                @pending_disable = CPU-B;
5889          *                                irq_work_queue(); // FAILS
5890          *
5891          *  irq_work_run()
5892          *    perf_pending_event()
5893          *
5894          * But the event runs on CPU-B and wants disabling there.
5895          */
5896         irq_work_queue_on(&event->pending, cpu);
5897 }
5898
5899 static void perf_pending_event(struct irq_work *entry)
5900 {
5901         struct perf_event *event = container_of(entry, struct perf_event, pending);
5902         int rctx;
5903
5904         rctx = perf_swevent_get_recursion_context();
5905         /*
5906          * If we 'fail' here, that's OK, it means recursion is already disabled
5907          * and we won't recurse 'further'.
5908          */
5909
5910         perf_pending_event_disable(event);
5911
5912         if (event->pending_wakeup) {
5913                 event->pending_wakeup = 0;
5914                 perf_event_wakeup(event);
5915         }
5916
5917         if (rctx >= 0)
5918                 perf_swevent_put_recursion_context(rctx);
5919 }
5920
5921 /*
5922  * We assume there is only KVM supporting the callbacks.
5923  * Later on, we might change it to a list if there is
5924  * another virtualization implementation supporting the callbacks.
5925  */
5926 struct perf_guest_info_callbacks *perf_guest_cbs;
5927
5928 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5929 {
5930         perf_guest_cbs = cbs;
5931         return 0;
5932 }
5933 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5934
5935 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5936 {
5937         perf_guest_cbs = NULL;
5938         return 0;
5939 }
5940 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5941
5942 static void
5943 perf_output_sample_regs(struct perf_output_handle *handle,
5944                         struct pt_regs *regs, u64 mask)
5945 {
5946         int bit;
5947         DECLARE_BITMAP(_mask, 64);
5948
5949         bitmap_from_u64(_mask, mask);
5950         for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
5951                 u64 val;
5952
5953                 val = perf_reg_value(regs, bit);
5954                 perf_output_put(handle, val);
5955         }
5956 }
5957
5958 static void perf_sample_regs_user(struct perf_regs *regs_user,
5959                                   struct pt_regs *regs,
5960                                   struct pt_regs *regs_user_copy)
5961 {
5962         if (user_mode(regs)) {
5963                 regs_user->abi = perf_reg_abi(current);
5964                 regs_user->regs = regs;
5965         } else if (!(current->flags & PF_KTHREAD)) {
5966                 perf_get_regs_user(regs_user, regs, regs_user_copy);
5967         } else {
5968                 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5969                 regs_user->regs = NULL;
5970         }
5971 }
5972
5973 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5974                                   struct pt_regs *regs)
5975 {
5976         regs_intr->regs = regs;
5977         regs_intr->abi  = perf_reg_abi(current);
5978 }
5979
5980
5981 /*
5982  * Get remaining task size from user stack pointer.
5983  *
5984  * It'd be better to take stack vma map and limit this more
5985  * precisly, but there's no way to get it safely under interrupt,
5986  * so using TASK_SIZE as limit.
5987  */
5988 static u64 perf_ustack_task_size(struct pt_regs *regs)
5989 {
5990         unsigned long addr = perf_user_stack_pointer(regs);
5991
5992         if (!addr || addr >= TASK_SIZE)
5993                 return 0;
5994
5995         return TASK_SIZE - addr;
5996 }
5997
5998 static u16
5999 perf_sample_ustack_size(u16 stack_size, u16 header_size,
6000                         struct pt_regs *regs)
6001 {
6002         u64 task_size;
6003
6004         /* No regs, no stack pointer, no dump. */
6005         if (!regs)
6006                 return 0;
6007
6008         /*
6009          * Check if we fit in with the requested stack size into the:
6010          * - TASK_SIZE
6011          *   If we don't, we limit the size to the TASK_SIZE.
6012          *
6013          * - remaining sample size
6014          *   If we don't, we customize the stack size to
6015          *   fit in to the remaining sample size.
6016          */
6017
6018         task_size  = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
6019         stack_size = min(stack_size, (u16) task_size);
6020
6021         /* Current header size plus static size and dynamic size. */
6022         header_size += 2 * sizeof(u64);
6023
6024         /* Do we fit in with the current stack dump size? */
6025         if ((u16) (header_size + stack_size) < header_size) {
6026                 /*
6027                  * If we overflow the maximum size for the sample,
6028                  * we customize the stack dump size to fit in.
6029                  */
6030                 stack_size = USHRT_MAX - header_size - sizeof(u64);
6031                 stack_size = round_up(stack_size, sizeof(u64));
6032         }
6033
6034         return stack_size;
6035 }
6036
6037 static void
6038 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
6039                           struct pt_regs *regs)
6040 {
6041         /* Case of a kernel thread, nothing to dump */
6042         if (!regs) {
6043                 u64 size = 0;
6044                 perf_output_put(handle, size);
6045         } else {
6046                 unsigned long sp;
6047                 unsigned int rem;
6048                 u64 dyn_size;
6049                 mm_segment_t fs;
6050
6051                 /*
6052                  * We dump:
6053                  * static size
6054                  *   - the size requested by user or the best one we can fit
6055                  *     in to the sample max size
6056                  * data
6057                  *   - user stack dump data
6058                  * dynamic size
6059                  *   - the actual dumped size
6060                  */
6061
6062                 /* Static size. */
6063                 perf_output_put(handle, dump_size);
6064
6065                 /* Data. */
6066                 sp = perf_user_stack_pointer(regs);
6067                 fs = get_fs();
6068                 set_fs(USER_DS);
6069                 rem = __output_copy_user(handle, (void *) sp, dump_size);
6070                 set_fs(fs);
6071                 dyn_size = dump_size - rem;
6072
6073                 perf_output_skip(handle, rem);
6074
6075                 /* Dynamic size. */
6076                 perf_output_put(handle, dyn_size);
6077         }
6078 }
6079
6080 static void __perf_event_header__init_id(struct perf_event_header *header,
6081                                          struct perf_sample_data *data,
6082                                          struct perf_event *event)
6083 {
6084         u64 sample_type = event->attr.sample_type;
6085
6086         data->type = sample_type;
6087         header->size += event->id_header_size;
6088
6089         if (sample_type & PERF_SAMPLE_TID) {
6090                 /* namespace issues */
6091                 data->tid_entry.pid = perf_event_pid(event, current);
6092                 data->tid_entry.tid = perf_event_tid(event, current);
6093         }
6094
6095         if (sample_type & PERF_SAMPLE_TIME)
6096                 data->time = perf_event_clock(event);
6097
6098         if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
6099                 data->id = primary_event_id(event);
6100
6101         if (sample_type & PERF_SAMPLE_STREAM_ID)
6102                 data->stream_id = event->id;
6103
6104         if (sample_type & PERF_SAMPLE_CPU) {
6105                 data->cpu_entry.cpu      = raw_smp_processor_id();
6106                 data->cpu_entry.reserved = 0;
6107         }
6108 }
6109
6110 void perf_event_header__init_id(struct perf_event_header *header,
6111                                 struct perf_sample_data *data,
6112                                 struct perf_event *event)
6113 {
6114         if (event->attr.sample_id_all)
6115                 __perf_event_header__init_id(header, data, event);
6116 }
6117
6118 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
6119                                            struct perf_sample_data *data)
6120 {
6121         u64 sample_type = data->type;
6122
6123         if (sample_type & PERF_SAMPLE_TID)
6124                 perf_output_put(handle, data->tid_entry);
6125
6126         if (sample_type & PERF_SAMPLE_TIME)
6127                 perf_output_put(handle, data->time);
6128
6129         if (sample_type & PERF_SAMPLE_ID)
6130                 perf_output_put(handle, data->id);
6131
6132         if (sample_type & PERF_SAMPLE_STREAM_ID)
6133                 perf_output_put(handle, data->stream_id);
6134
6135         if (sample_type & PERF_SAMPLE_CPU)
6136                 perf_output_put(handle, data->cpu_entry);
6137
6138         if (sample_type & PERF_SAMPLE_IDENTIFIER)
6139                 perf_output_put(handle, data->id);
6140 }
6141
6142 void perf_event__output_id_sample(struct perf_event *event,
6143                                   struct perf_output_handle *handle,
6144                                   struct perf_sample_data *sample)
6145 {
6146         if (event->attr.sample_id_all)
6147                 __perf_event__output_id_sample(handle, sample);
6148 }
6149
6150 static void perf_output_read_one(struct perf_output_handle *handle,
6151                                  struct perf_event *event,
6152                                  u64 enabled, u64 running)
6153 {
6154         u64 read_format = event->attr.read_format;
6155         u64 values[4];
6156         int n = 0;
6157
6158         values[n++] = perf_event_count(event);
6159         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
6160                 values[n++] = enabled +
6161                         atomic64_read(&event->child_total_time_enabled);
6162         }
6163         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
6164                 values[n++] = running +
6165                         atomic64_read(&event->child_total_time_running);
6166         }
6167         if (read_format & PERF_FORMAT_ID)
6168                 values[n++] = primary_event_id(event);
6169
6170         __output_copy(handle, values, n * sizeof(u64));
6171 }
6172
6173 static void perf_output_read_group(struct perf_output_handle *handle,
6174                             struct perf_event *event,
6175                             u64 enabled, u64 running)
6176 {
6177         struct perf_event *leader = event->group_leader, *sub;
6178         u64 read_format = event->attr.read_format;
6179         u64 values[5];
6180         int n = 0;
6181
6182         values[n++] = 1 + leader->nr_siblings;
6183
6184         if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
6185                 values[n++] = enabled;
6186
6187         if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
6188                 values[n++] = running;
6189
6190         if ((leader != event) &&
6191             (leader->state == PERF_EVENT_STATE_ACTIVE))
6192                 leader->pmu->read(leader);
6193
6194         values[n++] = perf_event_count(leader);
6195         if (read_format & PERF_FORMAT_ID)
6196                 values[n++] = primary_event_id(leader);
6197
6198         __output_copy(handle, values, n * sizeof(u64));
6199
6200         for_each_sibling_event(sub, leader) {
6201                 n = 0;
6202
6203                 if ((sub != event) &&
6204                     (sub->state == PERF_EVENT_STATE_ACTIVE))
6205                         sub->pmu->read(sub);
6206
6207                 values[n++] = perf_event_count(sub);
6208                 if (read_format & PERF_FORMAT_ID)
6209                         values[n++] = primary_event_id(sub);
6210
6211                 __output_copy(handle, values, n * sizeof(u64));
6212         }
6213 }
6214
6215 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
6216                                  PERF_FORMAT_TOTAL_TIME_RUNNING)
6217
6218 /*
6219  * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
6220  *
6221  * The problem is that its both hard and excessively expensive to iterate the
6222  * child list, not to mention that its impossible to IPI the children running
6223  * on another CPU, from interrupt/NMI context.
6224  */
6225 static void perf_output_read(struct perf_output_handle *handle,
6226                              struct perf_event *event)
6227 {
6228         u64 enabled = 0, running = 0, now;
6229         u64 read_format = event->attr.read_format;
6230
6231         /*
6232          * compute total_time_enabled, total_time_running
6233          * based on snapshot values taken when the event
6234          * was last scheduled in.
6235          *
6236          * we cannot simply called update_context_time()
6237          * because of locking issue as we are called in
6238          * NMI context
6239          */
6240         if (read_format & PERF_FORMAT_TOTAL_TIMES)
6241                 calc_timer_values(event, &now, &enabled, &running);
6242
6243         if (event->attr.read_format & PERF_FORMAT_GROUP)
6244                 perf_output_read_group(handle, event, enabled, running);
6245         else
6246                 perf_output_read_one(handle, event, enabled, running);
6247 }
6248
6249 void perf_output_sample(struct perf_output_handle *handle,
6250                         struct perf_event_header *header,
6251                         struct perf_sample_data *data,
6252                         struct perf_event *event)
6253 {
6254         u64 sample_type = data->type;
6255
6256         perf_output_put(handle, *header);
6257
6258         if (sample_type & PERF_SAMPLE_IDENTIFIER)
6259                 perf_output_put(handle, data->id);
6260
6261         if (sample_type & PERF_SAMPLE_IP)
6262                 perf_output_put(handle, data->ip);
6263
6264         if (sample_type & PERF_SAMPLE_TID)
6265                 perf_output_put(handle, data->tid_entry);
6266
6267         if (sample_type & PERF_SAMPLE_TIME)
6268                 perf_output_put(handle, data->time);
6269
6270         if (sample_type & PERF_SAMPLE_ADDR)
6271                 perf_output_put(handle, data->addr);
6272
6273         if (sample_type & PERF_SAMPLE_ID)
6274                 perf_output_put(handle, data->id);
6275
6276         if (sample_type & PERF_SAMPLE_STREAM_ID)
6277                 perf_output_put(handle, data->stream_id);
6278
6279         if (sample_type & PERF_SAMPLE_CPU)
6280                 perf_output_put(handle, data->cpu_entry);
6281
6282         if (sample_type & PERF_SAMPLE_PERIOD)
6283                 perf_output_put(handle, data->period);
6284
6285         if (sample_type & PERF_SAMPLE_READ)
6286                 perf_output_read(handle, event);
6287
6288         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6289                 int size = 1;
6290
6291                 size += data->callchain->nr;
6292                 size *= sizeof(u64);
6293                 __output_copy(handle, data->callchain, size);
6294         }
6295
6296         if (sample_type & PERF_SAMPLE_RAW) {
6297                 struct perf_raw_record *raw = data->raw;
6298
6299                 if (raw) {
6300                         struct perf_raw_frag *frag = &raw->frag;
6301
6302                         perf_output_put(handle, raw->size);
6303                         do {
6304                                 if (frag->copy) {
6305                                         __output_custom(handle, frag->copy,
6306                                                         frag->data, frag->size);
6307                                 } else {
6308                                         __output_copy(handle, frag->data,
6309                                                       frag->size);
6310                                 }
6311                                 if (perf_raw_frag_last(frag))
6312                                         break;
6313                                 frag = frag->next;
6314                         } while (1);
6315                         if (frag->pad)
6316                                 __output_skip(handle, NULL, frag->pad);
6317                 } else {
6318                         struct {
6319                                 u32     size;
6320                                 u32     data;
6321                         } raw = {
6322                                 .size = sizeof(u32),
6323                                 .data = 0,
6324                         };
6325                         perf_output_put(handle, raw);
6326                 }
6327         }
6328
6329         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6330                 if (data->br_stack) {
6331                         size_t size;
6332
6333                         size = data->br_stack->nr
6334                              * sizeof(struct perf_branch_entry);
6335
6336                         perf_output_put(handle, data->br_stack->nr);
6337                         perf_output_copy(handle, data->br_stack->entries, size);
6338                 } else {
6339                         /*
6340                          * we always store at least the value of nr
6341                          */
6342                         u64 nr = 0;
6343                         perf_output_put(handle, nr);
6344                 }
6345         }
6346
6347         if (sample_type & PERF_SAMPLE_REGS_USER) {
6348                 u64 abi = data->regs_user.abi;
6349
6350                 /*
6351                  * If there are no regs to dump, notice it through
6352                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6353                  */
6354                 perf_output_put(handle, abi);
6355
6356                 if (abi) {
6357                         u64 mask = event->attr.sample_regs_user;
6358                         perf_output_sample_regs(handle,
6359                                                 data->regs_user.regs,
6360                                                 mask);
6361                 }
6362         }
6363
6364         if (sample_type & PERF_SAMPLE_STACK_USER) {
6365                 perf_output_sample_ustack(handle,
6366                                           data->stack_user_size,
6367                                           data->regs_user.regs);
6368         }
6369
6370         if (sample_type & PERF_SAMPLE_WEIGHT)
6371                 perf_output_put(handle, data->weight);
6372
6373         if (sample_type & PERF_SAMPLE_DATA_SRC)
6374                 perf_output_put(handle, data->data_src.val);
6375
6376         if (sample_type & PERF_SAMPLE_TRANSACTION)
6377                 perf_output_put(handle, data->txn);
6378
6379         if (sample_type & PERF_SAMPLE_REGS_INTR) {
6380                 u64 abi = data->regs_intr.abi;
6381                 /*
6382                  * If there are no regs to dump, notice it through
6383                  * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
6384                  */
6385                 perf_output_put(handle, abi);
6386
6387                 if (abi) {
6388                         u64 mask = event->attr.sample_regs_intr;
6389
6390                         perf_output_sample_regs(handle,
6391                                                 data->regs_intr.regs,
6392                                                 mask);
6393                 }
6394         }
6395
6396         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6397                 perf_output_put(handle, data->phys_addr);
6398
6399         if (!event->attr.watermark) {
6400                 int wakeup_events = event->attr.wakeup_events;
6401
6402                 if (wakeup_events) {
6403                         struct ring_buffer *rb = handle->rb;
6404                         int events = local_inc_return(&rb->events);
6405
6406                         if (events >= wakeup_events) {
6407                                 local_sub(wakeup_events, &rb->events);
6408                                 local_inc(&rb->wakeup);
6409                         }
6410                 }
6411         }
6412 }
6413
6414 static u64 perf_virt_to_phys(u64 virt)
6415 {
6416         u64 phys_addr = 0;
6417         struct page *p = NULL;
6418
6419         if (!virt)
6420                 return 0;
6421
6422         if (virt >= TASK_SIZE) {
6423                 /* If it's vmalloc()d memory, leave phys_addr as 0 */
6424                 if (virt_addr_valid((void *)(uintptr_t)virt) &&
6425                     !(virt >= VMALLOC_START && virt < VMALLOC_END))
6426                         phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
6427         } else {
6428                 /*
6429                  * Walking the pages tables for user address.
6430                  * Interrupts are disabled, so it prevents any tear down
6431                  * of the page tables.
6432                  * Try IRQ-safe __get_user_pages_fast first.
6433                  * If failed, leave phys_addr as 0.
6434                  */
6435                 if (current->mm != NULL) {
6436                         pagefault_disable();
6437                         if (__get_user_pages_fast(virt, 1, 0, &p) == 1)
6438                                 phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
6439                         pagefault_enable();
6440                 }
6441
6442                 if (p)
6443                         put_page(p);
6444         }
6445
6446         return phys_addr;
6447 }
6448
6449 static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
6450
6451 struct perf_callchain_entry *
6452 perf_callchain(struct perf_event *event, struct pt_regs *regs)
6453 {
6454         bool kernel = !event->attr.exclude_callchain_kernel;
6455         bool user   = !event->attr.exclude_callchain_user;
6456         /* Disallow cross-task user callchains. */
6457         bool crosstask = event->ctx->task && event->ctx->task != current;
6458         const u32 max_stack = event->attr.sample_max_stack;
6459         struct perf_callchain_entry *callchain;
6460
6461         if (!kernel && !user)
6462                 return &__empty_callchain;
6463
6464         callchain = get_perf_callchain(regs, 0, kernel, user,
6465                                        max_stack, crosstask, true);
6466         return callchain ?: &__empty_callchain;
6467 }
6468
6469 void perf_prepare_sample(struct perf_event_header *header,
6470                          struct perf_sample_data *data,
6471                          struct perf_event *event,
6472                          struct pt_regs *regs)
6473 {
6474         u64 sample_type = event->attr.sample_type;
6475
6476         header->type = PERF_RECORD_SAMPLE;
6477         header->size = sizeof(*header) + event->header_size;
6478
6479         header->misc = 0;
6480         header->misc |= perf_misc_flags(regs);
6481
6482         __perf_event_header__init_id(header, data, event);
6483
6484         if (sample_type & PERF_SAMPLE_IP)
6485                 data->ip = perf_instruction_pointer(regs);
6486
6487         if (sample_type & PERF_SAMPLE_CALLCHAIN) {
6488                 int size = 1;
6489
6490                 if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY))
6491                         data->callchain = perf_callchain(event, regs);
6492
6493                 size += data->callchain->nr;
6494
6495                 header->size += size * sizeof(u64);
6496         }
6497
6498         if (sample_type & PERF_SAMPLE_RAW) {
6499                 struct perf_raw_record *raw = data->raw;
6500                 int size;
6501
6502                 if (raw) {
6503                         struct perf_raw_frag *frag = &raw->frag;
6504                         u32 sum = 0;
6505
6506                         do {
6507                                 sum += frag->size;
6508                                 if (perf_raw_frag_last(frag))
6509                                         break;
6510                                 frag = frag->next;
6511                         } while (1);
6512
6513                         size = round_up(sum + sizeof(u32), sizeof(u64));
6514                         raw->size = size - sizeof(u32);
6515                         frag->pad = raw->size - sum;
6516                 } else {
6517                         size = sizeof(u64);
6518                 }
6519
6520                 header->size += size;
6521         }
6522
6523         if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
6524                 int size = sizeof(u64); /* nr */
6525                 if (data->br_stack) {
6526                         size += data->br_stack->nr
6527                               * sizeof(struct perf_branch_entry);
6528                 }
6529                 header->size += size;
6530         }
6531
6532         if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
6533                 perf_sample_regs_user(&data->regs_user, regs,
6534                                       &data->regs_user_copy);
6535
6536         if (sample_type & PERF_SAMPLE_REGS_USER) {
6537                 /* regs dump ABI info */
6538                 int size = sizeof(u64);
6539
6540                 if (data->regs_user.regs) {
6541                         u64 mask = event->attr.sample_regs_user;
6542                         size += hweight64(mask) * sizeof(u64);
6543                 }
6544
6545                 header->size += size;
6546         }
6547
6548         if (sample_type & PERF_SAMPLE_STACK_USER) {
6549                 /*
6550                  * Either we need PERF_SAMPLE_STACK_USER bit to be allways
6551                  * processed as the last one or have additional check added
6552                  * in case new sample type is added, because we could eat
6553                  * up the rest of the sample size.
6554                  */
6555                 u16 stack_size = event->attr.sample_stack_user;
6556                 u16 size = sizeof(u64);
6557
6558                 stack_size = perf_sample_ustack_size(stack_size, header->size,
6559                                                      data->regs_user.regs);
6560
6561                 /*
6562                  * If there is something to dump, add space for the dump
6563                  * itself and for the field that tells the dynamic size,
6564                  * which is how many have been actually dumped.
6565                  */
6566                 if (stack_size)
6567                         size += sizeof(u64) + stack_size;
6568
6569                 data->stack_user_size = stack_size;
6570                 header->size += size;
6571         }
6572
6573         if (sample_type & PERF_SAMPLE_REGS_INTR) {
6574                 /* regs dump ABI info */
6575                 int size = sizeof(u64);
6576
6577                 perf_sample_regs_intr(&data->regs_intr, regs);
6578
6579                 if (data->regs_intr.regs) {
6580                         u64 mask = event->attr.sample_regs_intr;
6581
6582                         size += hweight64(mask) * sizeof(u64);
6583                 }
6584
6585                 header->size += size;
6586         }
6587
6588         if (sample_type & PERF_SAMPLE_PHYS_ADDR)
6589                 data->phys_addr = perf_virt_to_phys(data->addr);
6590 }
6591
6592 static __always_inline void
6593 __perf_event_output(struct perf_event *event,
6594                     struct perf_sample_data *data,
6595                     struct pt_regs *regs,
6596                     int (*output_begin)(struct perf_output_handle *,
6597                                         struct perf_event *,
6598                                         unsigned int))
6599 {
6600         struct perf_output_handle handle;
6601         struct perf_event_header header;
6602
6603         /* protect the callchain buffers */
6604         rcu_read_lock();
6605
6606         perf_prepare_sample(&header, data, event, regs);
6607
6608         if (output_begin(&handle, event, header.size))
6609                 goto exit;
6610
6611         perf_output_sample(&handle, &header, data, event);
6612
6613         perf_output_end(&handle);
6614
6615 exit:
6616         rcu_read_unlock();
6617 }
6618
6619 void
6620 perf_event_output_forward(struct perf_event *event,
6621                          struct perf_sample_data *data,
6622                          struct pt_regs *regs)
6623 {
6624         __perf_event_output(event, data, regs, perf_output_begin_forward);
6625 }
6626
6627 void
6628 perf_event_output_backward(struct perf_event *event,
6629                            struct perf_sample_data *data,
6630                            struct pt_regs *regs)
6631 {
6632         __perf_event_output(event, data, regs, perf_output_begin_backward);
6633 }
6634
6635 void
6636 perf_event_output(struct perf_event *event,
6637                   struct perf_sample_data *data,
6638                   struct pt_regs *regs)
6639 {
6640         __perf_event_output(event, data, regs, perf_output_begin);
6641 }
6642
6643 /*
6644  * read event_id
6645  */
6646
6647 struct perf_read_event {
6648         struct perf_event_header        header;
6649
6650         u32                             pid;
6651         u32                             tid;
6652 };
6653
6654 static void
6655 perf_event_read_event(struct perf_event *event,
6656                         struct task_struct *task)
6657 {
6658         struct perf_output_handle handle;
6659         struct perf_sample_data sample;
6660         struct perf_read_event read_event = {
6661                 .header = {
6662                         .type = PERF_RECORD_READ,
6663                         .misc = 0,
6664                         .size = sizeof(read_event) + event->read_size,
6665                 },
6666                 .pid = perf_event_pid(event, task),
6667                 .tid = perf_event_tid(event, task),
6668         };
6669         int ret;
6670
6671         perf_event_header__init_id(&read_event.header, &sample, event);
6672         ret = perf_output_begin(&handle, event, read_event.header.size);
6673         if (ret)
6674                 return;
6675
6676         perf_output_put(&handle, read_event);
6677         perf_output_read(&handle, event);
6678         perf_event__output_id_sample(event, &handle, &sample);
6679
6680         perf_output_end(&handle);
6681 }
6682
6683 typedef void (perf_iterate_f)(struct perf_event *event, void *data);
6684
6685 static void
6686 perf_iterate_ctx(struct perf_event_context *ctx,
6687                    perf_iterate_f output,
6688                    void *data, bool all)
6689 {
6690         struct perf_event *event;
6691
6692         list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6693                 if (!all) {
6694                         if (event->state < PERF_EVENT_STATE_INACTIVE)
6695                                 continue;
6696                         if (!event_filter_match(event))
6697                                 continue;
6698                 }
6699
6700                 output(event, data);
6701         }
6702 }
6703
6704 static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
6705 {
6706         struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
6707         struct perf_event *event;
6708
6709         list_for_each_entry_rcu(event, &pel->list, sb_list) {
6710                 /*
6711                  * Skip events that are not fully formed yet; ensure that
6712                  * if we observe event->ctx, both event and ctx will be
6713                  * complete enough. See perf_install_in_context().
6714                  */
6715                 if (!smp_load_acquire(&event->ctx))
6716                         continue;
6717
6718                 if (event->state < PERF_EVENT_STATE_INACTIVE)
6719                         continue;
6720                 if (!event_filter_match(event))
6721                         continue;
6722                 output(event, data);
6723         }
6724 }
6725
6726 /*
6727  * Iterate all events that need to receive side-band events.
6728  *
6729  * For new callers; ensure that account_pmu_sb_event() includes
6730  * your event, otherwise it might not get delivered.
6731  */
6732 static void
6733 perf_iterate_sb(perf_iterate_f output, void *data,
6734                struct perf_event_context *task_ctx)
6735 {
6736         struct perf_event_context *ctx;
6737         int ctxn;
6738
6739         rcu_read_lock();
6740         preempt_disable();
6741
6742         /*
6743          * If we have task_ctx != NULL we only notify the task context itself.
6744          * The task_ctx is set only for EXIT events before releasing task
6745          * context.
6746          */
6747         if (task_ctx) {
6748                 perf_iterate_ctx(task_ctx, output, data, false);
6749                 goto done;
6750         }
6751
6752         perf_iterate_sb_cpu(output, data);
6753
6754         for_each_task_context_nr(ctxn) {
6755                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6756                 if (ctx)
6757                         perf_iterate_ctx(ctx, output, data, false);
6758         }
6759 done:
6760         preempt_enable();
6761         rcu_read_unlock();
6762 }
6763
6764 /*
6765  * Clear all file-based filters at exec, they'll have to be
6766  * re-instated when/if these objects are mmapped again.
6767  */
6768 static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
6769 {
6770         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6771         struct perf_addr_filter *filter;
6772         unsigned int restart = 0, count = 0;
6773         unsigned long flags;
6774
6775         if (!has_addr_filter(event))
6776                 return;
6777
6778         raw_spin_lock_irqsave(&ifh->lock, flags);
6779         list_for_each_entry(filter, &ifh->list, entry) {
6780                 if (filter->path.dentry) {
6781                         event->addr_filter_ranges[count].start = 0;
6782                         event->addr_filter_ranges[count].size = 0;
6783                         restart++;
6784                 }
6785
6786                 count++;
6787         }
6788
6789         if (restart)
6790                 event->addr_filters_gen++;
6791         raw_spin_unlock_irqrestore(&ifh->lock, flags);
6792
6793         if (restart)
6794                 perf_event_stop(event, 1);
6795 }
6796
6797 void perf_event_exec(void)
6798 {
6799         struct perf_event_context *ctx;
6800         int ctxn;
6801
6802         rcu_read_lock();
6803         for_each_task_context_nr(ctxn) {
6804                 ctx = current->perf_event_ctxp[ctxn];
6805                 if (!ctx)
6806                         continue;
6807
6808                 perf_event_enable_on_exec(ctxn);
6809
6810                 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
6811                                    true);
6812         }
6813         rcu_read_unlock();
6814 }
6815
6816 struct remote_output {
6817         struct ring_buffer      *rb;
6818         int                     err;
6819 };
6820
6821 static void __perf_event_output_stop(struct perf_event *event, void *data)
6822 {
6823         struct perf_event *parent = event->parent;
6824         struct remote_output *ro = data;
6825         struct ring_buffer *rb = ro->rb;
6826         struct stop_event_data sd = {
6827                 .event  = event,
6828         };
6829
6830         if (!has_aux(event))
6831                 return;
6832
6833         if (!parent)
6834                 parent = event;
6835
6836         /*
6837          * In case of inheritance, it will be the parent that links to the
6838          * ring-buffer, but it will be the child that's actually using it.
6839          *
6840          * We are using event::rb to determine if the event should be stopped,
6841          * however this may race with ring_buffer_attach() (through set_output),
6842          * which will make us skip the event that actually needs to be stopped.
6843          * So ring_buffer_attach() has to stop an aux event before re-assigning
6844          * its rb pointer.
6845          */
6846         if (rcu_dereference(parent->rb) == rb)
6847                 ro->err = __perf_event_stop(&sd);
6848 }
6849
6850 static int __perf_pmu_output_stop(void *info)
6851 {
6852         struct perf_event *event = info;
6853         struct pmu *pmu = event->ctx->pmu;
6854         struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6855         struct remote_output ro = {
6856                 .rb     = event->rb,
6857         };
6858
6859         rcu_read_lock();
6860         perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
6861         if (cpuctx->task_ctx)
6862                 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
6863                                    &ro, false);
6864         rcu_read_unlock();
6865
6866         return ro.err;
6867 }
6868
6869 static void perf_pmu_output_stop(struct perf_event *event)
6870 {
6871         struct perf_event *iter;
6872         int err, cpu;
6873
6874 restart:
6875         rcu_read_lock();
6876         list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
6877                 /*
6878                  * For per-CPU events, we need to make sure that neither they
6879                  * nor their children are running; for cpu==-1 events it's
6880                  * sufficient to stop the event itself if it's active, since
6881                  * it can't have children.
6882                  */
6883                 cpu = iter->cpu;
6884                 if (cpu == -1)
6885                         cpu = READ_ONCE(iter->oncpu);
6886
6887                 if (cpu == -1)
6888                         continue;
6889
6890                 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
6891                 if (err == -EAGAIN) {
6892                         rcu_read_unlock();
6893                         goto restart;
6894                 }
6895         }
6896         rcu_read_unlock();
6897 }
6898
6899 /*
6900  * task tracking -- fork/exit
6901  *
6902  * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
6903  */
6904
6905 struct perf_task_event {
6906         struct task_struct              *task;
6907         struct perf_event_context       *task_ctx;
6908
6909         struct {
6910                 struct perf_event_header        header;
6911
6912                 u32                             pid;
6913                 u32                             ppid;
6914                 u32                             tid;
6915                 u32                             ptid;
6916                 u64                             time;
6917         } event_id;
6918 };
6919
6920 static int perf_event_task_match(struct perf_event *event)
6921 {
6922         return event->attr.comm  || event->attr.mmap ||
6923                event->attr.mmap2 || event->attr.mmap_data ||
6924                event->attr.task;
6925 }
6926
6927 static void perf_event_task_output(struct perf_event *event,
6928                                    void *data)
6929 {
6930         struct perf_task_event *task_event = data;
6931         struct perf_output_handle handle;
6932         struct perf_sample_data sample;
6933         struct task_struct *task = task_event->task;
6934         int ret, size = task_event->event_id.header.size;
6935
6936         if (!perf_event_task_match(event))
6937                 return;
6938
6939         perf_event_header__init_id(&task_event->event_id.header, &sample, event);
6940
6941         ret = perf_output_begin(&handle, event,
6942                                 task_event->event_id.header.size);
6943         if (ret)
6944                 goto out;
6945
6946         task_event->event_id.pid = perf_event_pid(event, task);
6947         task_event->event_id.tid = perf_event_tid(event, task);
6948
6949         if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
6950                 task_event->event_id.ppid = perf_event_pid(event,
6951                                                         task->real_parent);
6952                 task_event->event_id.ptid = perf_event_pid(event,
6953                                                         task->real_parent);
6954         } else {  /* PERF_RECORD_FORK */
6955                 task_event->event_id.ppid = perf_event_pid(event, current);
6956                 task_event->event_id.ptid = perf_event_tid(event, current);
6957         }
6958
6959         task_event->event_id.time = perf_event_clock(event);
6960
6961         perf_output_put(&handle, task_event->event_id);
6962
6963         perf_event__output_id_sample(event, &handle, &sample);
6964
6965         perf_output_end(&handle);
6966 out:
6967         task_event->event_id.header.size = size;
6968 }
6969
6970 static void perf_event_task(struct task_struct *task,
6971                               struct perf_event_context *task_ctx,
6972                               int new)
6973 {
6974         struct perf_task_event task_event;
6975
6976         if (!atomic_read(&nr_comm_events) &&
6977             !atomic_read(&nr_mmap_events) &&
6978             !atomic_read(&nr_task_events))
6979                 return;
6980
6981         task_event = (struct perf_task_event){
6982                 .task     = task,
6983                 .task_ctx = task_ctx,
6984                 .event_id    = {
6985                         .header = {
6986                                 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
6987                                 .misc = 0,
6988                                 .size = sizeof(task_event.event_id),
6989                         },
6990                         /* .pid  */
6991                         /* .ppid */
6992                         /* .tid  */
6993                         /* .ptid */
6994                         /* .time */
6995                 },
6996         };
6997
6998         perf_iterate_sb(perf_event_task_output,
6999                        &task_event,
7000                        task_ctx);
7001 }
7002
7003 void perf_event_fork(struct task_struct *task)
7004 {
7005         perf_event_task(task, NULL, 1);
7006         perf_event_namespaces(task);
7007 }
7008
7009 /*
7010  * comm tracking
7011  */
7012
7013 struct perf_comm_event {
7014         struct task_struct      *task;
7015         char                    *comm;
7016         int                     comm_size;
7017
7018         struct {
7019                 struct perf_event_header        header;
7020
7021                 u32                             pid;
7022                 u32                             tid;
7023         } event_id;
7024 };
7025
7026 static int perf_event_comm_match(struct perf_event *event)
7027 {
7028         return event->attr.comm;
7029 }
7030
7031 static void perf_event_comm_output(struct perf_event *event,
7032                                    void *data)
7033 {
7034         struct perf_comm_event *comm_event = data;
7035         struct perf_output_handle handle;
7036         struct perf_sample_data sample;
7037         int size = comm_event->event_id.header.size;
7038         int ret;
7039
7040         if (!perf_event_comm_match(event))
7041                 return;
7042
7043         perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
7044         ret = perf_output_begin(&handle, event,
7045                                 comm_event->event_id.header.size);
7046
7047         if (ret)
7048                 goto out;
7049
7050         comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
7051         comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
7052
7053         perf_output_put(&handle, comm_event->event_id);
7054         __output_copy(&handle, comm_event->comm,
7055                                    comm_event->comm_size);
7056
7057         perf_event__output_id_sample(event, &handle, &sample);
7058
7059         perf_output_end(&handle);
7060 out:
7061         comm_event->event_id.header.size = size;
7062 }
7063
7064 static void perf_event_comm_event(struct perf_comm_event *comm_event)
7065 {
7066         char comm[TASK_COMM_LEN];
7067         unsigned int size;
7068
7069         memset(comm, 0, sizeof(comm));
7070         strlcpy(comm, comm_event->task->comm, sizeof(comm));
7071         size = ALIGN(strlen(comm)+1, sizeof(u64));
7072
7073         comm_event->comm = comm;
7074         comm_event->comm_size = size;
7075
7076         comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
7077
7078         perf_iterate_sb(perf_event_comm_output,
7079                        comm_event,
7080                        NULL);
7081 }
7082
7083 void perf_event_comm(struct task_struct *task, bool exec)
7084 {
7085         struct perf_comm_event comm_event;
7086
7087         if (!atomic_read(&nr_comm_events))
7088                 return;
7089
7090         comm_event = (struct perf_comm_event){
7091                 .task   = task,
7092                 /* .comm      */
7093                 /* .comm_size */
7094                 .event_id  = {
7095                         .header = {
7096                                 .type = PERF_RECORD_COMM,
7097                                 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
7098                                 /* .size */
7099                         },
7100                         /* .pid */
7101                         /* .tid */
7102                 },
7103         };
7104
7105         perf_event_comm_event(&comm_event);
7106 }
7107
7108 /*
7109  * namespaces tracking
7110  */
7111
7112 struct perf_namespaces_event {
7113         struct task_struct              *task;
7114
7115         struct {
7116                 struct perf_event_header        header;
7117
7118                 u32                             pid;
7119                 u32                             tid;
7120                 u64                             nr_namespaces;
7121                 struct perf_ns_link_info        link_info[NR_NAMESPACES];
7122         } event_id;
7123 };
7124
7125 static int perf_event_namespaces_match(struct perf_event *event)
7126 {
7127         return event->attr.namespaces;
7128 }
7129
7130 static void perf_event_namespaces_output(struct perf_event *event,
7131                                          void *data)
7132 {
7133         struct perf_namespaces_event *namespaces_event = data;
7134         struct perf_output_handle handle;
7135         struct perf_sample_data sample;
7136         u16 header_size = namespaces_event->event_id.header.size;
7137         int ret;
7138
7139         if (!perf_event_namespaces_match(event))
7140                 return;
7141
7142         perf_event_header__init_id(&namespaces_event->event_id.header,
7143                                    &sample, event);
7144         ret = perf_output_begin(&handle, event,
7145                                 namespaces_event->event_id.header.size);
7146         if (ret)
7147                 goto out;
7148
7149         namespaces_event->event_id.pid = perf_event_pid(event,
7150                                                         namespaces_event->task);
7151         namespaces_event->event_id.tid = perf_event_tid(event,
7152                                                         namespaces_event->task);
7153
7154         perf_output_put(&handle, namespaces_event->event_id);
7155
7156         perf_event__output_id_sample(event, &handle, &sample);
7157
7158         perf_output_end(&handle);
7159 out:
7160         namespaces_event->event_id.header.size = header_size;
7161 }
7162
7163 static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info,
7164                                    struct task_struct *task,
7165                                    const struct proc_ns_operations *ns_ops)
7166 {
7167         struct path ns_path;
7168         struct inode *ns_inode;
7169         void *error;
7170
7171         error = ns_get_path(&ns_path, task, ns_ops);
7172         if (!error) {
7173                 ns_inode = ns_path.dentry->d_inode;
7174                 ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev);
7175                 ns_link_info->ino = ns_inode->i_ino;
7176                 path_put(&ns_path);
7177         }
7178 }
7179
7180 void perf_event_namespaces(struct task_struct *task)
7181 {
7182         struct perf_namespaces_event namespaces_event;
7183         struct perf_ns_link_info *ns_link_info;
7184
7185         if (!atomic_read(&nr_namespaces_events))
7186                 return;
7187
7188         namespaces_event = (struct perf_namespaces_event){
7189                 .task   = task,
7190                 .event_id  = {
7191                         .header = {
7192                                 .type = PERF_RECORD_NAMESPACES,
7193                                 .misc = 0,
7194                                 .size = sizeof(namespaces_event.event_id),
7195                         },
7196                         /* .pid */
7197                         /* .tid */
7198                         .nr_namespaces = NR_NAMESPACES,
7199                         /* .link_info[NR_NAMESPACES] */
7200                 },
7201         };
7202
7203         ns_link_info = namespaces_event.event_id.link_info;
7204
7205         perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX],
7206                                task, &mntns_operations);
7207
7208 #ifdef CONFIG_USER_NS
7209         perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX],
7210                                task, &userns_operations);
7211 #endif
7212 #ifdef CONFIG_NET_NS
7213         perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX],
7214                                task, &netns_operations);
7215 #endif
7216 #ifdef CONFIG_UTS_NS
7217         perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX],
7218                                task, &utsns_operations);
7219 #endif
7220 #ifdef CONFIG_IPC_NS
7221         perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX],
7222                                task, &ipcns_operations);
7223 #endif
7224 #ifdef CONFIG_PID_NS
7225         perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX],
7226                                task, &pidns_operations);
7227 #endif
7228 #ifdef CONFIG_CGROUPS
7229         perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX],
7230                                task, &cgroupns_operations);
7231 #endif
7232
7233         perf_iterate_sb(perf_event_namespaces_output,
7234                         &namespaces_event,
7235                         NULL);
7236 }
7237
7238 /*
7239  * mmap tracking
7240  */
7241
7242 struct perf_mmap_event {
7243         struct vm_area_struct   *vma;
7244
7245         const char              *file_name;
7246         int                     file_size;
7247         int                     maj, min;
7248         u64                     ino;
7249         u64                     ino_generation;
7250         u32                     prot, flags;
7251
7252         struct {
7253                 struct perf_event_header        header;
7254
7255                 u32                             pid;
7256                 u32                             tid;
7257                 u64                             start;
7258                 u64                             len;
7259                 u64                             pgoff;
7260         } event_id;
7261 };
7262
7263 static int perf_event_mmap_match(struct perf_event *event,
7264                                  void *data)
7265 {
7266         struct perf_mmap_event *mmap_event = data;
7267         struct vm_area_struct *vma = mmap_event->vma;
7268         int executable = vma->vm_flags & VM_EXEC;
7269
7270         return (!executable && event->attr.mmap_data) ||
7271                (executable && (event->attr.mmap || event->attr.mmap2));
7272 }
7273
7274 static void perf_event_mmap_output(struct perf_event *event,
7275                                    void *data)
7276 {
7277         struct perf_mmap_event *mmap_event = data;
7278         struct perf_output_handle handle;
7279         struct perf_sample_data sample;
7280         int size = mmap_event->event_id.header.size;
7281         u32 type = mmap_event->event_id.header.type;
7282         int ret;
7283
7284         if (!perf_event_mmap_match(event, data))
7285                 return;
7286
7287         if (event->attr.mmap2) {
7288                 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
7289                 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
7290                 mmap_event->event_id.header.size += sizeof(mmap_event->min);
7291                 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
7292                 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
7293                 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
7294                 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
7295         }
7296
7297         perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
7298         ret = perf_output_begin(&handle, event,
7299                                 mmap_event->event_id.header.size);
7300         if (ret)
7301                 goto out;
7302
7303         mmap_event->event_id.pid = perf_event_pid(event, current);
7304         mmap_event->event_id.tid = perf_event_tid(event, current);
7305
7306         perf_output_put(&handle, mmap_event->event_id);
7307
7308         if (event->attr.mmap2) {
7309                 perf_output_put(&handle, mmap_event->maj);
7310                 perf_output_put(&handle, mmap_event->min);
7311                 perf_output_put(&handle, mmap_event->ino);
7312                 perf_output_put(&handle, mmap_event->ino_generation);
7313                 perf_output_put(&handle, mmap_event->prot);
7314                 perf_output_put(&handle, mmap_event->flags);
7315         }
7316
7317         __output_copy(&handle, mmap_event->file_name,
7318                                    mmap_event->file_size);
7319
7320         perf_event__output_id_sample(event, &handle, &sample);
7321
7322         perf_output_end(&handle);
7323 out:
7324         mmap_event->event_id.header.size = size;
7325         mmap_event->event_id.header.type = type;
7326 }
7327
7328 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
7329 {
7330         struct vm_area_struct *vma = mmap_event->vma;
7331         struct file *file = vma->vm_file;
7332         int maj = 0, min = 0;
7333         u64 ino = 0, gen = 0;
7334         u32 prot = 0, flags = 0;
7335         unsigned int size;
7336         char tmp[16];
7337         char *buf = NULL;
7338         char *name;
7339
7340         if (vma->vm_flags & VM_READ)
7341                 prot |= PROT_READ;
7342         if (vma->vm_flags & VM_WRITE)
7343                 prot |= PROT_WRITE;
7344         if (vma->vm_flags & VM_EXEC)
7345                 prot |= PROT_EXEC;
7346
7347         if (vma->vm_flags & VM_MAYSHARE)
7348                 flags = MAP_SHARED;
7349         else
7350                 flags = MAP_PRIVATE;
7351
7352         if (vma->vm_flags & VM_DENYWRITE)
7353                 flags |= MAP_DENYWRITE;
7354         if (vma->vm_flags & VM_MAYEXEC)
7355                 flags |= MAP_EXECUTABLE;
7356         if (vma->vm_flags & VM_LOCKED)
7357                 flags |= MAP_LOCKED;
7358         if (vma->vm_flags & VM_HUGETLB)
7359                 flags |= MAP_HUGETLB;
7360
7361         if (file) {
7362                 struct inode *inode;
7363                 dev_t dev;
7364
7365                 buf = kmalloc(PATH_MAX, GFP_KERNEL);
7366                 if (!buf) {
7367                         name = "//enomem";
7368                         goto cpy_name;
7369                 }
7370                 /*
7371                  * d_path() works from the end of the rb backwards, so we
7372                  * need to add enough zero bytes after the string to handle
7373                  * the 64bit alignment we do later.
7374                  */
7375                 name = file_path(file, buf, PATH_MAX - sizeof(u64));
7376                 if (IS_ERR(name)) {
7377                         name = "//toolong";
7378                         goto cpy_name;
7379                 }
7380                 inode = file_inode(vma->vm_file);
7381                 dev = inode->i_sb->s_dev;
7382                 ino = inode->i_ino;
7383                 gen = inode->i_generation;
7384                 maj = MAJOR(dev);
7385                 min = MINOR(dev);
7386
7387                 goto got_name;
7388         } else {
7389                 if (vma->vm_ops && vma->vm_ops->name) {
7390                         name = (char *) vma->vm_ops->name(vma);
7391                         if (name)
7392                                 goto cpy_name;
7393                 }
7394
7395                 name = (char *)arch_vma_name(vma);
7396                 if (name)
7397                         goto cpy_name;
7398
7399                 if (vma->vm_start <= vma->vm_mm->start_brk &&
7400                                 vma->vm_end >= vma->vm_mm->brk) {
7401                         name = "[heap]";
7402                         goto cpy_name;
7403                 }
7404                 if (vma->vm_start <= vma->vm_mm->start_stack &&
7405                                 vma->vm_end >= vma->vm_mm->start_stack) {
7406                         name = "[stack]";
7407                         goto cpy_name;
7408                 }
7409
7410                 name = "//anon";
7411                 goto cpy_name;
7412         }
7413
7414 cpy_name:
7415         strlcpy(tmp, name, sizeof(tmp));
7416         name = tmp;
7417 got_name:
7418         /*
7419          * Since our buffer works in 8 byte units we need to align our string
7420          * size to a multiple of 8. However, we must guarantee the tail end is
7421          * zero'd out to avoid leaking random bits to userspace.
7422          */
7423         size = strlen(name)+1;
7424         while (!IS_ALIGNED(size, sizeof(u64)))
7425                 name[size++] = '\0';
7426
7427         mmap_event->file_name = name;
7428         mmap_event->file_size = size;
7429         mmap_event->maj = maj;
7430         mmap_event->min = min;
7431         mmap_event->ino = ino;
7432         mmap_event->ino_generation = gen;
7433         mmap_event->prot = prot;
7434         mmap_event->flags = flags;
7435
7436         if (!(vma->vm_flags & VM_EXEC))
7437                 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
7438
7439         mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
7440
7441         perf_iterate_sb(perf_event_mmap_output,
7442                        mmap_event,
7443                        NULL);
7444
7445         kfree(buf);
7446 }
7447
7448 /*
7449  * Check whether inode and address range match filter criteria.
7450  */
7451 static bool perf_addr_filter_match(struct perf_addr_filter *filter,
7452                                      struct file *file, unsigned long offset,
7453                                      unsigned long size)
7454 {
7455         /* d_inode(NULL) won't be equal to any mapped user-space file */
7456         if (!filter->path.dentry)
7457                 return false;
7458
7459         if (d_inode(filter->path.dentry) != file_inode(file))
7460                 return false;
7461
7462         if (filter->offset > offset + size)
7463                 return false;
7464
7465         if (filter->offset + filter->size < offset)
7466                 return false;
7467
7468         return true;
7469 }
7470
7471 static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
7472                                         struct vm_area_struct *vma,
7473                                         struct perf_addr_filter_range *fr)
7474 {
7475         unsigned long vma_size = vma->vm_end - vma->vm_start;
7476         unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
7477         struct file *file = vma->vm_file;
7478
7479         if (!perf_addr_filter_match(filter, file, off, vma_size))
7480                 return false;
7481
7482         if (filter->offset < off) {
7483                 fr->start = vma->vm_start;
7484                 fr->size = min(vma_size, filter->size - (off - filter->offset));
7485         } else {
7486                 fr->start = vma->vm_start + filter->offset - off;
7487                 fr->size = min(vma->vm_end - fr->start, filter->size);
7488         }
7489
7490         return true;
7491 }
7492
7493 static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
7494 {
7495         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
7496         struct vm_area_struct *vma = data;
7497         struct perf_addr_filter *filter;
7498         unsigned int restart = 0, count = 0;
7499         unsigned long flags;
7500
7501         if (!has_addr_filter(event))
7502                 return;
7503
7504         if (!vma->vm_file)
7505                 return;
7506
7507         raw_spin_lock_irqsave(&ifh->lock, flags);
7508         list_for_each_entry(filter, &ifh->list, entry) {
7509                 if (perf_addr_filter_vma_adjust(filter, vma,
7510                                                 &event->addr_filter_ranges[count]))
7511                         restart++;
7512
7513                 count++;
7514         }
7515
7516         if (restart)
7517                 event->addr_filters_gen++;
7518         raw_spin_unlock_irqrestore(&ifh->lock, flags);
7519
7520         if (restart)
7521                 perf_event_stop(event, 1);
7522 }
7523
7524 /*
7525  * Adjust all task's events' filters to the new vma
7526  */
7527 static void perf_addr_filters_adjust(struct vm_area_struct *vma)
7528 {
7529         struct perf_event_context *ctx;
7530         int ctxn;
7531
7532         /*
7533          * Data tracing isn't supported yet and as such there is no need
7534          * to keep track of anything that isn't related to executable code:
7535          */
7536         if (!(vma->vm_flags & VM_EXEC))
7537                 return;
7538
7539         rcu_read_lock();
7540         for_each_task_context_nr(ctxn) {
7541                 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
7542                 if (!ctx)
7543                         continue;
7544
7545                 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
7546         }
7547         rcu_read_unlock();
7548 }
7549
7550 void perf_event_mmap(struct vm_area_struct *vma)
7551 {
7552         struct perf_mmap_event mmap_event;
7553
7554         if (!atomic_read(&nr_mmap_events))
7555                 return;
7556
7557         mmap_event = (struct perf_mmap_event){
7558                 .vma    = vma,
7559                 /* .file_name */
7560                 /* .file_size */
7561                 .event_id  = {
7562                         .header = {
7563                                 .type = PERF_RECORD_MMAP,
7564                                 .misc = PERF_RECORD_MISC_USER,
7565                                 /* .size */
7566                         },
7567                         /* .pid */
7568                         /* .tid */
7569                         .start  = vma->vm_start,
7570                         .len    = vma->vm_end - vma->vm_start,
7571                         .pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
7572                 },
7573                 /* .maj (attr_mmap2 only) */
7574                 /* .min (attr_mmap2 only) */
7575                 /* .ino (attr_mmap2 only) */
7576                 /* .ino_generation (attr_mmap2 only) */
7577                 /* .prot (attr_mmap2 only) */
7578                 /* .flags (attr_mmap2 only) */
7579         };
7580
7581         perf_addr_filters_adjust(vma);
7582         perf_event_mmap_event(&mmap_event);
7583 }
7584
7585 void perf_event_aux_event(struct perf_event *event, unsigned long head,
7586                           unsigned long size, u64 flags)
7587 {
7588         struct perf_output_handle handle;
7589         struct perf_sample_data sample;
7590         struct perf_aux_event {
7591                 struct perf_event_header        header;
7592                 u64                             offset;
7593                 u64                             size;
7594                 u64                             flags;
7595         } rec = {
7596                 .header = {
7597                         .type = PERF_RECORD_AUX,
7598                         .misc = 0,
7599                         .size = sizeof(rec),
7600                 },
7601                 .offset         = head,
7602                 .size           = size,
7603                 .flags          = flags,
7604         };
7605         int ret;
7606
7607         perf_event_header__init_id(&rec.header, &sample, event);
7608         ret = perf_output_begin(&handle, event, rec.header.size);
7609
7610         if (ret)
7611                 return;
7612
7613         perf_output_put(&handle, rec);
7614         perf_event__output_id_sample(event, &handle, &sample);
7615
7616         perf_output_end(&handle);
7617 }
7618
7619 /*
7620  * Lost/dropped samples logging
7621  */
7622 void perf_log_lost_samples(struct perf_event *event, u64 lost)
7623 {
7624         struct perf_output_handle handle;
7625         struct perf_sample_data sample;
7626         int ret;
7627
7628         struct {
7629                 struct perf_event_header        header;
7630                 u64                             lost;
7631         } lost_samples_event = {
7632                 .header = {
7633                         .type = PERF_RECORD_LOST_SAMPLES,
7634                         .misc = 0,
7635                         .size = sizeof(lost_samples_event),
7636                 },
7637                 .lost           = lost,
7638         };
7639
7640         perf_event_header__init_id(&lost_samples_event.header, &sample, event);
7641
7642         ret = perf_output_begin(&handle, event,
7643                                 lost_samples_event.header.size);
7644         if (ret)
7645                 return;
7646
7647         perf_output_put(&handle, lost_samples_event);
7648         perf_event__output_id_sample(event, &handle, &sample);
7649         perf_output_end(&handle);
7650 }
7651
7652 /*
7653  * context_switch tracking
7654  */
7655
7656 struct perf_switch_event {
7657         struct task_struct      *task;
7658         struct task_struct      *next_prev;
7659
7660         struct {
7661                 struct perf_event_header        header;
7662                 u32                             next_prev_pid;
7663                 u32                             next_prev_tid;
7664         } event_id;
7665 };
7666
7667 static int perf_event_switch_match(struct perf_event *event)
7668 {
7669         return event->attr.context_switch;
7670 }
7671
7672 static void perf_event_switch_output(struct perf_event *event, void *data)
7673 {
7674         struct perf_switch_event *se = data;
7675         struct perf_output_handle handle;
7676         struct perf_sample_data sample;
7677         int ret;
7678
7679         if (!perf_event_switch_match(event))
7680                 return;
7681
7682         /* Only CPU-wide events are allowed to see next/prev pid/tid */
7683         if (event->ctx->task) {
7684                 se->event_id.header.type = PERF_RECORD_SWITCH;
7685                 se->event_id.header.size = sizeof(se->event_id.header);
7686         } else {
7687                 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
7688                 se->event_id.header.size = sizeof(se->event_id);
7689                 se->event_id.next_prev_pid =
7690                                         perf_event_pid(event, se->next_prev);
7691                 se->event_id.next_prev_tid =
7692                                         perf_event_tid(event, se->next_prev);
7693         }
7694
7695         perf_event_header__init_id(&se->event_id.header, &sample, event);
7696
7697         ret = perf_output_begin(&handle, event, se->event_id.header.size);
7698         if (ret)
7699                 return;
7700
7701         if (event->ctx->task)
7702                 perf_output_put(&handle, se->event_id.header);
7703         else
7704                 perf_output_put(&handle, se->event_id);
7705
7706         perf_event__output_id_sample(event, &handle, &sample);
7707
7708         perf_output_end(&handle);
7709 }
7710
7711 static void perf_event_switch(struct task_struct *task,
7712                               struct task_struct *next_prev, bool sched_in)
7713 {
7714         struct perf_switch_event switch_event;
7715
7716         /* N.B. caller checks nr_switch_events != 0 */
7717
7718         switch_event = (struct perf_switch_event){
7719                 .task           = task,
7720                 .next_prev      = next_prev,
7721                 .event_id       = {
7722                         .header = {
7723                                 /* .type */
7724                                 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
7725                                 /* .size */
7726                         },
7727                         /* .next_prev_pid */
7728                         /* .next_prev_tid */
7729                 },
7730         };
7731
7732         if (!sched_in && task->state == TASK_RUNNING)
7733                 switch_event.event_id.header.misc |=
7734                                 PERF_RECORD_MISC_SWITCH_OUT_PREEMPT;
7735
7736         perf_iterate_sb(perf_event_switch_output,
7737                        &switch_event,
7738                        NULL);
7739 }
7740
7741 /*
7742  * IRQ throttle logging
7743  */
7744
7745 static void perf_log_throttle(struct perf_event *event, int enable)
7746 {
7747         struct perf_output_handle handle;
7748         struct perf_sample_data sample;
7749         int ret;
7750
7751         struct {
7752                 struct perf_event_header        header;
7753                 u64                             time;
7754                 u64                             id;
7755                 u64                             stream_id;
7756         } throttle_event = {
7757                 .header = {
7758                         .type = PERF_RECORD_THROTTLE,
7759                         .misc = 0,
7760                         .size = sizeof(throttle_event),
7761                 },
7762                 .time           = perf_event_clock(event),
7763                 .id             = primary_event_id(event),
7764                 .stream_id      = event->id,
7765         };
7766
7767         if (enable)
7768                 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
7769
7770         perf_event_header__init_id(&throttle_event.header, &sample, event);
7771
7772         ret = perf_output_begin(&handle, event,
7773                                 throttle_event.header.size);
7774         if (ret)
7775                 return;
7776
7777         perf_output_put(&handle, throttle_event);
7778         perf_event__output_id_sample(event, &handle, &sample);
7779         perf_output_end(&handle);
7780 }
7781
7782 void perf_event_itrace_started(struct perf_event *event)
7783 {
7784         event->attach_state |= PERF_ATTACH_ITRACE;
7785 }
7786
7787 static void perf_log_itrace_start(struct perf_event *event)
7788 {
7789         struct perf_output_handle handle;
7790         struct perf_sample_data sample;
7791         struct perf_aux_event {
7792                 struct perf_event_header        header;
7793                 u32                             pid;
7794                 u32                             tid;
7795         } rec;
7796         int ret;
7797
7798         if (event->parent)
7799                 event = event->parent;
7800
7801         if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
7802             event->attach_state & PERF_ATTACH_ITRACE)
7803                 return;
7804
7805         rec.header.type = PERF_RECORD_ITRACE_START;
7806         rec.header.misc = 0;
7807         rec.header.size = sizeof(rec);
7808         rec.pid = perf_event_pid(event, current);
7809         rec.tid = perf_event_tid(event, current);
7810
7811         perf_event_header__init_id(&rec.header, &sample, event);
7812         ret = perf_output_begin(&handle, event, rec.header.size);
7813
7814         if (ret)
7815                 return;
7816
7817         perf_output_put(&handle, rec);
7818         perf_event__output_id_sample(event, &handle, &sample);
7819
7820         perf_output_end(&handle);
7821 }
7822
7823 static int
7824 __perf_event_account_interrupt(struct perf_event *event, int throttle)
7825 {
7826         struct hw_perf_event *hwc = &event->hw;
7827         int ret = 0;
7828         u64 seq;
7829
7830         seq = __this_cpu_read(perf_throttled_seq);
7831         if (seq != hwc->interrupts_seq) {
7832                 hwc->interrupts_seq = seq;
7833                 hwc->interrupts = 1;
7834         } else {
7835                 hwc->interrupts++;
7836                 if (unlikely(throttle
7837                              && hwc->interrupts >= max_samples_per_tick)) {
7838                         __this_cpu_inc(perf_throttled_count);
7839                         tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
7840                         hwc->interrupts = MAX_INTERRUPTS;
7841                         perf_log_throttle(event, 0);
7842                         ret = 1;
7843                 }
7844         }
7845
7846         if (event->attr.freq) {
7847                 u64 now = perf_clock();
7848                 s64 delta = now - hwc->freq_time_stamp;
7849
7850                 hwc->freq_time_stamp = now;
7851
7852                 if (delta > 0 && delta < 2*TICK_NSEC)
7853                         perf_adjust_period(event, delta, hwc->last_period, true);
7854         }
7855
7856         return ret;
7857 }
7858
7859 int perf_event_account_interrupt(struct perf_event *event)
7860 {
7861         return __perf_event_account_interrupt(event, 1);
7862 }
7863
7864 /*
7865  * Generic event overflow handling, sampling.
7866  */
7867
7868 static int __perf_event_overflow(struct perf_event *event,
7869                                    int throttle, struct perf_sample_data *data,
7870                                    struct pt_regs *regs)
7871 {
7872         int events = atomic_read(&event->event_limit);
7873         int ret = 0;
7874
7875         /*
7876          * Non-sampling counters might still use the PMI to fold short
7877          * hardware counters, ignore those.
7878          */
7879         if (unlikely(!is_sampling_event(event)))
7880                 return 0;
7881
7882         ret = __perf_event_account_interrupt(event, throttle);
7883
7884         /*
7885          * XXX event_limit might not quite work as expected on inherited
7886          * events
7887          */
7888
7889         event->pending_kill = POLL_IN;
7890         if (events && atomic_dec_and_test(&event->event_limit)) {
7891                 ret = 1;
7892                 event->pending_kill = POLL_HUP;
7893
7894                 perf_event_disable_inatomic(event);
7895         }
7896
7897         READ_ONCE(event->overflow_handler)(event, data, regs);
7898
7899         if (*perf_event_fasync(event) && event->pending_kill) {
7900                 event->pending_wakeup = 1;
7901                 irq_work_queue(&event->pending);
7902         }
7903
7904         return ret;
7905 }
7906
7907 int perf_event_overflow(struct perf_event *event,
7908                           struct perf_sample_data *data,
7909                           struct pt_regs *regs)
7910 {
7911         return __perf_event_overflow(event, 1, data, regs);
7912 }
7913
7914 /*
7915  * Generic software event infrastructure
7916  */
7917
7918 struct swevent_htable {
7919         struct swevent_hlist            *swevent_hlist;
7920         struct mutex                    hlist_mutex;
7921         int                             hlist_refcount;
7922
7923         /* Recursion avoidance in each contexts */
7924         int                             recursion[PERF_NR_CONTEXTS];
7925 };
7926
7927 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
7928
7929 /*
7930  * We directly increment event->count and keep a second value in
7931  * event->hw.period_left to count intervals. This period event
7932  * is kept in the range [-sample_period, 0] so that we can use the
7933  * sign as trigger.
7934  */
7935
7936 u64 perf_swevent_set_period(struct perf_event *event)
7937 {
7938         struct hw_perf_event *hwc = &event->hw;
7939         u64 period = hwc->last_period;
7940         u64 nr, offset;
7941         s64 old, val;
7942
7943         hwc->last_period = hwc->sample_period;
7944
7945 again:
7946         old = val = local64_read(&hwc->period_left);
7947         if (val < 0)
7948                 return 0;
7949
7950         nr = div64_u64(period + val, period);
7951         offset = nr * period;
7952         val -= offset;
7953         if (local64_cmpxchg(&hwc->period_left, old, val) != old)
7954                 goto again;
7955
7956         return nr;
7957 }
7958
7959 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
7960                                     struct perf_sample_data *data,
7961                                     struct pt_regs *regs)
7962 {
7963         struct hw_perf_event *hwc = &event->hw;
7964         int throttle = 0;
7965
7966         if (!overflow)
7967                 overflow = perf_swevent_set_period(event);
7968
7969         if (hwc->interrupts == MAX_INTERRUPTS)
7970                 return;
7971
7972         for (; overflow; overflow--) {
7973                 if (__perf_event_overflow(event, throttle,
7974                                             data, regs)) {
7975                         /*
7976                          * We inhibit the overflow from happening when
7977                          * hwc->interrupts == MAX_INTERRUPTS.
7978                          */
7979                         break;
7980                 }
7981                 throttle = 1;
7982         }
7983 }
7984
7985 static void perf_swevent_event(struct perf_event *event, u64 nr,
7986                                struct perf_sample_data *data,
7987                                struct pt_regs *regs)
7988 {
7989         struct hw_perf_event *hwc = &event->hw;
7990
7991         local64_add(nr, &event->count);
7992
7993         if (!regs)
7994                 return;
7995
7996         if (!is_sampling_event(event))
7997                 return;
7998
7999         if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
8000                 data->period = nr;
8001                 return perf_swevent_overflow(event, 1, data, regs);
8002         } else
8003                 data->period = event->hw.last_period;
8004
8005         if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
8006                 return perf_swevent_overflow(event, 1, data, regs);
8007
8008         if (local64_add_negative(nr, &hwc->period_left))
8009                 return;
8010
8011         perf_swevent_overflow(event, 0, data, regs);
8012 }
8013
8014 static int perf_exclude_event(struct perf_event *event,
8015                               struct pt_regs *regs)
8016 {
8017         if (event->hw.state & PERF_HES_STOPPED)
8018                 return 1;
8019
8020         if (regs) {
8021                 if (event->attr.exclude_user && user_mode(regs))
8022                         return 1;
8023
8024                 if (event->attr.exclude_kernel && !user_mode(regs))
8025                         return 1;
8026         }
8027
8028         return 0;
8029 }
8030
8031 static int perf_swevent_match(struct perf_event *event,
8032                                 enum perf_type_id type,
8033                                 u32 event_id,
8034                                 struct perf_sample_data *data,
8035                                 struct pt_regs *regs)
8036 {
8037         if (event->attr.type != type)
8038                 return 0;
8039
8040         if (event->attr.config != event_id)
8041                 return 0;
8042
8043         if (perf_exclude_event(event, regs))
8044                 return 0;
8045
8046         return 1;
8047 }
8048
8049 static inline u64 swevent_hash(u64 type, u32 event_id)
8050 {
8051         u64 val = event_id | (type << 32);
8052
8053         return hash_64(val, SWEVENT_HLIST_BITS);
8054 }
8055
8056 static inline struct hlist_head *
8057 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
8058 {
8059         u64 hash = swevent_hash(type, event_id);
8060
8061         return &hlist->heads[hash];
8062 }
8063
8064 /* For the read side: events when they trigger */
8065 static inline struct hlist_head *
8066 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
8067 {
8068         struct swevent_hlist *hlist;
8069
8070         hlist = rcu_dereference(swhash->swevent_hlist);
8071         if (!hlist)
8072                 return NULL;
8073
8074         return __find_swevent_head(hlist, type, event_id);
8075 }
8076
8077 /* For the event head insertion and removal in the hlist */
8078 static inline struct hlist_head *
8079 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
8080 {
8081         struct swevent_hlist *hlist;
8082         u32 event_id = event->attr.config;
8083         u64 type = event->attr.type;
8084
8085         /*
8086          * Event scheduling is always serialized against hlist allocation
8087          * and release. Which makes the protected version suitable here.
8088          * The context lock guarantees that.
8089          */
8090         hlist = rcu_dereference_protected(swhash->swevent_hlist,
8091                                           lockdep_is_held(&event->ctx->lock));
8092         if (!hlist)
8093                 return NULL;
8094
8095         return __find_swevent_head(hlist, type, event_id);
8096 }
8097
8098 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
8099                                     u64 nr,
8100                                     struct perf_sample_data *data,
8101                                     struct pt_regs *regs)
8102 {
8103         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8104         struct perf_event *event;
8105         struct hlist_head *head;
8106
8107         rcu_read_lock();
8108         head = find_swevent_head_rcu(swhash, type, event_id);
8109         if (!head)
8110                 goto end;
8111
8112         hlist_for_each_entry_rcu(event, head, hlist_entry) {
8113                 if (perf_swevent_match(event, type, event_id, data, regs))
8114                         perf_swevent_event(event, nr, data, regs);
8115         }
8116 end:
8117         rcu_read_unlock();
8118 }
8119
8120 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
8121
8122 int perf_swevent_get_recursion_context(void)
8123 {
8124         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8125
8126         return get_recursion_context(swhash->recursion);
8127 }
8128 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
8129
8130 void perf_swevent_put_recursion_context(int rctx)
8131 {
8132         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8133
8134         put_recursion_context(swhash->recursion, rctx);
8135 }
8136
8137 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
8138 {
8139         struct perf_sample_data data;
8140
8141         if (WARN_ON_ONCE(!regs))
8142                 return;
8143
8144         perf_sample_data_init(&data, addr, 0);
8145         do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
8146 }
8147
8148 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
8149 {
8150         int rctx;
8151
8152         preempt_disable_notrace();
8153         rctx = perf_swevent_get_recursion_context();
8154         if (unlikely(rctx < 0))
8155                 goto fail;
8156
8157         ___perf_sw_event(event_id, nr, regs, addr);
8158
8159         perf_swevent_put_recursion_context(rctx);
8160 fail:
8161         preempt_enable_notrace();
8162 }
8163
8164 static void perf_swevent_read(struct perf_event *event)
8165 {
8166 }
8167
8168 static int perf_swevent_add(struct perf_event *event, int flags)
8169 {
8170         struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
8171         struct hw_perf_event *hwc = &event->hw;
8172         struct hlist_head *head;
8173
8174         if (is_sampling_event(event)) {
8175                 hwc->last_period = hwc->sample_period;
8176                 perf_swevent_set_period(event);
8177         }
8178
8179         hwc->state = !(flags & PERF_EF_START);
8180
8181         head = find_swevent_head(swhash, event);
8182         if (WARN_ON_ONCE(!head))
8183                 return -EINVAL;
8184
8185         hlist_add_head_rcu(&event->hlist_entry, head);
8186         perf_event_update_userpage(event);
8187
8188         return 0;
8189 }
8190
8191 static void perf_swevent_del(struct perf_event *event, int flags)
8192 {
8193         hlist_del_rcu(&event->hlist_entry);
8194 }
8195
8196 static void perf_swevent_start(struct perf_event *event, int flags)
8197 {
8198         event->hw.state = 0;
8199 }
8200
8201 static void perf_swevent_stop(struct perf_event *event, int flags)
8202 {
8203         event->hw.state = PERF_HES_STOPPED;
8204 }
8205
8206 /* Deref the hlist from the update side */
8207 static inline struct swevent_hlist *
8208 swevent_hlist_deref(struct swevent_htable *swhash)
8209 {
8210         return rcu_dereference_protected(swhash->swevent_hlist,
8211                                          lockdep_is_held(&swhash->hlist_mutex));
8212 }
8213
8214 static void swevent_hlist_release(struct swevent_htable *swhash)
8215 {
8216         struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
8217
8218         if (!hlist)
8219                 return;
8220
8221         RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
8222         kfree_rcu(hlist, rcu_head);
8223 }
8224
8225 static void swevent_hlist_put_cpu(int cpu)
8226 {
8227         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8228
8229         mutex_lock(&swhash->hlist_mutex);
8230
8231         if (!--swhash->hlist_refcount)
8232                 swevent_hlist_release(swhash);
8233
8234         mutex_unlock(&swhash->hlist_mutex);
8235 }
8236
8237 static void swevent_hlist_put(void)
8238 {
8239         int cpu;
8240
8241         for_each_possible_cpu(cpu)
8242                 swevent_hlist_put_cpu(cpu);
8243 }
8244
8245 static int swevent_hlist_get_cpu(int cpu)
8246 {
8247         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8248         int err = 0;
8249
8250         mutex_lock(&swhash->hlist_mutex);
8251         if (!swevent_hlist_deref(swhash) &&
8252             cpumask_test_cpu(cpu, perf_online_mask)) {
8253                 struct swevent_hlist *hlist;
8254
8255                 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
8256                 if (!hlist) {
8257                         err = -ENOMEM;
8258                         goto exit;
8259                 }
8260                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
8261         }
8262         swhash->hlist_refcount++;
8263 exit:
8264         mutex_unlock(&swhash->hlist_mutex);
8265
8266         return err;
8267 }
8268
8269 static int swevent_hlist_get(void)
8270 {
8271         int err, cpu, failed_cpu;
8272
8273         mutex_lock(&pmus_lock);
8274         for_each_possible_cpu(cpu) {
8275                 err = swevent_hlist_get_cpu(cpu);
8276                 if (err) {
8277                         failed_cpu = cpu;
8278                         goto fail;
8279                 }
8280         }
8281         mutex_unlock(&pmus_lock);
8282         return 0;
8283 fail:
8284         for_each_possible_cpu(cpu) {
8285                 if (cpu == failed_cpu)
8286                         break;
8287                 swevent_hlist_put_cpu(cpu);
8288         }
8289         mutex_unlock(&pmus_lock);
8290         return err;
8291 }
8292
8293 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
8294
8295 static void sw_perf_event_destroy(struct perf_event *event)
8296 {
8297         u64 event_id = event->attr.config;
8298
8299         WARN_ON(event->parent);
8300
8301         static_key_slow_dec(&perf_swevent_enabled[event_id]);
8302         swevent_hlist_put();
8303 }
8304
8305 static int perf_swevent_init(struct perf_event *event)
8306 {
8307         u64 event_id = event->attr.config;
8308
8309         if (event->attr.type != PERF_TYPE_SOFTWARE)
8310                 return -ENOENT;
8311
8312         /*
8313          * no branch sampling for software events
8314          */
8315         if (has_branch_stack(event))
8316                 return -EOPNOTSUPP;
8317
8318         switch (event_id) {
8319         case PERF_COUNT_SW_CPU_CLOCK:
8320         case PERF_COUNT_SW_TASK_CLOCK:
8321                 return -ENOENT;
8322
8323         default:
8324                 break;
8325         }
8326
8327         if (event_id >= PERF_COUNT_SW_MAX)
8328                 return -ENOENT;
8329
8330         if (!event->parent) {
8331                 int err;
8332
8333                 err = swevent_hlist_get();
8334                 if (err)
8335                         return err;
8336
8337                 static_key_slow_inc(&perf_swevent_enabled[event_id]);
8338                 event->destroy = sw_perf_event_destroy;
8339         }
8340
8341         return 0;
8342 }
8343
8344 static struct pmu perf_swevent = {
8345         .task_ctx_nr    = perf_sw_context,
8346
8347         .capabilities   = PERF_PMU_CAP_NO_NMI,
8348
8349         .event_init     = perf_swevent_init,
8350         .add            = perf_swevent_add,
8351         .del            = perf_swevent_del,
8352         .start          = perf_swevent_start,
8353         .stop           = perf_swevent_stop,
8354         .read           = perf_swevent_read,
8355 };
8356
8357 #ifdef CONFIG_EVENT_TRACING
8358
8359 static int perf_tp_filter_match(struct perf_event *event,
8360                                 struct perf_sample_data *data)
8361 {
8362         void *record = data->raw->frag.data;
8363
8364         /* only top level events have filters set */
8365         if (event->parent)
8366                 event = event->parent;
8367
8368         if (likely(!event->filter) || filter_match_preds(event->filter, record))
8369                 return 1;
8370         return 0;
8371 }
8372
8373 static int perf_tp_event_match(struct perf_event *event,
8374                                 struct perf_sample_data *data,
8375                                 struct pt_regs *regs)
8376 {
8377         if (event->hw.state & PERF_HES_STOPPED)
8378                 return 0;
8379         /*
8380          * All tracepoints are from kernel-space.
8381          */
8382         if (event->attr.exclude_kernel)
8383                 return 0;
8384
8385         if (!perf_tp_filter_match(event, data))
8386                 return 0;
8387
8388         return 1;
8389 }
8390
8391 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
8392                                struct trace_event_call *call, u64 count,
8393                                struct pt_regs *regs, struct hlist_head *head,
8394                                struct task_struct *task)
8395 {
8396         if (bpf_prog_array_valid(call)) {
8397                 *(struct pt_regs **)raw_data = regs;
8398                 if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
8399                         perf_swevent_put_recursion_context(rctx);
8400                         return;
8401                 }
8402         }
8403         perf_tp_event(call->event.type, count, raw_data, size, regs, head,
8404                       rctx, task);
8405 }
8406 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
8407
8408 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
8409                    struct pt_regs *regs, struct hlist_head *head, int rctx,
8410                    struct task_struct *task)
8411 {
8412         struct perf_sample_data data;
8413         struct perf_event *event;
8414
8415         struct perf_raw_record raw = {
8416                 .frag = {
8417                         .size = entry_size,
8418                         .data = record,
8419                 },
8420         };
8421
8422         perf_sample_data_init(&data, 0, 0);
8423         data.raw = &raw;
8424
8425         perf_trace_buf_update(record, event_type);
8426
8427         hlist_for_each_entry_rcu(event, head, hlist_entry) {
8428                 if (perf_tp_event_match(event, &data, regs))
8429                         perf_swevent_event(event, count, &data, regs);
8430         }
8431
8432         /*
8433          * If we got specified a target task, also iterate its context and
8434          * deliver this event there too.
8435          */
8436         if (task && task != current) {
8437                 struct perf_event_context *ctx;
8438                 struct trace_entry *entry = record;
8439
8440                 rcu_read_lock();
8441                 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
8442                 if (!ctx)
8443                         goto unlock;
8444
8445                 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
8446                         if (event->cpu != smp_processor_id())
8447                                 continue;
8448                         if (event->attr.type != PERF_TYPE_TRACEPOINT)
8449                                 continue;
8450                         if (event->attr.config != entry->type)
8451                                 continue;
8452                         if (perf_tp_event_match(event, &data, regs))
8453                                 perf_swevent_event(event, count, &data, regs);
8454                 }
8455 unlock:
8456                 rcu_read_unlock();
8457         }
8458
8459         perf_swevent_put_recursion_context(rctx);
8460 }
8461 EXPORT_SYMBOL_GPL(perf_tp_event);
8462
8463 static void tp_perf_event_destroy(struct perf_event *event)
8464 {
8465         perf_trace_destroy(event);
8466 }
8467
8468 static int perf_tp_event_init(struct perf_event *event)
8469 {
8470         int err;
8471
8472         if (event->attr.type != PERF_TYPE_TRACEPOINT)
8473                 return -ENOENT;
8474
8475         /*
8476          * no branch sampling for tracepoint events
8477          */
8478         if (has_branch_stack(event))
8479                 return -EOPNOTSUPP;
8480
8481         err = perf_trace_init(event);
8482         if (err)
8483                 return err;
8484
8485         event->destroy = tp_perf_event_destroy;
8486
8487         return 0;
8488 }
8489
8490 static struct pmu perf_tracepoint = {
8491         .task_ctx_nr    = perf_sw_context,
8492
8493         .event_init     = perf_tp_event_init,
8494         .add            = perf_trace_add,
8495         .del            = perf_trace_del,
8496         .start          = perf_swevent_start,
8497         .stop           = perf_swevent_stop,
8498         .read           = perf_swevent_read,
8499 };
8500
8501 #if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
8502 /*
8503  * Flags in config, used by dynamic PMU kprobe and uprobe
8504  * The flags should match following PMU_FORMAT_ATTR().
8505  *
8506  * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
8507  *                               if not set, create kprobe/uprobe
8508  */
8509 enum perf_probe_config {
8510         PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
8511 };
8512
8513 PMU_FORMAT_ATTR(retprobe, "config:0");
8514
8515 static struct attribute *probe_attrs[] = {
8516         &format_attr_retprobe.attr,
8517         NULL,
8518 };
8519
8520 static struct attribute_group probe_format_group = {
8521         .name = "format",
8522         .attrs = probe_attrs,
8523 };
8524
8525 static const struct attribute_group *probe_attr_groups[] = {
8526         &probe_format_group,
8527         NULL,
8528 };
8529 #endif
8530
8531 #ifdef CONFIG_KPROBE_EVENTS
8532 static int perf_kprobe_event_init(struct perf_event *event);
8533 static struct pmu perf_kprobe = {
8534         .task_ctx_nr    = perf_sw_context,
8535         .event_init     = perf_kprobe_event_init,
8536         .add            = perf_trace_add,
8537         .del            = perf_trace_del,
8538         .start          = perf_swevent_start,
8539         .stop           = perf_swevent_stop,
8540         .read           = perf_swevent_read,
8541         .attr_groups    = probe_attr_groups,
8542 };
8543
8544 static int perf_kprobe_event_init(struct perf_event *event)
8545 {
8546         int err;
8547         bool is_retprobe;
8548
8549         if (event->attr.type != perf_kprobe.type)
8550                 return -ENOENT;
8551
8552         if (!capable(CAP_SYS_ADMIN))
8553                 return -EACCES;
8554
8555         /*
8556          * no branch sampling for probe events
8557          */
8558         if (has_branch_stack(event))
8559                 return -EOPNOTSUPP;
8560
8561         is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8562         err = perf_kprobe_init(event, is_retprobe);
8563         if (err)
8564                 return err;
8565
8566         event->destroy = perf_kprobe_destroy;
8567
8568         return 0;
8569 }
8570 #endif /* CONFIG_KPROBE_EVENTS */
8571
8572 #ifdef CONFIG_UPROBE_EVENTS
8573 static int perf_uprobe_event_init(struct perf_event *event);
8574 static struct pmu perf_uprobe = {
8575         .task_ctx_nr    = perf_sw_context,
8576         .event_init     = perf_uprobe_event_init,
8577         .add            = perf_trace_add,
8578         .del            = perf_trace_del,
8579         .start          = perf_swevent_start,
8580         .stop           = perf_swevent_stop,
8581         .read           = perf_swevent_read,
8582         .attr_groups    = probe_attr_groups,
8583 };
8584
8585 static int perf_uprobe_event_init(struct perf_event *event)
8586 {
8587         int err;
8588         bool is_retprobe;
8589
8590         if (event->attr.type != perf_uprobe.type)
8591                 return -ENOENT;
8592
8593         if (!capable(CAP_SYS_ADMIN))
8594                 return -EACCES;
8595
8596         /*
8597          * no branch sampling for probe events
8598          */
8599         if (has_branch_stack(event))
8600                 return -EOPNOTSUPP;
8601
8602         is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
8603         err = perf_uprobe_init(event, is_retprobe);
8604         if (err)
8605                 return err;
8606
8607         event->destroy = perf_uprobe_destroy;
8608
8609         return 0;
8610 }
8611 #endif /* CONFIG_UPROBE_EVENTS */
8612
8613 static inline void perf_tp_register(void)
8614 {
8615         perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
8616 #ifdef CONFIG_KPROBE_EVENTS
8617         perf_pmu_register(&perf_kprobe, "kprobe", -1);
8618 #endif
8619 #ifdef CONFIG_UPROBE_EVENTS
8620         perf_pmu_register(&perf_uprobe, "uprobe", -1);
8621 #endif
8622 }
8623
8624 static void perf_event_free_filter(struct perf_event *event)
8625 {
8626         ftrace_profile_free_filter(event);
8627 }
8628
8629 #ifdef CONFIG_BPF_SYSCALL
8630 static void bpf_overflow_handler(struct perf_event *event,
8631                                  struct perf_sample_data *data,
8632                                  struct pt_regs *regs)
8633 {
8634         struct bpf_perf_event_data_kern ctx = {
8635                 .data = data,
8636                 .event = event,
8637         };
8638         int ret = 0;
8639
8640         ctx.regs = perf_arch_bpf_user_pt_regs(regs);
8641         preempt_disable();
8642         if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
8643                 goto out;
8644         rcu_read_lock();
8645         ret = BPF_PROG_RUN(event->prog, &ctx);
8646         rcu_read_unlock();
8647 out:
8648         __this_cpu_dec(bpf_prog_active);
8649         preempt_enable();
8650         if (!ret)
8651                 return;
8652
8653         event->orig_overflow_handler(event, data, regs);
8654 }
8655
8656 static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8657 {
8658         struct bpf_prog *prog;
8659
8660         if (event->overflow_handler_context)
8661                 /* hw breakpoint or kernel counter */
8662                 return -EINVAL;
8663
8664         if (event->prog)
8665                 return -EEXIST;
8666
8667         prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
8668         if (IS_ERR(prog))
8669                 return PTR_ERR(prog);
8670
8671         event->prog = prog;
8672         event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
8673         WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
8674         return 0;
8675 }
8676
8677 static void perf_event_free_bpf_handler(struct perf_event *event)
8678 {
8679         struct bpf_prog *prog = event->prog;
8680
8681         if (!prog)
8682                 return;
8683
8684         WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
8685         event->prog = NULL;
8686         bpf_prog_put(prog);
8687 }
8688 #else
8689 static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
8690 {
8691         return -EOPNOTSUPP;
8692 }
8693 static void perf_event_free_bpf_handler(struct perf_event *event)
8694 {
8695 }
8696 #endif
8697
8698 /*
8699  * returns true if the event is a tracepoint, or a kprobe/upprobe created
8700  * with perf_event_open()
8701  */
8702 static inline bool perf_event_is_tracing(struct perf_event *event)
8703 {
8704         if (event->pmu == &perf_tracepoint)
8705                 return true;
8706 #ifdef CONFIG_KPROBE_EVENTS
8707         if (event->pmu == &perf_kprobe)
8708                 return true;
8709 #endif
8710 #ifdef CONFIG_UPROBE_EVENTS
8711         if (event->pmu == &perf_uprobe)
8712                 return true;
8713 #endif
8714         return false;
8715 }
8716
8717 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8718 {
8719         bool is_kprobe, is_tracepoint, is_syscall_tp;
8720         struct bpf_prog *prog;
8721         int ret;
8722
8723         if (!perf_event_is_tracing(event))
8724                 return perf_event_set_bpf_handler(event, prog_fd);
8725
8726         is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
8727         is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
8728         is_syscall_tp = is_syscall_trace_event(event->tp_event);
8729         if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
8730                 /* bpf programs can only be attached to u/kprobe or tracepoint */
8731                 return -EINVAL;
8732
8733         prog = bpf_prog_get(prog_fd);
8734         if (IS_ERR(prog))
8735                 return PTR_ERR(prog);
8736
8737         if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
8738             (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
8739             (is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
8740                 /* valid fd, but invalid bpf program type */
8741                 bpf_prog_put(prog);
8742                 return -EINVAL;
8743         }
8744
8745         /* Kprobe override only works for kprobes, not uprobes. */
8746         if (prog->kprobe_override &&
8747             !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) {
8748                 bpf_prog_put(prog);
8749                 return -EINVAL;
8750         }
8751
8752         if (is_tracepoint || is_syscall_tp) {
8753                 int off = trace_event_get_offsets(event->tp_event);
8754
8755                 if (prog->aux->max_ctx_offset > off) {
8756                         bpf_prog_put(prog);
8757                         return -EACCES;
8758                 }
8759         }
8760
8761         ret = perf_event_attach_bpf_prog(event, prog);
8762         if (ret)
8763                 bpf_prog_put(prog);
8764         return ret;
8765 }
8766
8767 static void perf_event_free_bpf_prog(struct perf_event *event)
8768 {
8769         if (!perf_event_is_tracing(event)) {
8770                 perf_event_free_bpf_handler(event);
8771                 return;
8772         }
8773         perf_event_detach_bpf_prog(event);
8774 }
8775
8776 #else
8777
8778 static inline void perf_tp_register(void)
8779 {
8780 }
8781
8782 static void perf_event_free_filter(struct perf_event *event)
8783 {
8784 }
8785
8786 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
8787 {
8788         return -ENOENT;
8789 }
8790
8791 static void perf_event_free_bpf_prog(struct perf_event *event)
8792 {
8793 }
8794 #endif /* CONFIG_EVENT_TRACING */
8795
8796 #ifdef CONFIG_HAVE_HW_BREAKPOINT
8797 void perf_bp_event(struct perf_event *bp, void *data)
8798 {
8799         struct perf_sample_data sample;
8800         struct pt_regs *regs = data;
8801
8802         perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
8803
8804         if (!bp->hw.state && !perf_exclude_event(bp, regs))
8805                 perf_swevent_event(bp, 1, &sample, regs);
8806 }
8807 #endif
8808
8809 /*
8810  * Allocate a new address filter
8811  */
8812 static struct perf_addr_filter *
8813 perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
8814 {
8815         int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
8816         struct perf_addr_filter *filter;
8817
8818         filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
8819         if (!filter)
8820                 return NULL;
8821
8822         INIT_LIST_HEAD(&filter->entry);
8823         list_add_tail(&filter->entry, filters);
8824
8825         return filter;
8826 }
8827
8828 static void free_filters_list(struct list_head *filters)
8829 {
8830         struct perf_addr_filter *filter, *iter;
8831
8832         list_for_each_entry_safe(filter, iter, filters, entry) {
8833                 path_put(&filter->path);
8834                 list_del(&filter->entry);
8835                 kfree(filter);
8836         }
8837 }
8838
8839 /*
8840  * Free existing address filters and optionally install new ones
8841  */
8842 static void perf_addr_filters_splice(struct perf_event *event,
8843                                      struct list_head *head)
8844 {
8845         unsigned long flags;
8846         LIST_HEAD(list);
8847
8848         if (!has_addr_filter(event))
8849                 return;
8850
8851         /* don't bother with children, they don't have their own filters */
8852         if (event->parent)
8853                 return;
8854
8855         raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
8856
8857         list_splice_init(&event->addr_filters.list, &list);
8858         if (head)
8859                 list_splice(head, &event->addr_filters.list);
8860
8861         raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
8862
8863         free_filters_list(&list);
8864 }
8865
8866 /*
8867  * Scan through mm's vmas and see if one of them matches the
8868  * @filter; if so, adjust filter's address range.
8869  * Called with mm::mmap_sem down for reading.
8870  */
8871 static void perf_addr_filter_apply(struct perf_addr_filter *filter,
8872                                    struct mm_struct *mm,
8873                                    struct perf_addr_filter_range *fr)
8874 {
8875         struct vm_area_struct *vma;
8876
8877         for (vma = mm->mmap; vma; vma = vma->vm_next) {
8878                 if (!vma->vm_file)
8879                         continue;
8880
8881                 if (perf_addr_filter_vma_adjust(filter, vma, fr))
8882                         return;
8883         }
8884 }
8885
8886 /*
8887  * Update event's address range filters based on the
8888  * task's existing mappings, if any.
8889  */
8890 static void perf_event_addr_filters_apply(struct perf_event *event)
8891 {
8892         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8893         struct task_struct *task = READ_ONCE(event->ctx->task);
8894         struct perf_addr_filter *filter;
8895         struct mm_struct *mm = NULL;
8896         unsigned int count = 0;
8897         unsigned long flags;
8898
8899         /*
8900          * We may observe TASK_TOMBSTONE, which means that the event tear-down
8901          * will stop on the parent's child_mutex that our caller is also holding
8902          */
8903         if (task == TASK_TOMBSTONE)
8904                 return;
8905
8906         if (ifh->nr_file_filters) {
8907                 mm = get_task_mm(event->ctx->task);
8908                 if (!mm)
8909                         goto restart;
8910
8911                 down_read(&mm->mmap_sem);
8912         }
8913
8914         raw_spin_lock_irqsave(&ifh->lock, flags);
8915         list_for_each_entry(filter, &ifh->list, entry) {
8916                 if (filter->path.dentry) {
8917                         /*
8918                          * Adjust base offset if the filter is associated to a
8919                          * binary that needs to be mapped:
8920                          */
8921                         event->addr_filter_ranges[count].start = 0;
8922                         event->addr_filter_ranges[count].size = 0;
8923
8924                         perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
8925                 } else {
8926                         event->addr_filter_ranges[count].start = filter->offset;
8927                         event->addr_filter_ranges[count].size  = filter->size;
8928                 }
8929
8930                 count++;
8931         }
8932
8933         event->addr_filters_gen++;
8934         raw_spin_unlock_irqrestore(&ifh->lock, flags);
8935
8936         if (ifh->nr_file_filters) {
8937                 up_read(&mm->mmap_sem);
8938
8939                 mmput(mm);
8940         }
8941
8942 restart:
8943         perf_event_stop(event, 1);
8944 }
8945
8946 /*
8947  * Address range filtering: limiting the data to certain
8948  * instruction address ranges. Filters are ioctl()ed to us from
8949  * userspace as ascii strings.
8950  *
8951  * Filter string format:
8952  *
8953  * ACTION RANGE_SPEC
8954  * where ACTION is one of the
8955  *  * "filter": limit the trace to this region
8956  *  * "start": start tracing from this address
8957  *  * "stop": stop tracing at this address/region;
8958  * RANGE_SPEC is
8959  *  * for kernel addresses: <start address>[/<size>]
8960  *  * for object files:     <start address>[/<size>]@</path/to/object/file>
8961  *
8962  * if <size> is not specified or is zero, the range is treated as a single
8963  * address; not valid for ACTION=="filter".
8964  */
8965 enum {
8966         IF_ACT_NONE = -1,
8967         IF_ACT_FILTER,
8968         IF_ACT_START,
8969         IF_ACT_STOP,
8970         IF_SRC_FILE,
8971         IF_SRC_KERNEL,
8972         IF_SRC_FILEADDR,
8973         IF_SRC_KERNELADDR,
8974 };
8975
8976 enum {
8977         IF_STATE_ACTION = 0,
8978         IF_STATE_SOURCE,
8979         IF_STATE_END,
8980 };
8981
8982 static const match_table_t if_tokens = {
8983         { IF_ACT_FILTER,        "filter" },
8984         { IF_ACT_START,         "start" },
8985         { IF_ACT_STOP,          "stop" },
8986         { IF_SRC_FILE,          "%u/%u@%s" },
8987         { IF_SRC_KERNEL,        "%u/%u" },
8988         { IF_SRC_FILEADDR,      "%u@%s" },
8989         { IF_SRC_KERNELADDR,    "%u" },
8990         { IF_ACT_NONE,          NULL },
8991 };
8992
8993 /*
8994  * Address filter string parser
8995  */
8996 static int
8997 perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
8998                              struct list_head *filters)
8999 {
9000         struct perf_addr_filter *filter = NULL;
9001         char *start, *orig, *filename = NULL;
9002         substring_t args[MAX_OPT_ARGS];
9003         int state = IF_STATE_ACTION, token;
9004         unsigned int kernel = 0;
9005         int ret = -EINVAL;
9006
9007         orig = fstr = kstrdup(fstr, GFP_KERNEL);
9008         if (!fstr)
9009                 return -ENOMEM;
9010
9011         while ((start = strsep(&fstr, " ,\n")) != NULL) {
9012                 static const enum perf_addr_filter_action_t actions[] = {
9013                         [IF_ACT_FILTER] = PERF_ADDR_FILTER_ACTION_FILTER,
9014                         [IF_ACT_START]  = PERF_ADDR_FILTER_ACTION_START,
9015                         [IF_ACT_STOP]   = PERF_ADDR_FILTER_ACTION_STOP,
9016                 };
9017                 ret = -EINVAL;
9018
9019                 if (!*start)
9020                         continue;
9021
9022                 /* filter definition begins */
9023                 if (state == IF_STATE_ACTION) {
9024                         filter = perf_addr_filter_new(event, filters);
9025                         if (!filter)
9026                                 goto fail;
9027                 }
9028
9029                 token = match_token(start, if_tokens, args);
9030                 switch (token) {
9031                 case IF_ACT_FILTER:
9032                 case IF_ACT_START:
9033                 case IF_ACT_STOP:
9034                         if (state != IF_STATE_ACTION)
9035                                 goto fail;
9036
9037                         filter->action = actions[token];
9038                         state = IF_STATE_SOURCE;
9039                         break;
9040
9041                 case IF_SRC_KERNELADDR:
9042                 case IF_SRC_KERNEL:
9043                         kernel = 1;
9044
9045                 case IF_SRC_FILEADDR:
9046                 case IF_SRC_FILE:
9047                         if (state != IF_STATE_SOURCE)
9048                                 goto fail;
9049
9050                         *args[0].to = 0;
9051                         ret = kstrtoul(args[0].from, 0, &filter->offset);
9052                         if (ret)
9053                                 goto fail;
9054
9055                         if (token == IF_SRC_KERNEL || token == IF_SRC_FILE) {
9056                                 *args[1].to = 0;
9057                                 ret = kstrtoul(args[1].from, 0, &filter->size);
9058                                 if (ret)
9059                                         goto fail;
9060                         }
9061
9062                         if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
9063                                 int fpos = token == IF_SRC_FILE ? 2 : 1;
9064
9065                                 kfree(filename);
9066                                 filename = match_strdup(&args[fpos]);
9067                                 if (!filename) {
9068                                         ret = -ENOMEM;
9069                                         goto fail;
9070                                 }
9071                         }
9072
9073                         state = IF_STATE_END;
9074                         break;
9075
9076                 default:
9077                         goto fail;
9078                 }
9079
9080                 /*
9081                  * Filter definition is fully parsed, validate and install it.
9082                  * Make sure that it doesn't contradict itself or the event's
9083                  * attribute.
9084                  */
9085                 if (state == IF_STATE_END) {
9086                         ret = -EINVAL;
9087                         if (kernel && event->attr.exclude_kernel)
9088                                 goto fail;
9089
9090                         /*
9091                          * ACTION "filter" must have a non-zero length region
9092                          * specified.
9093                          */
9094                         if (filter->action == PERF_ADDR_FILTER_ACTION_FILTER &&
9095                             !filter->size)
9096                                 goto fail;
9097
9098                         if (!kernel) {
9099                                 if (!filename)
9100                                         goto fail;
9101
9102                                 /*
9103                                  * For now, we only support file-based filters
9104                                  * in per-task events; doing so for CPU-wide
9105                                  * events requires additional context switching
9106                                  * trickery, since same object code will be
9107                                  * mapped at different virtual addresses in
9108                                  * different processes.
9109                                  */
9110                                 ret = -EOPNOTSUPP;
9111                                 if (!event->ctx->task)
9112                                         goto fail;
9113
9114                                 /* look up the path and grab its inode */
9115                                 ret = kern_path(filename, LOOKUP_FOLLOW,
9116                                                 &filter->path);
9117                                 if (ret)
9118                                         goto fail;
9119
9120                                 ret = -EINVAL;
9121                                 if (!filter->path.dentry ||
9122                                     !S_ISREG(d_inode(filter->path.dentry)
9123                                              ->i_mode))
9124                                         goto fail;
9125
9126                                 event->addr_filters.nr_file_filters++;
9127                         }
9128
9129                         /* ready to consume more filters */
9130                         state = IF_STATE_ACTION;
9131                         filter = NULL;
9132                 }
9133         }
9134
9135         if (state != IF_STATE_ACTION)
9136                 goto fail;
9137
9138         kfree(filename);
9139         kfree(orig);
9140
9141         return 0;
9142
9143 fail:
9144         kfree(filename);
9145         free_filters_list(filters);
9146         kfree(orig);
9147
9148         return ret;
9149 }
9150
9151 static int
9152 perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
9153 {
9154         LIST_HEAD(filters);
9155         int ret;
9156
9157         /*
9158          * Since this is called in perf_ioctl() path, we're already holding
9159          * ctx::mutex.
9160          */
9161         lockdep_assert_held(&event->ctx->mutex);
9162
9163         if (WARN_ON_ONCE(event->parent))
9164                 return -EINVAL;
9165
9166         ret = perf_event_parse_addr_filter(event, filter_str, &filters);
9167         if (ret)
9168                 goto fail_clear_files;
9169
9170         ret = event->pmu->addr_filters_validate(&filters);
9171         if (ret)
9172                 goto fail_free_filters;
9173
9174         /* remove existing filters, if any */
9175         perf_addr_filters_splice(event, &filters);
9176
9177         /* install new filters */
9178         perf_event_for_each_child(event, perf_event_addr_filters_apply);
9179
9180         return ret;
9181
9182 fail_free_filters:
9183         free_filters_list(&filters);
9184
9185 fail_clear_files:
9186         event->addr_filters.nr_file_filters = 0;
9187
9188         return ret;
9189 }
9190
9191 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
9192 {
9193         int ret = -EINVAL;
9194         char *filter_str;
9195
9196         filter_str = strndup_user(arg, PAGE_SIZE);
9197         if (IS_ERR(filter_str))
9198                 return PTR_ERR(filter_str);
9199
9200 #ifdef CONFIG_EVENT_TRACING
9201         if (perf_event_is_tracing(event)) {
9202                 struct perf_event_context *ctx = event->ctx;
9203
9204                 /*
9205                  * Beware, here be dragons!!
9206                  *
9207                  * the tracepoint muck will deadlock against ctx->mutex, but
9208                  * the tracepoint stuff does not actually need it. So
9209                  * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
9210                  * already have a reference on ctx.
9211                  *
9212                  * This can result in event getting moved to a different ctx,
9213                  * but that does not affect the tracepoint state.
9214                  */
9215                 mutex_unlock(&ctx->mutex);
9216                 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
9217                 mutex_lock(&ctx->mutex);
9218         } else
9219 #endif
9220         if (has_addr_filter(event))
9221                 ret = perf_event_set_addr_filter(event, filter_str);
9222
9223         kfree(filter_str);
9224         return ret;
9225 }
9226
9227 /*
9228  * hrtimer based swevent callback
9229  */
9230
9231 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
9232 {
9233         enum hrtimer_restart ret = HRTIMER_RESTART;
9234         struct perf_sample_data data;
9235         struct pt_regs *regs;
9236         struct perf_event *event;
9237         u64 period;
9238
9239         event = container_of(hrtimer, struct perf_event, hw.hrtimer);
9240
9241         if (event->state != PERF_EVENT_STATE_ACTIVE)
9242                 return HRTIMER_NORESTART;
9243
9244         event->pmu->read(event);
9245
9246         perf_sample_data_init(&data, 0, event->hw.last_period);
9247         regs = get_irq_regs();
9248
9249         if (regs && !perf_exclude_event(event, regs)) {
9250                 if (!(event->attr.exclude_idle && is_idle_task(current)))
9251                         if (__perf_event_overflow(event, 1, &data, regs))
9252                                 ret = HRTIMER_NORESTART;
9253         }
9254
9255         period = max_t(u64, 10000, event->hw.sample_period);
9256         hrtimer_forward_now(hrtimer, ns_to_ktime(period));
9257
9258         return ret;
9259 }
9260
9261 static void perf_swevent_start_hrtimer(struct perf_event *event)
9262 {
9263         struct hw_perf_event *hwc = &event->hw;
9264         s64 period;
9265
9266         if (!is_sampling_event(event))
9267                 return;
9268
9269         period = local64_read(&hwc->period_left);
9270         if (period) {
9271                 if (period < 0)
9272                         period = 10000;
9273
9274                 local64_set(&hwc->period_left, 0);
9275         } else {
9276                 period = max_t(u64, 10000, hwc->sample_period);
9277         }
9278         hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
9279                       HRTIMER_MODE_REL_PINNED);
9280 }
9281
9282 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
9283 {
9284         struct hw_perf_event *hwc = &event->hw;
9285
9286         if (is_sampling_event(event)) {
9287                 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
9288                 local64_set(&hwc->period_left, ktime_to_ns(remaining));
9289
9290                 hrtimer_cancel(&hwc->hrtimer);
9291         }
9292 }
9293
9294 static void perf_swevent_init_hrtimer(struct perf_event *event)
9295 {
9296         struct hw_perf_event *hwc = &event->hw;
9297
9298         if (!is_sampling_event(event))
9299                 return;
9300
9301         hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
9302         hwc->hrtimer.function = perf_swevent_hrtimer;
9303
9304         /*
9305          * Since hrtimers have a fixed rate, we can do a static freq->period
9306          * mapping and avoid the whole period adjust feedback stuff.
9307          */
9308         if (event->attr.freq) {
9309                 long freq = event->attr.sample_freq;
9310
9311                 event->attr.sample_period = NSEC_PER_SEC / freq;
9312                 hwc->sample_period = event->attr.sample_period;
9313                 local64_set(&hwc->period_left, hwc->sample_period);
9314                 hwc->last_period = hwc->sample_period;
9315                 event->attr.freq = 0;
9316         }
9317 }
9318
9319 /*
9320  * Software event: cpu wall time clock
9321  */
9322
9323 static void cpu_clock_event_update(struct perf_event *event)
9324 {
9325         s64 prev;
9326         u64 now;
9327
9328         now = local_clock();
9329         prev = local64_xchg(&event->hw.prev_count, now);
9330         local64_add(now - prev, &event->count);
9331 }
9332
9333 static void cpu_clock_event_start(struct perf_event *event, int flags)
9334 {
9335         local64_set(&event->hw.prev_count, local_clock());
9336         perf_swevent_start_hrtimer(event);
9337 }
9338
9339 static void cpu_clock_event_stop(struct perf_event *event, int flags)
9340 {
9341         perf_swevent_cancel_hrtimer(event);
9342         cpu_clock_event_update(event);
9343 }
9344
9345 static int cpu_clock_event_add(struct perf_event *event, int flags)
9346 {
9347         if (flags & PERF_EF_START)
9348                 cpu_clock_event_start(event, flags);
9349         perf_event_update_userpage(event);
9350
9351         return 0;
9352 }
9353
9354 static void cpu_clock_event_del(struct perf_event *event, int flags)
9355 {
9356         cpu_clock_event_stop(event, flags);
9357 }
9358
9359 static void cpu_clock_event_read(struct perf_event *event)
9360 {
9361         cpu_clock_event_update(event);
9362 }
9363
9364 static int cpu_clock_event_init(struct perf_event *event)
9365 {
9366         if (event->attr.type != PERF_TYPE_SOFTWARE)
9367                 return -ENOENT;
9368
9369         if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
9370                 return -ENOENT;
9371
9372         /*
9373          * no branch sampling for software events
9374          */
9375         if (has_branch_stack(event))
9376                 return -EOPNOTSUPP;
9377
9378         perf_swevent_init_hrtimer(event);
9379
9380         return 0;
9381 }
9382
9383 static struct pmu perf_cpu_clock = {
9384         .task_ctx_nr    = perf_sw_context,
9385
9386         .capabilities   = PERF_PMU_CAP_NO_NMI,
9387
9388         .event_init     = cpu_clock_event_init,
9389         .add            = cpu_clock_event_add,
9390         .del            = cpu_clock_event_del,
9391         .start          = cpu_clock_event_start,
9392         .stop           = cpu_clock_event_stop,
9393         .read           = cpu_clock_event_read,
9394 };
9395
9396 /*
9397  * Software event: task time clock
9398  */
9399
9400 static void task_clock_event_update(struct perf_event *event, u64 now)
9401 {
9402         u64 prev;
9403         s64 delta;
9404
9405         prev = local64_xchg(&event->hw.prev_count, now);
9406         delta = now - prev;
9407         local64_add(delta, &event->count);
9408 }
9409
9410 static void task_clock_event_start(struct perf_event *event, int flags)
9411 {
9412         local64_set(&event->hw.prev_count, event->ctx->time);
9413         perf_swevent_start_hrtimer(event);
9414 }
9415
9416 static void task_clock_event_stop(struct perf_event *event, int flags)
9417 {
9418         perf_swevent_cancel_hrtimer(event);
9419         task_clock_event_update(event, event->ctx->time);
9420 }
9421
9422 static int task_clock_event_add(struct perf_event *event, int flags)
9423 {
9424         if (flags & PERF_EF_START)
9425                 task_clock_event_start(event, flags);
9426         perf_event_update_userpage(event);
9427
9428         return 0;
9429 }
9430
9431 static void task_clock_event_del(struct perf_event *event, int flags)
9432 {
9433         task_clock_event_stop(event, PERF_EF_UPDATE);
9434 }
9435
9436 static void task_clock_event_read(struct perf_event *event)
9437 {
9438         u64 now = perf_clock();
9439         u64 delta = now - event->ctx->timestamp;
9440         u64 time = event->ctx->time + delta;
9441
9442         task_clock_event_update(event, time);
9443 }
9444
9445 static int task_clock_event_init(struct perf_event *event)
9446 {
9447         if (event->attr.type != PERF_TYPE_SOFTWARE)
9448                 return -ENOENT;
9449
9450         if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
9451                 return -ENOENT;
9452
9453         /*
9454          * no branch sampling for software events
9455          */
9456         if (has_branch_stack(event))
9457                 return -EOPNOTSUPP;
9458
9459         perf_swevent_init_hrtimer(event);
9460
9461         return 0;
9462 }
9463
9464 static struct pmu perf_task_clock = {
9465         .task_ctx_nr    = perf_sw_context,
9466
9467         .capabilities   = PERF_PMU_CAP_NO_NMI,
9468
9469         .event_init     = task_clock_event_init,
9470         .add            = task_clock_event_add,
9471         .del            = task_clock_event_del,
9472         .start          = task_clock_event_start,
9473         .stop           = task_clock_event_stop,
9474         .read           = task_clock_event_read,
9475 };
9476
9477 static void perf_pmu_nop_void(struct pmu *pmu)
9478 {
9479 }
9480
9481 static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
9482 {
9483 }
9484
9485 static int perf_pmu_nop_int(struct pmu *pmu)
9486 {
9487         return 0;
9488 }
9489
9490 static int perf_event_nop_int(struct perf_event *event, u64 value)
9491 {
9492         return 0;
9493 }
9494
9495 static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
9496
9497 static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
9498 {
9499         __this_cpu_write(nop_txn_flags, flags);
9500
9501         if (flags & ~PERF_PMU_TXN_ADD)
9502                 return;
9503
9504         perf_pmu_disable(pmu);
9505 }
9506
9507 static int perf_pmu_commit_txn(struct pmu *pmu)
9508 {
9509         unsigned int flags = __this_cpu_read(nop_txn_flags);
9510
9511         __this_cpu_write(nop_txn_flags, 0);
9512
9513         if (flags & ~PERF_PMU_TXN_ADD)
9514                 return 0;
9515
9516         perf_pmu_enable(pmu);
9517         return 0;
9518 }
9519
9520 static void perf_pmu_cancel_txn(struct pmu *pmu)
9521 {
9522         unsigned int flags =  __this_cpu_read(nop_txn_flags);
9523
9524         __this_cpu_write(nop_txn_flags, 0);
9525
9526         if (flags & ~PERF_PMU_TXN_ADD)
9527                 return;
9528
9529         perf_pmu_enable(pmu);
9530 }
9531
9532 static int perf_event_idx_default(struct perf_event *event)
9533 {
9534         return 0;
9535 }
9536
9537 /*
9538  * Ensures all contexts with the same task_ctx_nr have the same
9539  * pmu_cpu_context too.
9540  */
9541 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
9542 {
9543         struct pmu *pmu;
9544
9545         if (ctxn < 0)
9546                 return NULL;
9547
9548         list_for_each_entry(pmu, &pmus, entry) {
9549                 if (pmu->task_ctx_nr == ctxn)
9550                         return pmu->pmu_cpu_context;
9551         }
9552
9553         return NULL;
9554 }
9555
9556 static void free_pmu_context(struct pmu *pmu)
9557 {
9558         /*
9559          * Static contexts such as perf_sw_context have a global lifetime
9560          * and may be shared between different PMUs. Avoid freeing them
9561          * when a single PMU is going away.
9562          */
9563         if (pmu->task_ctx_nr > perf_invalid_context)
9564                 return;
9565
9566         free_percpu(pmu->pmu_cpu_context);
9567 }
9568
9569 /*
9570  * Let userspace know that this PMU supports address range filtering:
9571  */
9572 static ssize_t nr_addr_filters_show(struct device *dev,
9573                                     struct device_attribute *attr,
9574                                     char *page)
9575 {
9576         struct pmu *pmu = dev_get_drvdata(dev);
9577
9578         return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
9579 }
9580 DEVICE_ATTR_RO(nr_addr_filters);
9581
9582 static struct idr pmu_idr;
9583
9584 static ssize_t
9585 type_show(struct device *dev, struct device_attribute *attr, char *page)
9586 {
9587         struct pmu *pmu = dev_get_drvdata(dev);
9588
9589         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
9590 }
9591 static DEVICE_ATTR_RO(type);
9592
9593 static ssize_t
9594 perf_event_mux_interval_ms_show(struct device *dev,
9595                                 struct device_attribute *attr,
9596                                 char *page)
9597 {
9598         struct pmu *pmu = dev_get_drvdata(dev);
9599
9600         return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
9601 }
9602
9603 static DEFINE_MUTEX(mux_interval_mutex);
9604
9605 static ssize_t
9606 perf_event_mux_interval_ms_store(struct device *dev,
9607                                  struct device_attribute *attr,
9608                                  const char *buf, size_t count)
9609 {
9610         struct pmu *pmu = dev_get_drvdata(dev);
9611         int timer, cpu, ret;
9612
9613         ret = kstrtoint(buf, 0, &timer);
9614         if (ret)
9615                 return ret;
9616
9617         if (timer < 1)
9618                 return -EINVAL;
9619
9620         /* same value, noting to do */
9621         if (timer == pmu->hrtimer_interval_ms)
9622                 return count;
9623
9624         mutex_lock(&mux_interval_mutex);
9625         pmu->hrtimer_interval_ms = timer;
9626
9627         /* update all cpuctx for this PMU */
9628         cpus_read_lock();
9629         for_each_online_cpu(cpu) {
9630                 struct perf_cpu_context *cpuctx;
9631                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9632                 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
9633
9634                 cpu_function_call(cpu,
9635                         (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
9636         }
9637         cpus_read_unlock();
9638         mutex_unlock(&mux_interval_mutex);
9639
9640         return count;
9641 }
9642 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
9643
9644 static struct attribute *pmu_dev_attrs[] = {
9645         &dev_attr_type.attr,
9646         &dev_attr_perf_event_mux_interval_ms.attr,
9647         NULL,
9648 };
9649 ATTRIBUTE_GROUPS(pmu_dev);
9650
9651 static int pmu_bus_running;
9652 static struct bus_type pmu_bus = {
9653         .name           = "event_source",
9654         .dev_groups     = pmu_dev_groups,
9655 };
9656
9657 static void pmu_dev_release(struct device *dev)
9658 {
9659         kfree(dev);
9660 }
9661
9662 static int pmu_dev_alloc(struct pmu *pmu)
9663 {
9664         int ret = -ENOMEM;
9665
9666         pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
9667         if (!pmu->dev)
9668                 goto out;
9669
9670         pmu->dev->groups = pmu->attr_groups;
9671         device_initialize(pmu->dev);
9672         ret = dev_set_name(pmu->dev, "%s", pmu->name);
9673         if (ret)
9674                 goto free_dev;
9675
9676         dev_set_drvdata(pmu->dev, pmu);
9677         pmu->dev->bus = &pmu_bus;
9678         pmu->dev->release = pmu_dev_release;
9679         ret = device_add(pmu->dev);
9680         if (ret)
9681                 goto free_dev;
9682
9683         /* For PMUs with address filters, throw in an extra attribute: */
9684         if (pmu->nr_addr_filters)
9685                 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
9686
9687         if (ret)
9688                 goto del_dev;
9689
9690 out:
9691         return ret;
9692
9693 del_dev:
9694         device_del(pmu->dev);
9695
9696 free_dev:
9697         put_device(pmu->dev);
9698         goto out;
9699 }
9700
9701 static struct lock_class_key cpuctx_mutex;
9702 static struct lock_class_key cpuctx_lock;
9703
9704 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
9705 {
9706         int cpu, ret;
9707
9708         mutex_lock(&pmus_lock);
9709         ret = -ENOMEM;
9710         pmu->pmu_disable_count = alloc_percpu(int);
9711         if (!pmu->pmu_disable_count)
9712                 goto unlock;
9713
9714         pmu->type = -1;
9715         if (!name)
9716                 goto skip_type;
9717         pmu->name = name;
9718
9719         if (type < 0) {
9720                 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
9721                 if (type < 0) {
9722                         ret = type;
9723                         goto free_pdc;
9724                 }
9725         }
9726         pmu->type = type;
9727
9728         if (pmu_bus_running) {
9729                 ret = pmu_dev_alloc(pmu);
9730                 if (ret)
9731                         goto free_idr;
9732         }
9733
9734 skip_type:
9735         if (pmu->task_ctx_nr == perf_hw_context) {
9736                 static int hw_context_taken = 0;
9737
9738                 /*
9739                  * Other than systems with heterogeneous CPUs, it never makes
9740                  * sense for two PMUs to share perf_hw_context. PMUs which are
9741                  * uncore must use perf_invalid_context.
9742                  */
9743                 if (WARN_ON_ONCE(hw_context_taken &&
9744                     !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
9745                         pmu->task_ctx_nr = perf_invalid_context;
9746
9747                 hw_context_taken = 1;
9748         }
9749
9750         pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
9751         if (pmu->pmu_cpu_context)
9752                 goto got_cpu_context;
9753
9754         ret = -ENOMEM;
9755         pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
9756         if (!pmu->pmu_cpu_context)
9757                 goto free_dev;
9758
9759         for_each_possible_cpu(cpu) {
9760                 struct perf_cpu_context *cpuctx;
9761
9762                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
9763                 __perf_event_init_context(&cpuctx->ctx);
9764                 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
9765                 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
9766                 cpuctx->ctx.pmu = pmu;
9767                 cpuctx->online = cpumask_test_cpu(cpu, perf_online_mask);
9768
9769                 __perf_mux_hrtimer_init(cpuctx, cpu);
9770         }
9771
9772 got_cpu_context:
9773         if (!pmu->start_txn) {
9774                 if (pmu->pmu_enable) {
9775                         /*
9776                          * If we have pmu_enable/pmu_disable calls, install
9777                          * transaction stubs that use that to try and batch
9778                          * hardware accesses.
9779                          */
9780                         pmu->start_txn  = perf_pmu_start_txn;
9781                         pmu->commit_txn = perf_pmu_commit_txn;
9782                         pmu->cancel_txn = perf_pmu_cancel_txn;
9783                 } else {
9784                         pmu->start_txn  = perf_pmu_nop_txn;
9785                         pmu->commit_txn = perf_pmu_nop_int;
9786                         pmu->cancel_txn = perf_pmu_nop_void;
9787                 }
9788         }
9789
9790         if (!pmu->pmu_enable) {
9791                 pmu->pmu_enable  = perf_pmu_nop_void;
9792                 pmu->pmu_disable = perf_pmu_nop_void;
9793         }
9794
9795         if (!pmu->check_period)
9796                 pmu->check_period = perf_event_nop_int;
9797
9798         if (!pmu->event_idx)
9799                 pmu->event_idx = perf_event_idx_default;
9800
9801         list_add_rcu(&pmu->entry, &pmus);
9802         atomic_set(&pmu->exclusive_cnt, 0);
9803         ret = 0;
9804 unlock:
9805         mutex_unlock(&pmus_lock);
9806
9807         return ret;
9808
9809 free_dev:
9810         device_del(pmu->dev);
9811         put_device(pmu->dev);
9812
9813 free_idr:
9814         if (pmu->type >= PERF_TYPE_MAX)
9815                 idr_remove(&pmu_idr, pmu->type);
9816
9817 free_pdc:
9818         free_percpu(pmu->pmu_disable_count);
9819         goto unlock;
9820 }
9821 EXPORT_SYMBOL_GPL(perf_pmu_register);
9822
9823 void perf_pmu_unregister(struct pmu *pmu)
9824 {
9825         mutex_lock(&pmus_lock);
9826         list_del_rcu(&pmu->entry);
9827
9828         /*
9829          * We dereference the pmu list under both SRCU and regular RCU, so
9830          * synchronize against both of those.
9831          */
9832         synchronize_srcu(&pmus_srcu);
9833         synchronize_rcu();
9834
9835         free_percpu(pmu->pmu_disable_count);
9836         if (pmu->type >= PERF_TYPE_MAX)
9837                 idr_remove(&pmu_idr, pmu->type);
9838         if (pmu_bus_running) {
9839                 if (pmu->nr_addr_filters)
9840                         device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
9841                 device_del(pmu->dev);
9842                 put_device(pmu->dev);
9843         }
9844         free_pmu_context(pmu);
9845         mutex_unlock(&pmus_lock);
9846 }
9847 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
9848
9849 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
9850 {
9851         struct perf_event_context *ctx = NULL;
9852         int ret;
9853
9854         if (!try_module_get(pmu->module))
9855                 return -ENODEV;
9856
9857         /*
9858          * A number of pmu->event_init() methods iterate the sibling_list to,
9859          * for example, validate if the group fits on the PMU. Therefore,
9860          * if this is a sibling event, acquire the ctx->mutex to protect
9861          * the sibling_list.
9862          */
9863         if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) {
9864                 /*
9865                  * This ctx->mutex can nest when we're called through
9866                  * inheritance. See the perf_event_ctx_lock_nested() comment.
9867                  */
9868                 ctx = perf_event_ctx_lock_nested(event->group_leader,
9869                                                  SINGLE_DEPTH_NESTING);
9870                 BUG_ON(!ctx);
9871         }
9872
9873         event->pmu = pmu;
9874         ret = pmu->event_init(event);
9875
9876         if (ctx)
9877                 perf_event_ctx_unlock(event->group_leader, ctx);
9878
9879         if (ret)
9880                 module_put(pmu->module);
9881
9882         return ret;
9883 }
9884
9885 static struct pmu *perf_init_event(struct perf_event *event)
9886 {
9887         struct pmu *pmu;
9888         int idx;
9889         int ret;
9890
9891         idx = srcu_read_lock(&pmus_srcu);
9892
9893         /* Try parent's PMU first: */
9894         if (event->parent && event->parent->pmu) {
9895                 pmu = event->parent->pmu;
9896                 ret = perf_try_init_event(pmu, event);
9897                 if (!ret)
9898                         goto unlock;
9899         }
9900
9901         rcu_read_lock();
9902         pmu = idr_find(&pmu_idr, event->attr.type);
9903         rcu_read_unlock();
9904         if (pmu) {
9905                 ret = perf_try_init_event(pmu, event);
9906                 if (ret)
9907                         pmu = ERR_PTR(ret);
9908                 goto unlock;
9909         }
9910
9911         list_for_each_entry_rcu(pmu, &pmus, entry) {
9912                 ret = perf_try_init_event(pmu, event);
9913                 if (!ret)
9914                         goto unlock;
9915
9916                 if (ret != -ENOENT) {
9917                         pmu = ERR_PTR(ret);
9918                         goto unlock;
9919                 }
9920         }
9921         pmu = ERR_PTR(-ENOENT);
9922 unlock:
9923         srcu_read_unlock(&pmus_srcu, idx);
9924
9925         return pmu;
9926 }
9927
9928 static void attach_sb_event(struct perf_event *event)
9929 {
9930         struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
9931
9932         raw_spin_lock(&pel->lock);
9933         list_add_rcu(&event->sb_list, &pel->list);
9934         raw_spin_unlock(&pel->lock);
9935 }
9936
9937 /*
9938  * We keep a list of all !task (and therefore per-cpu) events
9939  * that need to receive side-band records.
9940  *
9941  * This avoids having to scan all the various PMU per-cpu contexts
9942  * looking for them.
9943  */
9944 static void account_pmu_sb_event(struct perf_event *event)
9945 {
9946         if (is_sb_event(event))
9947                 attach_sb_event(event);
9948 }
9949
9950 static void account_event_cpu(struct perf_event *event, int cpu)
9951 {
9952         if (event->parent)
9953                 return;
9954
9955         if (is_cgroup_event(event))
9956                 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
9957 }
9958
9959 /* Freq events need the tick to stay alive (see perf_event_task_tick). */
9960 static void account_freq_event_nohz(void)
9961 {
9962 #ifdef CONFIG_NO_HZ_FULL
9963         /* Lock so we don't race with concurrent unaccount */
9964         spin_lock(&nr_freq_lock);
9965         if (atomic_inc_return(&nr_freq_events) == 1)
9966                 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
9967         spin_unlock(&nr_freq_lock);
9968 #endif
9969 }
9970
9971 static void account_freq_event(void)
9972 {
9973         if (tick_nohz_full_enabled())
9974                 account_freq_event_nohz();
9975         else
9976                 atomic_inc(&nr_freq_events);
9977 }
9978
9979
9980 static void account_event(struct perf_event *event)
9981 {
9982         bool inc = false;
9983
9984         if (event->parent)
9985                 return;
9986
9987         if (event->attach_state & PERF_ATTACH_TASK)
9988                 inc = true;
9989         if (event->attr.mmap || event->attr.mmap_data)
9990                 atomic_inc(&nr_mmap_events);
9991         if (event->attr.comm)
9992                 atomic_inc(&nr_comm_events);
9993         if (event->attr.namespaces)
9994                 atomic_inc(&nr_namespaces_events);
9995         if (event->attr.task)
9996                 atomic_inc(&nr_task_events);
9997         if (event->attr.freq)
9998                 account_freq_event();
9999         if (event->attr.context_switch) {
10000                 atomic_inc(&nr_switch_events);
10001                 inc = true;
10002         }
10003         if (has_branch_stack(event))
10004                 inc = true;
10005         if (is_cgroup_event(event))
10006                 inc = true;
10007
10008         if (inc) {
10009                 /*
10010                  * We need the mutex here because static_branch_enable()
10011                  * must complete *before* the perf_sched_count increment
10012                  * becomes visible.
10013                  */
10014                 if (atomic_inc_not_zero(&perf_sched_count))
10015                         goto enabled;
10016
10017                 mutex_lock(&perf_sched_mutex);
10018                 if (!atomic_read(&perf_sched_count)) {
10019                         static_branch_enable(&perf_sched_events);
10020                         /*
10021                          * Guarantee that all CPUs observe they key change and
10022                          * call the perf scheduling hooks before proceeding to
10023                          * install events that need them.
10024                          */
10025                         synchronize_sched();
10026                 }
10027                 /*
10028                  * Now that we have waited for the sync_sched(), allow further
10029                  * increments to by-pass the mutex.
10030                  */
10031                 atomic_inc(&perf_sched_count);
10032                 mutex_unlock(&perf_sched_mutex);
10033         }
10034 enabled:
10035
10036         account_event_cpu(event, event->cpu);
10037
10038         account_pmu_sb_event(event);
10039 }
10040
10041 /*
10042  * Allocate and initialize an event structure
10043  */
10044 static struct perf_event *
10045 perf_event_alloc(struct perf_event_attr *attr, int cpu,
10046                  struct task_struct *task,
10047                  struct perf_event *group_leader,
10048                  struct perf_event *parent_event,
10049                  perf_overflow_handler_t overflow_handler,
10050                  void *context, int cgroup_fd)
10051 {
10052         struct pmu *pmu;
10053         struct perf_event *event;
10054         struct hw_perf_event *hwc;
10055         long err = -EINVAL;
10056
10057         if ((unsigned)cpu >= nr_cpu_ids) {
10058                 if (!task || cpu != -1)
10059                         return ERR_PTR(-EINVAL);
10060         }
10061
10062         event = kzalloc(sizeof(*event), GFP_KERNEL);
10063         if (!event)
10064                 return ERR_PTR(-ENOMEM);
10065
10066         /*
10067          * Single events are their own group leaders, with an
10068          * empty sibling list:
10069          */
10070         if (!group_leader)
10071                 group_leader = event;
10072
10073         mutex_init(&event->child_mutex);
10074         INIT_LIST_HEAD(&event->child_list);
10075
10076         INIT_LIST_HEAD(&event->event_entry);
10077         INIT_LIST_HEAD(&event->sibling_list);
10078         INIT_LIST_HEAD(&event->active_list);
10079         init_event_group(event);
10080         INIT_LIST_HEAD(&event->rb_entry);
10081         INIT_LIST_HEAD(&event->active_entry);
10082         INIT_LIST_HEAD(&event->addr_filters.list);
10083         INIT_HLIST_NODE(&event->hlist_entry);
10084
10085
10086         init_waitqueue_head(&event->waitq);
10087         event->pending_disable = -1;
10088         init_irq_work(&event->pending, perf_pending_event);
10089
10090         mutex_init(&event->mmap_mutex);
10091         raw_spin_lock_init(&event->addr_filters.lock);
10092
10093         atomic_long_set(&event->refcount, 1);
10094         event->cpu              = cpu;
10095         event->attr             = *attr;
10096         event->group_leader     = group_leader;
10097         event->pmu              = NULL;
10098         event->oncpu            = -1;
10099
10100         event->parent           = parent_event;
10101
10102         event->ns               = get_pid_ns(task_active_pid_ns(current));
10103         event->id               = atomic64_inc_return(&perf_event_id);
10104
10105         event->state            = PERF_EVENT_STATE_INACTIVE;
10106
10107         if (task) {
10108                 event->attach_state = PERF_ATTACH_TASK;
10109                 /*
10110                  * XXX pmu::event_init needs to know what task to account to
10111                  * and we cannot use the ctx information because we need the
10112                  * pmu before we get a ctx.
10113                  */
10114                 get_task_struct(task);
10115                 event->hw.target = task;
10116         }
10117
10118         event->clock = &local_clock;
10119         if (parent_event)
10120                 event->clock = parent_event->clock;
10121
10122         if (!overflow_handler && parent_event) {
10123                 overflow_handler = parent_event->overflow_handler;
10124                 context = parent_event->overflow_handler_context;
10125 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
10126                 if (overflow_handler == bpf_overflow_handler) {
10127                         struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
10128
10129                         if (IS_ERR(prog)) {
10130                                 err = PTR_ERR(prog);
10131                                 goto err_ns;
10132                         }
10133                         event->prog = prog;
10134                         event->orig_overflow_handler =
10135                                 parent_event->orig_overflow_handler;
10136                 }
10137 #endif
10138         }
10139
10140         if (overflow_handler) {
10141                 event->overflow_handler = overflow_handler;
10142                 event->overflow_handler_context = context;
10143         } else if (is_write_backward(event)){
10144                 event->overflow_handler = perf_event_output_backward;
10145                 event->overflow_handler_context = NULL;
10146         } else {
10147                 event->overflow_handler = perf_event_output_forward;
10148                 event->overflow_handler_context = NULL;
10149         }
10150
10151         perf_event__state_init(event);
10152
10153         pmu = NULL;
10154
10155         hwc = &event->hw;
10156         hwc->sample_period = attr->sample_period;
10157         if (attr->freq && attr->sample_freq)
10158                 hwc->sample_period = 1;
10159         hwc->last_period = hwc->sample_period;
10160
10161         local64_set(&hwc->period_left, hwc->sample_period);
10162
10163         /*
10164          * We currently do not support PERF_SAMPLE_READ on inherited events.
10165          * See perf_output_read().
10166          */
10167         if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
10168                 goto err_ns;
10169
10170         if (!has_branch_stack(event))
10171                 event->attr.branch_sample_type = 0;
10172
10173         if (cgroup_fd != -1) {
10174                 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
10175                 if (err)
10176                         goto err_ns;
10177         }
10178
10179         pmu = perf_init_event(event);
10180         if (IS_ERR(pmu)) {
10181                 err = PTR_ERR(pmu);
10182                 goto err_ns;
10183         }
10184
10185         err = exclusive_event_init(event);
10186         if (err)
10187                 goto err_pmu;
10188
10189         if (has_addr_filter(event)) {
10190                 event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
10191                                                     sizeof(struct perf_addr_filter_range),
10192                                                     GFP_KERNEL);
10193                 if (!event->addr_filter_ranges) {
10194                         err = -ENOMEM;
10195                         goto err_per_task;
10196                 }
10197
10198                 /*
10199                  * Clone the parent's vma offsets: they are valid until exec()
10200                  * even if the mm is not shared with the parent.
10201                  */
10202                 if (event->parent) {
10203                         struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
10204
10205                         raw_spin_lock_irq(&ifh->lock);
10206                         memcpy(event->addr_filter_ranges,
10207                                event->parent->addr_filter_ranges,
10208                                pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
10209                         raw_spin_unlock_irq(&ifh->lock);
10210                 }
10211
10212                 /* force hw sync on the address filters */
10213                 event->addr_filters_gen = 1;
10214         }
10215
10216         if (!event->parent) {
10217                 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
10218                         err = get_callchain_buffers(attr->sample_max_stack);
10219                         if (err)
10220                                 goto err_addr_filters;
10221                 }
10222         }
10223
10224         /* symmetric to unaccount_event() in _free_event() */
10225         account_event(event);
10226
10227         return event;
10228
10229 err_addr_filters:
10230         kfree(event->addr_filter_ranges);
10231
10232 err_per_task:
10233         exclusive_event_destroy(event);
10234
10235 err_pmu:
10236         if (event->destroy)
10237                 event->destroy(event);
10238         module_put(pmu->module);
10239 err_ns:
10240         if (is_cgroup_event(event))
10241                 perf_detach_cgroup(event);
10242         if (event->ns)
10243                 put_pid_ns(event->ns);
10244         if (event->hw.target)
10245                 put_task_struct(event->hw.target);
10246         kfree(event);
10247
10248         return ERR_PTR(err);
10249 }
10250
10251 static int perf_copy_attr(struct perf_event_attr __user *uattr,
10252                           struct perf_event_attr *attr)
10253 {
10254         u32 size;
10255         int ret;
10256
10257         if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
10258                 return -EFAULT;
10259
10260         /*
10261          * zero the full structure, so that a short copy will be nice.
10262          */
10263         memset(attr, 0, sizeof(*attr));
10264
10265         ret = get_user(size, &uattr->size);
10266         if (ret)
10267                 return ret;
10268
10269         if (size > PAGE_SIZE)   /* silly large */
10270                 goto err_size;
10271
10272         if (!size)              /* abi compat */
10273                 size = PERF_ATTR_SIZE_VER0;
10274
10275         if (size < PERF_ATTR_SIZE_VER0)
10276                 goto err_size;
10277
10278         /*
10279          * If we're handed a bigger struct than we know of,
10280          * ensure all the unknown bits are 0 - i.e. new
10281          * user-space does not rely on any kernel feature
10282          * extensions we dont know about yet.
10283          */
10284         if (size > sizeof(*attr)) {
10285                 unsigned char __user *addr;
10286                 unsigned char __user *end;
10287                 unsigned char val;
10288
10289                 addr = (void __user *)uattr + sizeof(*attr);
10290                 end  = (void __user *)uattr + size;
10291
10292                 for (; addr < end; addr++) {
10293                         ret = get_user(val, addr);
10294                         if (ret)
10295                                 return ret;
10296                         if (val)
10297                                 goto err_size;
10298                 }
10299                 size = sizeof(*attr);
10300         }
10301
10302         ret = copy_from_user(attr, uattr, size);
10303         if (ret)
10304                 return -EFAULT;
10305
10306         attr->size = size;
10307
10308         if (attr->__reserved_1)
10309                 return -EINVAL;
10310
10311         if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
10312                 return -EINVAL;
10313
10314         if (attr->read_format & ~(PERF_FORMAT_MAX-1))
10315                 return -EINVAL;
10316
10317         if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
10318                 u64 mask = attr->branch_sample_type;
10319
10320                 /* only using defined bits */
10321                 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
10322                         return -EINVAL;
10323
10324                 /* at least one branch bit must be set */
10325                 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
10326                         return -EINVAL;
10327
10328                 /* propagate priv level, when not set for branch */
10329                 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
10330
10331                         /* exclude_kernel checked on syscall entry */
10332                         if (!attr->exclude_kernel)
10333                                 mask |= PERF_SAMPLE_BRANCH_KERNEL;
10334
10335                         if (!attr->exclude_user)
10336                                 mask |= PERF_SAMPLE_BRANCH_USER;
10337
10338                         if (!attr->exclude_hv)
10339                                 mask |= PERF_SAMPLE_BRANCH_HV;
10340                         /*
10341                          * adjust user setting (for HW filter setup)
10342                          */
10343                         attr->branch_sample_type = mask;
10344                 }
10345                 /* privileged levels capture (kernel, hv): check permissions */
10346                 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
10347                     && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
10348                         return -EACCES;
10349         }
10350
10351         if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
10352                 ret = perf_reg_validate(attr->sample_regs_user);
10353                 if (ret)
10354                         return ret;
10355         }
10356
10357         if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
10358                 if (!arch_perf_have_user_stack_dump())
10359                         return -ENOSYS;
10360
10361                 /*
10362                  * We have __u32 type for the size, but so far
10363                  * we can only use __u16 as maximum due to the
10364                  * __u16 sample size limit.
10365                  */
10366                 if (attr->sample_stack_user >= USHRT_MAX)
10367                         return -EINVAL;
10368                 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
10369                         return -EINVAL;
10370         }
10371
10372         if (!attr->sample_max_stack)
10373                 attr->sample_max_stack = sysctl_perf_event_max_stack;
10374
10375         if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
10376                 ret = perf_reg_validate(attr->sample_regs_intr);
10377 out:
10378         return ret;
10379
10380 err_size:
10381         put_user(sizeof(*attr), &uattr->size);
10382         ret = -E2BIG;
10383         goto out;
10384 }
10385
10386 static int
10387 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
10388 {
10389         struct ring_buffer *rb = NULL;
10390         int ret = -EINVAL;
10391
10392         if (!output_event)
10393                 goto set;
10394
10395         /* don't allow circular references */
10396         if (event == output_event)
10397                 goto out;
10398
10399         /*
10400          * Don't allow cross-cpu buffers
10401          */
10402         if (output_event->cpu != event->cpu)
10403                 goto out;
10404
10405         /*
10406          * If its not a per-cpu rb, it must be the same task.
10407          */
10408         if (output_event->cpu == -1 && output_event->ctx != event->ctx)
10409                 goto out;
10410
10411         /*
10412          * Mixing clocks in the same buffer is trouble you don't need.
10413          */
10414         if (output_event->clock != event->clock)
10415                 goto out;
10416
10417         /*
10418          * Either writing ring buffer from beginning or from end.
10419          * Mixing is not allowed.
10420          */
10421         if (is_write_backward(output_event) != is_write_backward(event))
10422                 goto out;
10423
10424         /*
10425          * If both events generate aux data, they must be on the same PMU
10426          */
10427         if (has_aux(event) && has_aux(output_event) &&
10428             event->pmu != output_event->pmu)
10429                 goto out;
10430
10431 set:
10432         mutex_lock(&event->mmap_mutex);
10433         /* Can't redirect output if we've got an active mmap() */
10434         if (atomic_read(&event->mmap_count))
10435                 goto unlock;
10436
10437         if (output_event) {
10438                 /* get the rb we want to redirect to */
10439                 rb = ring_buffer_get(output_event);
10440                 if (!rb)
10441                         goto unlock;
10442         }
10443
10444         ring_buffer_attach(event, rb);
10445
10446         ret = 0;
10447 unlock:
10448         mutex_unlock(&event->mmap_mutex);
10449
10450 out:
10451         return ret;
10452 }
10453
10454 static void mutex_lock_double(struct mutex *a, struct mutex *b)
10455 {
10456         if (b < a)
10457                 swap(a, b);
10458
10459         mutex_lock(a);
10460         mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
10461 }
10462
10463 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
10464 {
10465         bool nmi_safe = false;
10466
10467         switch (clk_id) {
10468         case CLOCK_MONOTONIC:
10469                 event->clock = &ktime_get_mono_fast_ns;
10470                 nmi_safe = true;
10471                 break;
10472
10473         case CLOCK_MONOTONIC_RAW:
10474                 event->clock = &ktime_get_raw_fast_ns;
10475                 nmi_safe = true;
10476                 break;
10477
10478         case CLOCK_REALTIME:
10479                 event->clock = &ktime_get_real_ns;
10480                 break;
10481
10482         case CLOCK_BOOTTIME:
10483                 event->clock = &ktime_get_boot_ns;
10484                 break;
10485
10486         case CLOCK_TAI:
10487                 event->clock = &ktime_get_tai_ns;
10488                 break;
10489
10490         default:
10491                 return -EINVAL;
10492         }
10493
10494         if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
10495                 return -EINVAL;
10496
10497         return 0;
10498 }
10499
10500 /*
10501  * Variation on perf_event_ctx_lock_nested(), except we take two context
10502  * mutexes.
10503  */
10504 static struct perf_event_context *
10505 __perf_event_ctx_lock_double(struct perf_event *group_leader,
10506                              struct perf_event_context *ctx)
10507 {
10508         struct perf_event_context *gctx;
10509
10510 again:
10511         rcu_read_lock();
10512         gctx = READ_ONCE(group_leader->ctx);
10513         if (!atomic_inc_not_zero(&gctx->refcount)) {
10514                 rcu_read_unlock();
10515                 goto again;
10516         }
10517         rcu_read_unlock();
10518
10519         mutex_lock_double(&gctx->mutex, &ctx->mutex);
10520
10521         if (group_leader->ctx != gctx) {
10522                 mutex_unlock(&ctx->mutex);
10523                 mutex_unlock(&gctx->mutex);
10524                 put_ctx(gctx);
10525                 goto again;
10526         }
10527
10528         return gctx;
10529 }
10530
10531 /**
10532  * sys_perf_event_open - open a performance event, associate it to a task/cpu
10533  *
10534  * @attr_uptr:  event_id type attributes for monitoring/sampling
10535  * @pid:                target pid
10536  * @cpu:                target cpu
10537  * @group_fd:           group leader event fd
10538  */
10539 SYSCALL_DEFINE5(perf_event_open,
10540                 struct perf_event_attr __user *, attr_uptr,
10541                 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
10542 {
10543         struct perf_event *group_leader = NULL, *output_event = NULL;
10544         struct perf_event *event, *sibling;
10545         struct perf_event_attr attr;
10546         struct perf_event_context *ctx, *uninitialized_var(gctx);
10547         struct file *event_file = NULL;
10548         struct fd group = {NULL, 0};
10549         struct task_struct *task = NULL;
10550         struct pmu *pmu;
10551         int event_fd;
10552         int move_group = 0;
10553         int err;
10554         int f_flags = O_RDWR;
10555         int cgroup_fd = -1;
10556
10557         /* for future expandability... */
10558         if (flags & ~PERF_FLAG_ALL)
10559                 return -EINVAL;
10560
10561         err = perf_copy_attr(attr_uptr, &attr);
10562         if (err)
10563                 return err;
10564
10565         if (!attr.exclude_kernel) {
10566                 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
10567                         return -EACCES;
10568         }
10569
10570         if (attr.namespaces) {
10571                 if (!capable(CAP_SYS_ADMIN))
10572                         return -EACCES;
10573         }
10574
10575         if (attr.freq) {
10576                 if (attr.sample_freq > sysctl_perf_event_sample_rate)
10577                         return -EINVAL;
10578         } else {
10579                 if (attr.sample_period & (1ULL << 63))
10580                         return -EINVAL;
10581         }
10582
10583         /* Only privileged users can get physical addresses */
10584         if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
10585             perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
10586                 return -EACCES;
10587
10588         /*
10589          * In cgroup mode, the pid argument is used to pass the fd
10590          * opened to the cgroup directory in cgroupfs. The cpu argument
10591          * designates the cpu on which to monitor threads from that
10592          * cgroup.
10593          */
10594         if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
10595                 return -EINVAL;
10596
10597         if (flags & PERF_FLAG_FD_CLOEXEC)
10598                 f_flags |= O_CLOEXEC;
10599
10600         event_fd = get_unused_fd_flags(f_flags);
10601         if (event_fd < 0)
10602                 return event_fd;
10603
10604         if (group_fd != -1) {
10605                 err = perf_fget_light(group_fd, &group);
10606                 if (err)
10607                         goto err_fd;
10608                 group_leader = group.file->private_data;
10609                 if (flags & PERF_FLAG_FD_OUTPUT)
10610                         output_event = group_leader;
10611                 if (flags & PERF_FLAG_FD_NO_GROUP)
10612                         group_leader = NULL;
10613         }
10614
10615         if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
10616                 task = find_lively_task_by_vpid(pid);
10617                 if (IS_ERR(task)) {
10618                         err = PTR_ERR(task);
10619                         goto err_group_fd;
10620                 }
10621         }
10622
10623         if (task && group_leader &&
10624             group_leader->attr.inherit != attr.inherit) {
10625                 err = -EINVAL;
10626                 goto err_task;
10627         }
10628
10629         if (task) {
10630                 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
10631                 if (err)
10632                         goto err_task;
10633
10634                 /*
10635                  * Reuse ptrace permission checks for now.
10636                  *
10637                  * We must hold cred_guard_mutex across this and any potential
10638                  * perf_install_in_context() call for this new event to
10639                  * serialize against exec() altering our credentials (and the
10640                  * perf_event_exit_task() that could imply).
10641                  */
10642                 err = -EACCES;
10643                 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
10644                         goto err_cred;
10645         }
10646
10647         if (flags & PERF_FLAG_PID_CGROUP)
10648                 cgroup_fd = pid;
10649
10650         event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
10651                                  NULL, NULL, cgroup_fd);
10652         if (IS_ERR(event)) {
10653                 err = PTR_ERR(event);
10654                 goto err_cred;
10655         }
10656
10657         if (is_sampling_event(event)) {
10658                 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
10659                         err = -EOPNOTSUPP;
10660                         goto err_alloc;
10661                 }
10662         }
10663
10664         /*
10665          * Special case software events and allow them to be part of
10666          * any hardware group.
10667          */
10668         pmu = event->pmu;
10669
10670         if (attr.use_clockid) {
10671                 err = perf_event_set_clock(event, attr.clockid);
10672                 if (err)
10673                         goto err_alloc;
10674         }
10675
10676         if (pmu->task_ctx_nr == perf_sw_context)
10677                 event->event_caps |= PERF_EV_CAP_SOFTWARE;
10678
10679         if (group_leader) {
10680                 if (is_software_event(event) &&
10681                     !in_software_context(group_leader)) {
10682                         /*
10683                          * If the event is a sw event, but the group_leader
10684                          * is on hw context.
10685                          *
10686                          * Allow the addition of software events to hw
10687                          * groups, this is safe because software events
10688                          * never fail to schedule.
10689                          */
10690                         pmu = group_leader->ctx->pmu;
10691                 } else if (!is_software_event(event) &&
10692                            is_software_event(group_leader) &&
10693                            (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10694                         /*
10695                          * In case the group is a pure software group, and we
10696                          * try to add a hardware event, move the whole group to
10697                          * the hardware context.
10698                          */
10699                         move_group = 1;
10700                 }
10701         }
10702
10703         /*
10704          * Get the target context (task or percpu):
10705          */
10706         ctx = find_get_context(pmu, task, event);
10707         if (IS_ERR(ctx)) {
10708                 err = PTR_ERR(ctx);
10709                 goto err_alloc;
10710         }
10711
10712         /*
10713          * Look up the group leader (we will attach this event to it):
10714          */
10715         if (group_leader) {
10716                 err = -EINVAL;
10717
10718                 /*
10719                  * Do not allow a recursive hierarchy (this new sibling
10720                  * becoming part of another group-sibling):
10721                  */
10722                 if (group_leader->group_leader != group_leader)
10723                         goto err_context;
10724
10725                 /* All events in a group should have the same clock */
10726                 if (group_leader->clock != event->clock)
10727                         goto err_context;
10728
10729                 /*
10730                  * Make sure we're both events for the same CPU;
10731                  * grouping events for different CPUs is broken; since
10732                  * you can never concurrently schedule them anyhow.
10733                  */
10734                 if (group_leader->cpu != event->cpu)
10735                         goto err_context;
10736
10737                 /*
10738                  * Make sure we're both on the same task, or both
10739                  * per-CPU events.
10740                  */
10741                 if (group_leader->ctx->task != ctx->task)
10742                         goto err_context;
10743
10744                 /*
10745                  * Do not allow to attach to a group in a different task
10746                  * or CPU context. If we're moving SW events, we'll fix
10747                  * this up later, so allow that.
10748                  */
10749                 if (!move_group && group_leader->ctx != ctx)
10750                         goto err_context;
10751
10752                 /*
10753                  * Only a group leader can be exclusive or pinned
10754                  */
10755                 if (attr.exclusive || attr.pinned)
10756                         goto err_context;
10757         }
10758
10759         if (output_event) {
10760                 err = perf_event_set_output(event, output_event);
10761                 if (err)
10762                         goto err_context;
10763         }
10764
10765         event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
10766                                         f_flags);
10767         if (IS_ERR(event_file)) {
10768                 err = PTR_ERR(event_file);
10769                 event_file = NULL;
10770                 goto err_context;
10771         }
10772
10773         if (move_group) {
10774                 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
10775
10776                 if (gctx->task == TASK_TOMBSTONE) {
10777                         err = -ESRCH;
10778                         goto err_locked;
10779                 }
10780
10781                 /*
10782                  * Check if we raced against another sys_perf_event_open() call
10783                  * moving the software group underneath us.
10784                  */
10785                 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
10786                         /*
10787                          * If someone moved the group out from under us, check
10788                          * if this new event wound up on the same ctx, if so
10789                          * its the regular !move_group case, otherwise fail.
10790                          */
10791                         if (gctx != ctx) {
10792                                 err = -EINVAL;
10793                                 goto err_locked;
10794                         } else {
10795                                 perf_event_ctx_unlock(group_leader, gctx);
10796                                 move_group = 0;
10797                         }
10798                 }
10799
10800                 /*
10801                  * Failure to create exclusive events returns -EBUSY.
10802                  */
10803                 err = -EBUSY;
10804                 if (!exclusive_event_installable(group_leader, ctx))
10805                         goto err_locked;
10806
10807                 for_each_sibling_event(sibling, group_leader) {
10808                         if (!exclusive_event_installable(sibling, ctx))
10809                                 goto err_locked;
10810                 }
10811         } else {
10812                 mutex_lock(&ctx->mutex);
10813         }
10814
10815         if (ctx->task == TASK_TOMBSTONE) {
10816                 err = -ESRCH;
10817                 goto err_locked;
10818         }
10819
10820         if (!perf_event_validate_size(event)) {
10821                 err = -E2BIG;
10822                 goto err_locked;
10823         }
10824
10825         if (!task) {
10826                 /*
10827                  * Check if the @cpu we're creating an event for is online.
10828                  *
10829                  * We use the perf_cpu_context::ctx::mutex to serialize against
10830                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
10831                  */
10832                 struct perf_cpu_context *cpuctx =
10833                         container_of(ctx, struct perf_cpu_context, ctx);
10834
10835                 if (!cpuctx->online) {
10836                         err = -ENODEV;
10837                         goto err_locked;
10838                 }
10839         }
10840
10841
10842         /*
10843          * Must be under the same ctx::mutex as perf_install_in_context(),
10844          * because we need to serialize with concurrent event creation.
10845          */
10846         if (!exclusive_event_installable(event, ctx)) {
10847                 err = -EBUSY;
10848                 goto err_locked;
10849         }
10850
10851         WARN_ON_ONCE(ctx->parent_ctx);
10852
10853         /*
10854          * This is the point on no return; we cannot fail hereafter. This is
10855          * where we start modifying current state.
10856          */
10857
10858         if (move_group) {
10859                 /*
10860                  * See perf_event_ctx_lock() for comments on the details
10861                  * of swizzling perf_event::ctx.
10862                  */
10863                 perf_remove_from_context(group_leader, 0);
10864                 put_ctx(gctx);
10865
10866                 for_each_sibling_event(sibling, group_leader) {
10867                         perf_remove_from_context(sibling, 0);
10868                         put_ctx(gctx);
10869                 }
10870
10871                 /*
10872                  * Wait for everybody to stop referencing the events through
10873                  * the old lists, before installing it on new lists.
10874                  */
10875                 synchronize_rcu();
10876
10877                 /*
10878                  * Install the group siblings before the group leader.
10879                  *
10880                  * Because a group leader will try and install the entire group
10881                  * (through the sibling list, which is still in-tact), we can
10882                  * end up with siblings installed in the wrong context.
10883                  *
10884                  * By installing siblings first we NO-OP because they're not
10885                  * reachable through the group lists.
10886                  */
10887                 for_each_sibling_event(sibling, group_leader) {
10888                         perf_event__state_init(sibling);
10889                         perf_install_in_context(ctx, sibling, sibling->cpu);
10890                         get_ctx(ctx);
10891                 }
10892
10893                 /*
10894                  * Removing from the context ends up with disabled
10895                  * event. What we want here is event in the initial
10896                  * startup state, ready to be add into new context.
10897                  */
10898                 perf_event__state_init(group_leader);
10899                 perf_install_in_context(ctx, group_leader, group_leader->cpu);
10900                 get_ctx(ctx);
10901         }
10902
10903         /*
10904          * Precalculate sample_data sizes; do while holding ctx::mutex such
10905          * that we're serialized against further additions and before
10906          * perf_install_in_context() which is the point the event is active and
10907          * can use these values.
10908          */
10909         perf_event__header_size(event);
10910         perf_event__id_header_size(event);
10911
10912         event->owner = current;
10913
10914         perf_install_in_context(ctx, event, event->cpu);
10915         perf_unpin_context(ctx);
10916
10917         if (move_group)
10918                 perf_event_ctx_unlock(group_leader, gctx);
10919         mutex_unlock(&ctx->mutex);
10920
10921         if (task) {
10922                 mutex_unlock(&task->signal->cred_guard_mutex);
10923                 put_task_struct(task);
10924         }
10925
10926         mutex_lock(&current->perf_event_mutex);
10927         list_add_tail(&event->owner_entry, &current->perf_event_list);
10928         mutex_unlock(&current->perf_event_mutex);
10929
10930         /*
10931          * Drop the reference on the group_event after placing the
10932          * new event on the sibling_list. This ensures destruction
10933          * of the group leader will find the pointer to itself in
10934          * perf_group_detach().
10935          */
10936         fdput(group);
10937         fd_install(event_fd, event_file);
10938         return event_fd;
10939
10940 err_locked:
10941         if (move_group)
10942                 perf_event_ctx_unlock(group_leader, gctx);
10943         mutex_unlock(&ctx->mutex);
10944 /* err_file: */
10945         fput(event_file);
10946 err_context:
10947         perf_unpin_context(ctx);
10948         put_ctx(ctx);
10949 err_alloc:
10950         /*
10951          * If event_file is set, the fput() above will have called ->release()
10952          * and that will take care of freeing the event.
10953          */
10954         if (!event_file)
10955                 free_event(event);
10956 err_cred:
10957         if (task)
10958                 mutex_unlock(&task->signal->cred_guard_mutex);
10959 err_task:
10960         if (task)
10961                 put_task_struct(task);
10962 err_group_fd:
10963         fdput(group);
10964 err_fd:
10965         put_unused_fd(event_fd);
10966         return err;
10967 }
10968
10969 /**
10970  * perf_event_create_kernel_counter
10971  *
10972  * @attr: attributes of the counter to create
10973  * @cpu: cpu in which the counter is bound
10974  * @task: task to profile (NULL for percpu)
10975  */
10976 struct perf_event *
10977 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
10978                                  struct task_struct *task,
10979                                  perf_overflow_handler_t overflow_handler,
10980                                  void *context)
10981 {
10982         struct perf_event_context *ctx;
10983         struct perf_event *event;
10984         int err;
10985
10986         /*
10987          * Get the target context (task or percpu):
10988          */
10989
10990         event = perf_event_alloc(attr, cpu, task, NULL, NULL,
10991                                  overflow_handler, context, -1);
10992         if (IS_ERR(event)) {
10993                 err = PTR_ERR(event);
10994                 goto err;
10995         }
10996
10997         /* Mark owner so we could distinguish it from user events. */
10998         event->owner = TASK_TOMBSTONE;
10999
11000         ctx = find_get_context(event->pmu, task, event);
11001         if (IS_ERR(ctx)) {
11002                 err = PTR_ERR(ctx);
11003                 goto err_free;
11004         }
11005
11006         WARN_ON_ONCE(ctx->parent_ctx);
11007         mutex_lock(&ctx->mutex);
11008         if (ctx->task == TASK_TOMBSTONE) {
11009                 err = -ESRCH;
11010                 goto err_unlock;
11011         }
11012
11013         if (!task) {
11014                 /*
11015                  * Check if the @cpu we're creating an event for is online.
11016                  *
11017                  * We use the perf_cpu_context::ctx::mutex to serialize against
11018                  * the hotplug notifiers. See perf_event_{init,exit}_cpu().
11019                  */
11020                 struct perf_cpu_context *cpuctx =
11021                         container_of(ctx, struct perf_cpu_context, ctx);
11022                 if (!cpuctx->online) {
11023                         err = -ENODEV;
11024                         goto err_unlock;
11025                 }
11026         }
11027
11028         if (!exclusive_event_installable(event, ctx)) {
11029                 err = -EBUSY;
11030                 goto err_unlock;
11031         }
11032
11033         perf_install_in_context(ctx, event, event->cpu);
11034         perf_unpin_context(ctx);
11035         mutex_unlock(&ctx->mutex);
11036
11037         return event;
11038
11039 err_unlock:
11040         mutex_unlock(&ctx->mutex);
11041         perf_unpin_context(ctx);
11042         put_ctx(ctx);
11043 err_free:
11044         free_event(event);
11045 err:
11046         return ERR_PTR(err);
11047 }
11048 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
11049
11050 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
11051 {
11052         struct perf_event_context *src_ctx;
11053         struct perf_event_context *dst_ctx;
11054         struct perf_event *event, *tmp;
11055         LIST_HEAD(events);
11056
11057         src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
11058         dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
11059
11060         /*
11061          * See perf_event_ctx_lock() for comments on the details
11062          * of swizzling perf_event::ctx.
11063          */
11064         mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
11065         list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
11066                                  event_entry) {
11067                 perf_remove_from_context(event, 0);
11068                 unaccount_event_cpu(event, src_cpu);
11069                 put_ctx(src_ctx);
11070                 list_add(&event->migrate_entry, &events);
11071         }
11072
11073         /*
11074          * Wait for the events to quiesce before re-instating them.
11075          */
11076         synchronize_rcu();
11077
11078         /*
11079          * Re-instate events in 2 passes.
11080          *
11081          * Skip over group leaders and only install siblings on this first
11082          * pass, siblings will not get enabled without a leader, however a
11083          * leader will enable its siblings, even if those are still on the old
11084          * context.
11085          */
11086         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
11087                 if (event->group_leader == event)
11088                         continue;
11089
11090                 list_del(&event->migrate_entry);
11091                 if (event->state >= PERF_EVENT_STATE_OFF)
11092                         event->state = PERF_EVENT_STATE_INACTIVE;
11093                 account_event_cpu(event, dst_cpu);
11094                 perf_install_in_context(dst_ctx, event, dst_cpu);
11095                 get_ctx(dst_ctx);
11096         }
11097
11098         /*
11099          * Once all the siblings are setup properly, install the group leaders
11100          * to make it go.
11101          */
11102         list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
11103                 list_del(&event->migrate_entry);
11104                 if (event->state >= PERF_EVENT_STATE_OFF)
11105                         event->state = PERF_EVENT_STATE_INACTIVE;
11106                 account_event_cpu(event, dst_cpu);
11107                 perf_install_in_context(dst_ctx, event, dst_cpu);
11108                 get_ctx(dst_ctx);
11109         }
11110         mutex_unlock(&dst_ctx->mutex);
11111         mutex_unlock(&src_ctx->mutex);
11112 }
11113 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
11114
11115 static void sync_child_event(struct perf_event *child_event,
11116                                struct task_struct *child)
11117 {
11118         struct perf_event *parent_event = child_event->parent;
11119         u64 child_val;
11120
11121         if (child_event->attr.inherit_stat)
11122                 perf_event_read_event(child_event, child);
11123
11124         child_val = perf_event_count(child_event);
11125
11126         /*
11127          * Add back the child's count to the parent's count:
11128          */
11129         atomic64_add(child_val, &parent_event->child_count);
11130         atomic64_add(child_event->total_time_enabled,
11131                      &parent_event->child_total_time_enabled);
11132         atomic64_add(child_event->total_time_running,
11133                      &parent_event->child_total_time_running);
11134 }
11135
11136 static void
11137 perf_event_exit_event(struct perf_event *child_event,
11138                       struct perf_event_context *child_ctx,
11139                       struct task_struct *child)
11140 {
11141         struct perf_event *parent_event = child_event->parent;
11142
11143         /*
11144          * Do not destroy the 'original' grouping; because of the context
11145          * switch optimization the original events could've ended up in a
11146          * random child task.
11147          *
11148          * If we were to destroy the original group, all group related
11149          * operations would cease to function properly after this random
11150          * child dies.
11151          *
11152          * Do destroy all inherited groups, we don't care about those
11153          * and being thorough is better.
11154          */
11155         raw_spin_lock_irq(&child_ctx->lock);
11156         WARN_ON_ONCE(child_ctx->is_active);
11157
11158         if (parent_event)
11159                 perf_group_detach(child_event);
11160         list_del_event(child_event, child_ctx);
11161         perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
11162         raw_spin_unlock_irq(&child_ctx->lock);
11163
11164         /*
11165          * Parent events are governed by their filedesc, retain them.
11166          */
11167         if (!parent_event) {
11168                 perf_event_wakeup(child_event);
11169                 return;
11170         }
11171         /*
11172          * Child events can be cleaned up.
11173          */
11174
11175         sync_child_event(child_event, child);
11176
11177         /*
11178          * Remove this event from the parent's list
11179          */
11180         WARN_ON_ONCE(parent_event->ctx->parent_ctx);
11181         mutex_lock(&parent_event->child_mutex);
11182         list_del_init(&child_event->child_list);
11183         mutex_unlock(&parent_event->child_mutex);
11184
11185         /*
11186          * Kick perf_poll() for is_event_hup().
11187          */
11188         perf_event_wakeup(parent_event);
11189         free_event(child_event);
11190         put_event(parent_event);
11191 }
11192
11193 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
11194 {
11195         struct perf_event_context *child_ctx, *clone_ctx = NULL;
11196         struct perf_event *child_event, *next;
11197
11198         WARN_ON_ONCE(child != current);
11199
11200         child_ctx = perf_pin_task_context(child, ctxn);
11201         if (!child_ctx)
11202                 return;
11203
11204         /*
11205          * In order to reduce the amount of tricky in ctx tear-down, we hold
11206          * ctx::mutex over the entire thing. This serializes against almost
11207          * everything that wants to access the ctx.
11208          *
11209          * The exception is sys_perf_event_open() /
11210          * perf_event_create_kernel_count() which does find_get_context()
11211          * without ctx::mutex (it cannot because of the move_group double mutex
11212          * lock thing). See the comments in perf_install_in_context().
11213          */
11214         mutex_lock(&child_ctx->mutex);
11215
11216         /*
11217          * In a single ctx::lock section, de-schedule the events and detach the
11218          * context from the task such that we cannot ever get it scheduled back
11219          * in.
11220          */
11221         raw_spin_lock_irq(&child_ctx->lock);
11222         task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx, EVENT_ALL);
11223
11224         /*
11225          * Now that the context is inactive, destroy the task <-> ctx relation
11226          * and mark the context dead.
11227          */
11228         RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
11229         put_ctx(child_ctx); /* cannot be last */
11230         WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
11231         put_task_struct(current); /* cannot be last */
11232
11233         clone_ctx = unclone_ctx(child_ctx);
11234         raw_spin_unlock_irq(&child_ctx->lock);
11235
11236         if (clone_ctx)
11237                 put_ctx(clone_ctx);
11238
11239         /*
11240          * Report the task dead after unscheduling the events so that we
11241          * won't get any samples after PERF_RECORD_EXIT. We can however still
11242          * get a few PERF_RECORD_READ events.
11243          */
11244         perf_event_task(child, child_ctx, 0);
11245
11246         list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
11247                 perf_event_exit_event(child_event, child_ctx, child);
11248
11249         mutex_unlock(&child_ctx->mutex);
11250
11251         put_ctx(child_ctx);
11252 }
11253
11254 /*
11255  * When a child task exits, feed back event values to parent events.
11256  *
11257  * Can be called with cred_guard_mutex held when called from
11258  * install_exec_creds().
11259  */
11260 void perf_event_exit_task(struct task_struct *child)
11261 {
11262         struct perf_event *event, *tmp;
11263         int ctxn;
11264
11265         mutex_lock(&child->perf_event_mutex);
11266         list_for_each_entry_safe(event, tmp, &child->perf_event_list,
11267                                  owner_entry) {
11268                 list_del_init(&event->owner_entry);
11269
11270                 /*
11271                  * Ensure the list deletion is visible before we clear
11272                  * the owner, closes a race against perf_release() where
11273                  * we need to serialize on the owner->perf_event_mutex.
11274                  */
11275                 smp_store_release(&event->owner, NULL);
11276         }
11277         mutex_unlock(&child->perf_event_mutex);
11278
11279         for_each_task_context_nr(ctxn)
11280                 perf_event_exit_task_context(child, ctxn);
11281
11282         /*
11283          * The perf_event_exit_task_context calls perf_event_task
11284          * with child's task_ctx, which generates EXIT events for
11285          * child contexts and sets child->perf_event_ctxp[] to NULL.
11286          * At this point we need to send EXIT events to cpu contexts.
11287          */
11288         perf_event_task(child, NULL, 0);
11289 }
11290
11291 static void perf_free_event(struct perf_event *event,
11292                             struct perf_event_context *ctx)
11293 {
11294         struct perf_event *parent = event->parent;
11295
11296         if (WARN_ON_ONCE(!parent))
11297                 return;
11298
11299         mutex_lock(&parent->child_mutex);
11300         list_del_init(&event->child_list);
11301         mutex_unlock(&parent->child_mutex);
11302
11303         put_event(parent);
11304
11305         raw_spin_lock_irq(&ctx->lock);
11306         perf_group_detach(event);
11307         list_del_event(event, ctx);
11308         raw_spin_unlock_irq(&ctx->lock);
11309         free_event(event);
11310 }
11311
11312 /*
11313  * Free a context as created by inheritance by perf_event_init_task() below,
11314  * used by fork() in case of fail.
11315  *
11316  * Even though the task has never lived, the context and events have been
11317  * exposed through the child_list, so we must take care tearing it all down.
11318  */
11319 void perf_event_free_task(struct task_struct *task)
11320 {
11321         struct perf_event_context *ctx;
11322         struct perf_event *event, *tmp;
11323         int ctxn;
11324
11325         for_each_task_context_nr(ctxn) {
11326                 ctx = task->perf_event_ctxp[ctxn];
11327                 if (!ctx)
11328                         continue;
11329
11330                 mutex_lock(&ctx->mutex);
11331                 raw_spin_lock_irq(&ctx->lock);
11332                 /*
11333                  * Destroy the task <-> ctx relation and mark the context dead.
11334                  *
11335                  * This is important because even though the task hasn't been
11336                  * exposed yet the context has been (through child_list).
11337                  */
11338                 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
11339                 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
11340                 put_task_struct(task); /* cannot be last */
11341                 raw_spin_unlock_irq(&ctx->lock);
11342
11343                 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry)
11344                         perf_free_event(event, ctx);
11345
11346                 mutex_unlock(&ctx->mutex);
11347
11348                 /*
11349                  * perf_event_release_kernel() could've stolen some of our
11350                  * child events and still have them on its free_list. In that
11351                  * case we must wait for these events to have been freed (in
11352                  * particular all their references to this task must've been
11353                  * dropped).
11354                  *
11355                  * Without this copy_process() will unconditionally free this
11356                  * task (irrespective of its reference count) and
11357                  * _free_event()'s put_task_struct(event->hw.target) will be a
11358                  * use-after-free.
11359                  *
11360                  * Wait for all events to drop their context reference.
11361                  */
11362                 wait_var_event(&ctx->refcount, atomic_read(&ctx->refcount) == 1);
11363                 put_ctx(ctx); /* must be last */
11364         }
11365 }
11366
11367 void perf_event_delayed_put(struct task_struct *task)
11368 {
11369         int ctxn;
11370
11371         for_each_task_context_nr(ctxn)
11372                 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
11373 }
11374
11375 struct file *perf_event_get(unsigned int fd)
11376 {
11377         struct file *file;
11378
11379         file = fget_raw(fd);
11380         if (!file)
11381                 return ERR_PTR(-EBADF);
11382
11383         if (file->f_op != &perf_fops) {
11384                 fput(file);
11385                 return ERR_PTR(-EBADF);
11386         }
11387
11388         return file;
11389 }
11390
11391 const struct perf_event *perf_get_event(struct file *file)
11392 {
11393         if (file->f_op != &perf_fops)
11394                 return ERR_PTR(-EINVAL);
11395
11396         return file->private_data;
11397 }
11398
11399 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
11400 {
11401         if (!event)
11402                 return ERR_PTR(-EINVAL);
11403
11404         return &event->attr;
11405 }
11406
11407 /*
11408  * Inherit an event from parent task to child task.
11409  *
11410  * Returns:
11411  *  - valid pointer on success
11412  *  - NULL for orphaned events
11413  *  - IS_ERR() on error
11414  */
11415 static struct perf_event *
11416 inherit_event(struct perf_event *parent_event,
11417               struct task_struct *parent,
11418               struct perf_event_context *parent_ctx,
11419               struct task_struct *child,
11420               struct perf_event *group_leader,
11421               struct perf_event_context *child_ctx)
11422 {
11423         enum perf_event_state parent_state = parent_event->state;
11424         struct perf_event *child_event;
11425         unsigned long flags;
11426
11427         /*
11428          * Instead of creating recursive hierarchies of events,
11429          * we link inherited events back to the original parent,
11430          * which has a filp for sure, which we use as the reference
11431          * count:
11432          */
11433         if (parent_event->parent)
11434                 parent_event = parent_event->parent;
11435
11436         child_event = perf_event_alloc(&parent_event->attr,
11437                                            parent_event->cpu,
11438                                            child,
11439                                            group_leader, parent_event,
11440                                            NULL, NULL, -1);
11441         if (IS_ERR(child_event))
11442                 return child_event;
11443
11444
11445         if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) &&
11446             !child_ctx->task_ctx_data) {
11447                 struct pmu *pmu = child_event->pmu;
11448
11449                 child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
11450                                                    GFP_KERNEL);
11451                 if (!child_ctx->task_ctx_data) {
11452                         free_event(child_event);
11453                         return ERR_PTR(-ENOMEM);
11454                 }
11455         }
11456
11457         /*
11458          * is_orphaned_event() and list_add_tail(&parent_event->child_list)
11459          * must be under the same lock in order to serialize against
11460          * perf_event_release_kernel(), such that either we must observe
11461          * is_orphaned_event() or they will observe us on the child_list.
11462          */
11463         mutex_lock(&parent_event->child_mutex);
11464         if (is_orphaned_event(parent_event) ||
11465             !atomic_long_inc_not_zero(&parent_event->refcount)) {
11466                 mutex_unlock(&parent_event->child_mutex);
11467                 /* task_ctx_data is freed with child_ctx */
11468                 free_event(child_event);
11469                 return NULL;
11470         }
11471
11472         get_ctx(child_ctx);
11473
11474         /*
11475          * Make the child state follow the state of the parent event,
11476          * not its attr.disabled bit.  We hold the parent's mutex,
11477          * so we won't race with perf_event_{en, dis}able_family.
11478          */
11479         if (parent_state >= PERF_EVENT_STATE_INACTIVE)
11480                 child_event->state = PERF_EVENT_STATE_INACTIVE;
11481         else
11482                 child_event->state = PERF_EVENT_STATE_OFF;
11483
11484         if (parent_event->attr.freq) {
11485                 u64 sample_period = parent_event->hw.sample_period;
11486                 struct hw_perf_event *hwc = &child_event->hw;
11487
11488                 hwc->sample_period = sample_period;
11489                 hwc->last_period   = sample_period;
11490
11491                 local64_set(&hwc->period_left, sample_period);
11492         }
11493
11494         child_event->ctx = child_ctx;
11495         child_event->overflow_handler = parent_event->overflow_handler;
11496         child_event->overflow_handler_context
11497                 = parent_event->overflow_handler_context;
11498
11499         /*
11500          * Precalculate sample_data sizes
11501          */
11502         perf_event__header_size(child_event);
11503         perf_event__id_header_size(child_event);
11504
11505         /*
11506          * Link it up in the child's context:
11507          */
11508         raw_spin_lock_irqsave(&child_ctx->lock, flags);
11509         add_event_to_ctx(child_event, child_ctx);
11510         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
11511
11512         /*
11513          * Link this into the parent event's child list
11514          */
11515         list_add_tail(&child_event->child_list, &parent_event->child_list);
11516         mutex_unlock(&parent_event->child_mutex);
11517
11518         return child_event;
11519 }
11520
11521 /*
11522  * Inherits an event group.
11523  *
11524  * This will quietly suppress orphaned events; !inherit_event() is not an error.
11525  * This matches with perf_event_release_kernel() removing all child events.
11526  *
11527  * Returns:
11528  *  - 0 on success
11529  *  - <0 on error
11530  */
11531 static int inherit_group(struct perf_event *parent_event,
11532               struct task_struct *parent,
11533               struct perf_event_context *parent_ctx,
11534               struct task_struct *child,
11535               struct perf_event_context *child_ctx)
11536 {
11537         struct perf_event *leader;
11538         struct perf_event *sub;
11539         struct perf_event *child_ctr;
11540
11541         leader = inherit_event(parent_event, parent, parent_ctx,
11542                                  child, NULL, child_ctx);
11543         if (IS_ERR(leader))
11544                 return PTR_ERR(leader);
11545         /*
11546          * @leader can be NULL here because of is_orphaned_event(). In this
11547          * case inherit_event() will create individual events, similar to what
11548          * perf_group_detach() would do anyway.
11549          */
11550         for_each_sibling_event(sub, parent_event) {
11551                 child_ctr = inherit_event(sub, parent, parent_ctx,
11552                                             child, leader, child_ctx);
11553                 if (IS_ERR(child_ctr))
11554                         return PTR_ERR(child_ctr);
11555         }
11556         return 0;
11557 }
11558
11559 /*
11560  * Creates the child task context and tries to inherit the event-group.
11561  *
11562  * Clears @inherited_all on !attr.inherited or error. Note that we'll leave
11563  * inherited_all set when we 'fail' to inherit an orphaned event; this is
11564  * consistent with perf_event_release_kernel() removing all child events.
11565  *
11566  * Returns:
11567  *  - 0 on success
11568  *  - <0 on error
11569  */
11570 static int
11571 inherit_task_group(struct perf_event *event, struct task_struct *parent,
11572                    struct perf_event_context *parent_ctx,
11573                    struct task_struct *child, int ctxn,
11574                    int *inherited_all)
11575 {
11576         int ret;
11577         struct perf_event_context *child_ctx;
11578
11579         if (!event->attr.inherit) {
11580                 *inherited_all = 0;
11581                 return 0;
11582         }
11583
11584         child_ctx = child->perf_event_ctxp[ctxn];
11585         if (!child_ctx) {
11586                 /*
11587                  * This is executed from the parent task context, so
11588                  * inherit events that have been marked for cloning.
11589                  * First allocate and initialize a context for the
11590                  * child.
11591                  */
11592                 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
11593                 if (!child_ctx)
11594                         return -ENOMEM;
11595
11596                 child->perf_event_ctxp[ctxn] = child_ctx;
11597         }
11598
11599         ret = inherit_group(event, parent, parent_ctx,
11600                             child, child_ctx);
11601
11602         if (ret)
11603                 *inherited_all = 0;
11604
11605         return ret;
11606 }
11607
11608 /*
11609  * Initialize the perf_event context in task_struct
11610  */
11611 static int perf_event_init_context(struct task_struct *child, int ctxn)
11612 {
11613         struct perf_event_context *child_ctx, *parent_ctx;
11614         struct perf_event_context *cloned_ctx;
11615         struct perf_event *event;
11616         struct task_struct *parent = current;
11617         int inherited_all = 1;
11618         unsigned long flags;
11619         int ret = 0;
11620
11621         if (likely(!parent->perf_event_ctxp[ctxn]))
11622                 return 0;
11623
11624         /*
11625          * If the parent's context is a clone, pin it so it won't get
11626          * swapped under us.
11627          */
11628         parent_ctx = perf_pin_task_context(parent, ctxn);
11629         if (!parent_ctx)
11630                 return 0;
11631
11632         /*
11633          * No need to check if parent_ctx != NULL here; since we saw
11634          * it non-NULL earlier, the only reason for it to become NULL
11635          * is if we exit, and since we're currently in the middle of
11636          * a fork we can't be exiting at the same time.
11637          */
11638
11639         /*
11640          * Lock the parent list. No need to lock the child - not PID
11641          * hashed yet and not running, so nobody can access it.
11642          */
11643         mutex_lock(&parent_ctx->mutex);
11644
11645         /*
11646          * We dont have to disable NMIs - we are only looking at
11647          * the list, not manipulating it:
11648          */
11649         perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
11650                 ret = inherit_task_group(event, parent, parent_ctx,
11651                                          child, ctxn, &inherited_all);
11652                 if (ret)
11653                         goto out_unlock;
11654         }
11655
11656         /*
11657          * We can't hold ctx->lock when iterating the ->flexible_group list due
11658          * to allocations, but we need to prevent rotation because
11659          * rotate_ctx() will change the list from interrupt context.
11660          */
11661         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11662         parent_ctx->rotate_disable = 1;
11663         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11664
11665         perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
11666                 ret = inherit_task_group(event, parent, parent_ctx,
11667                                          child, ctxn, &inherited_all);
11668                 if (ret)
11669                         goto out_unlock;
11670         }
11671
11672         raw_spin_lock_irqsave(&parent_ctx->lock, flags);
11673         parent_ctx->rotate_disable = 0;
11674
11675         child_ctx = child->perf_event_ctxp[ctxn];
11676
11677         if (child_ctx && inherited_all) {
11678                 /*
11679                  * Mark the child context as a clone of the parent
11680                  * context, or of whatever the parent is a clone of.
11681                  *
11682                  * Note that if the parent is a clone, the holding of
11683                  * parent_ctx->lock avoids it from being uncloned.
11684                  */
11685                 cloned_ctx = parent_ctx->parent_ctx;
11686                 if (cloned_ctx) {
11687                         child_ctx->parent_ctx = cloned_ctx;
11688                         child_ctx->parent_gen = parent_ctx->parent_gen;
11689                 } else {
11690                         child_ctx->parent_ctx = parent_ctx;
11691                         child_ctx->parent_gen = parent_ctx->generation;
11692                 }
11693                 get_ctx(child_ctx->parent_ctx);
11694         }
11695
11696         raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
11697 out_unlock:
11698         mutex_unlock(&parent_ctx->mutex);
11699
11700         perf_unpin_context(parent_ctx);
11701         put_ctx(parent_ctx);
11702
11703         return ret;
11704 }
11705
11706 /*
11707  * Initialize the perf_event context in task_struct
11708  */
11709 int perf_event_init_task(struct task_struct *child)
11710 {
11711         int ctxn, ret;
11712
11713         memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
11714         mutex_init(&child->perf_event_mutex);
11715         INIT_LIST_HEAD(&child->perf_event_list);
11716
11717         for_each_task_context_nr(ctxn) {
11718                 ret = perf_event_init_context(child, ctxn);
11719                 if (ret) {
11720                         perf_event_free_task(child);
11721                         return ret;
11722                 }
11723         }
11724
11725         return 0;
11726 }
11727
11728 static void __init perf_event_init_all_cpus(void)
11729 {
11730         struct swevent_htable *swhash;
11731         int cpu;
11732
11733         zalloc_cpumask_var(&perf_online_mask, GFP_KERNEL);
11734
11735         for_each_possible_cpu(cpu) {
11736                 swhash = &per_cpu(swevent_htable, cpu);
11737                 mutex_init(&swhash->hlist_mutex);
11738                 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
11739
11740                 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
11741                 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
11742
11743 #ifdef CONFIG_CGROUP_PERF
11744                 INIT_LIST_HEAD(&per_cpu(cgrp_cpuctx_list, cpu));
11745 #endif
11746                 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
11747         }
11748 }
11749
11750 void perf_swevent_init_cpu(unsigned int cpu)
11751 {
11752         struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
11753
11754         mutex_lock(&swhash->hlist_mutex);
11755         if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
11756                 struct swevent_hlist *hlist;
11757
11758                 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
11759                 WARN_ON(!hlist);
11760                 rcu_assign_pointer(swhash->swevent_hlist, hlist);
11761         }
11762         mutex_unlock(&swhash->hlist_mutex);
11763 }
11764
11765 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
11766 static void __perf_event_exit_context(void *__info)
11767 {
11768         struct perf_event_context *ctx = __info;
11769         struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
11770         struct perf_event *event;
11771
11772         raw_spin_lock(&ctx->lock);
11773         ctx_sched_out(ctx, cpuctx, EVENT_TIME);
11774         list_for_each_entry(event, &ctx->event_list, event_entry)
11775                 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
11776         raw_spin_unlock(&ctx->lock);
11777 }
11778
11779 static void perf_event_exit_cpu_context(int cpu)
11780 {
11781         struct perf_cpu_context *cpuctx;
11782         struct perf_event_context *ctx;
11783         struct pmu *pmu;
11784
11785         mutex_lock(&pmus_lock);
11786         list_for_each_entry(pmu, &pmus, entry) {
11787                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11788                 ctx = &cpuctx->ctx;
11789
11790                 mutex_lock(&ctx->mutex);
11791                 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
11792                 cpuctx->online = 0;
11793                 mutex_unlock(&ctx->mutex);
11794         }
11795         cpumask_clear_cpu(cpu, perf_online_mask);
11796         mutex_unlock(&pmus_lock);
11797 }
11798 #else
11799
11800 static void perf_event_exit_cpu_context(int cpu) { }
11801
11802 #endif
11803
11804 int perf_event_init_cpu(unsigned int cpu)
11805 {
11806         struct perf_cpu_context *cpuctx;
11807         struct perf_event_context *ctx;
11808         struct pmu *pmu;
11809
11810         perf_swevent_init_cpu(cpu);
11811
11812         mutex_lock(&pmus_lock);
11813         cpumask_set_cpu(cpu, perf_online_mask);
11814         list_for_each_entry(pmu, &pmus, entry) {
11815                 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
11816                 ctx = &cpuctx->ctx;
11817
11818                 mutex_lock(&ctx->mutex);
11819                 cpuctx->online = 1;
11820                 mutex_unlock(&ctx->mutex);
11821         }
11822         mutex_unlock(&pmus_lock);
11823
11824         return 0;
11825 }
11826
11827 int perf_event_exit_cpu(unsigned int cpu)
11828 {
11829         perf_event_exit_cpu_context(cpu);
11830         return 0;
11831 }
11832
11833 static int
11834 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
11835 {
11836         int cpu;
11837
11838         for_each_online_cpu(cpu)
11839                 perf_event_exit_cpu(cpu);
11840
11841         return NOTIFY_OK;
11842 }
11843
11844 /*
11845  * Run the perf reboot notifier at the very last possible moment so that
11846  * the generic watchdog code runs as long as possible.
11847  */
11848 static struct notifier_block perf_reboot_notifier = {
11849         .notifier_call = perf_reboot,
11850         .priority = INT_MIN,
11851 };
11852
11853 void __init perf_event_init(void)
11854 {
11855         int ret;
11856
11857         idr_init(&pmu_idr);
11858
11859         perf_event_init_all_cpus();
11860         init_srcu_struct(&pmus_srcu);
11861         perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
11862         perf_pmu_register(&perf_cpu_clock, NULL, -1);
11863         perf_pmu_register(&perf_task_clock, NULL, -1);
11864         perf_tp_register();
11865         perf_event_init_cpu(smp_processor_id());
11866         register_reboot_notifier(&perf_reboot_notifier);
11867
11868         ret = init_hw_breakpoint();
11869         WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
11870
11871         /*
11872          * Build time assertion that we keep the data_head at the intended
11873          * location.  IOW, validation we got the __reserved[] size right.
11874          */
11875         BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
11876                      != 1024);
11877 }
11878
11879 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
11880                               char *page)
11881 {
11882         struct perf_pmu_events_attr *pmu_attr =
11883                 container_of(attr, struct perf_pmu_events_attr, attr);
11884
11885         if (pmu_attr->event_str)
11886                 return sprintf(page, "%s\n", pmu_attr->event_str);
11887
11888         return 0;
11889 }
11890 EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
11891
11892 static int __init perf_event_sysfs_init(void)
11893 {
11894         struct pmu *pmu;
11895         int ret;
11896
11897         mutex_lock(&pmus_lock);
11898
11899         ret = bus_register(&pmu_bus);
11900         if (ret)
11901                 goto unlock;
11902
11903         list_for_each_entry(pmu, &pmus, entry) {
11904                 if (!pmu->name || pmu->type < 0)
11905                         continue;
11906
11907                 ret = pmu_dev_alloc(pmu);
11908                 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
11909         }
11910         pmu_bus_running = 1;
11911         ret = 0;
11912
11913 unlock:
11914         mutex_unlock(&pmus_lock);
11915
11916         return ret;
11917 }
11918 device_initcall(perf_event_sysfs_init);
11919
11920 #ifdef CONFIG_CGROUP_PERF
11921 static struct cgroup_subsys_state *
11922 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
11923 {
11924         struct perf_cgroup *jc;
11925
11926         jc = kzalloc(sizeof(*jc), GFP_KERNEL);
11927         if (!jc)
11928                 return ERR_PTR(-ENOMEM);
11929
11930         jc->info = alloc_percpu(struct perf_cgroup_info);
11931         if (!jc->info) {
11932                 kfree(jc);
11933                 return ERR_PTR(-ENOMEM);
11934         }
11935
11936         return &jc->css;
11937 }
11938
11939 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
11940 {
11941         struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
11942
11943         free_percpu(jc->info);
11944         kfree(jc);
11945 }
11946
11947 static int __perf_cgroup_move(void *info)
11948 {
11949         struct task_struct *task = info;
11950         rcu_read_lock();
11951         perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
11952         rcu_read_unlock();
11953         return 0;
11954 }
11955
11956 static void perf_cgroup_attach(struct cgroup_taskset *tset)
11957 {
11958         struct task_struct *task;
11959         struct cgroup_subsys_state *css;
11960
11961         cgroup_taskset_for_each(task, css, tset)
11962                 task_function_call(task, __perf_cgroup_move, task);
11963 }
11964
11965 struct cgroup_subsys perf_event_cgrp_subsys = {
11966         .css_alloc      = perf_cgroup_css_alloc,
11967         .css_free       = perf_cgroup_css_free,
11968         .attach         = perf_cgroup_attach,
11969         /*
11970          * Implicitly enable on dfl hierarchy so that perf events can
11971          * always be filtered by cgroup2 path as long as perf_event
11972          * controller is not mounted on a legacy hierarchy.
11973          */
11974         .implicit_on_dfl = true,
11975         .threaded       = true,
11976 };
11977 #endif /* CONFIG_CGROUP_PERF */