Merge tag 'v4.4.214' into 10
[sagit-ice-cold/kernel_xiaomi_msm8998.git]

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 203384a..aad3f61 100644
@@ -158,6 +158,7 @@ enum event_type_t {
 struct static_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
+static DEFINE_PER_CPU(bool, is_idle);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -175,8 +176,15 @@ static struct srcu_struct pmus_srcu;
  *   0 - disallow raw tracepoint access for unpriv
  *   1 - disallow cpu events for unpriv
  *   2 - disallow kernel profiling for unpriv
+ *   3 - disallow all unpriv perf event use
  */
+#ifdef CONFIG_PERF_EVENTS_USERMODE
+int sysctl_perf_event_paranoid __read_mostly = -1;
+#elif defined CONFIG_SECURITY_PERF_EVENTS_RESTRICT
+int sysctl_perf_event_paranoid __read_mostly = 3;
+#else
 int sysctl_perf_event_paranoid __read_mostly = 1;
+#endif
 
 /* Minimum for 512 kiB + 1 user control page */
 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
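
The default is now Kconfig-dependent: -1 with CONFIG_PERF_EVENTS_USERMODE, 3 with CONFIG_SECURITY_PERF_EVENTS_RESTRICT, and the usual 1 otherwise. The quickest way to see what a given kernel enforces is to read the sysctl from userspace; the interpretation below follows the table in the comment above, and the level-3 meaning is an assumption that only holds on kernels carrying this change.

/* Sketch: read the paranoia level and report the strongest restriction in
 * force (each level also implies the ones below it). The meaning of level
 * 3 is specific to kernels carrying this patch. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/perf_event_paranoid", "r");
	int level;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d", &level) != 1) {
		fclose(f);
		fprintf(stderr, "unexpected sysctl contents\n");
		return 1;
	}
	fclose(f);

	printf("perf_event_paranoid = %d\n", level);
	if (level >= 3)
		puts("unprivileged perf_event_open() is refused outright");
	else if (level >= 2)
		puts("unprivileged kernel profiling is disallowed");
	else if (level >= 1)
		puts("unprivileged CPU-wide events are disallowed");
	else if (level >= 0)
		puts("unprivileged raw tracepoint access is disallowed");
	else
		puts("no restrictions on unprivileged use");
	return 0;
}
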
@@ -1492,10 +1500,17 @@ static void perf_group_detach(struct perf_event *event)
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
         * to whatever list we are on.
+        * If this isn't on a list, make sure we still remove the sibling's
+        * group_entry from this sibling_list; otherwise, when that sibling
+        * is later deallocated, it will try to remove itself from this
+        * sibling_list, which may well have been deallocated already,
+        * resulting in a use-after-free.
         */
        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
                if (list)
                        list_move_tail(&sibling->group_entry, list);
+               else
+                       list_del_init(&sibling->group_entry);
                sibling->group_leader = sibling;
 
                /* Inherit group flags from the previous leader */
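
The fix relies on the difference between list_del() and list_del_init(): the latter re-initialises the removed entry to point at itself, so when the orphaned sibling is freed later, its own unlink touches only itself rather than the leader's (possibly already freed) sibling_list. A small userspace mock of the <linux/list.h> helpers shows the effect; these are simplified re-implementations for illustration, not the kernel's own code.

/* Userspace mock of the list helpers used above (simplified: no poisoning,
 * no READ_ONCE/WRITE_ONCE). */
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *e, struct list_head *h)
{
	e->prev = h->prev;
	e->next = h;
	h->prev->next = e;
	h->prev = e;
}

static void list_del(struct list_head *e)
{
	e->next->prev = e->prev;
	e->prev->next = e->next;
	/* e->next and e->prev still point at the old neighbours */
}

static void list_del_init(struct list_head *e)
{
	list_del(e);
	INIT_LIST_HEAD(e);	/* e now refers only to itself */
}

int main(void)
{
	struct list_head sibling_list, sib;

	INIT_LIST_HEAD(&sibling_list);
	list_add_tail(&sib, &sibling_list);

	/* Leader torn down with list == NULL: unlink the sibling. */
	list_del_init(&sib);

	/* sibling_list (and the leader that contains it) may be freed here. */

	/* When the sibling itself is freed later, its unlink only touches
	 * &sib, because list_del_init() left it self-referencing; skipping
	 * the unlink above would make this step dereference freed memory. */
	list_del(&sib);
	printf("sibling is self-contained: %d\n",
	       sib.next == &sib && sib.prev == &sib);
	return 0;
}
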
@@ -1690,7 +1705,32 @@ static int __perf_remove_from_context(void *info)
 }
 
 
-/*
+#ifdef CONFIG_SMP
+static void perf_retry_remove(struct perf_event *event,
+                             struct remove_event *rep)
+{
+       int up_ret;
+       /*
+        * CPU was offline. Bring it online so we can
+        * gracefully exit a perf context.
+        */
+       up_ret = cpu_up(event->cpu);
+       if (!up_ret)
+               /* Try the remove call once again. */
+               cpu_function_call(event->cpu, __perf_remove_from_context,
+                                 rep);
+       else
+               pr_err("Failed to bring up CPU: %d, ret: %d\n",
+                      event->cpu, up_ret);
+}
+#else
+static void perf_retry_remove(struct perf_event *event,
+                             struct remove_event *rep)
+{
+}
+#endif
+
+ /*
  * Remove the event from a task's (or a CPU's) list of events.
  *
  * CPU events are removed with a smp call. For task events we only
@@ -1703,7 +1743,8 @@ static int __perf_remove_from_context(void *info)
  * When called from perf_event_exit_task, it's OK because the
  * context has been detached from its task.
  */
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void __ref perf_remove_from_context(struct perf_event *event,
+                                          bool detach_group)
 {
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;
@@ -1711,6 +1752,7 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group
                .event = event,
                .detach_group = detach_group,
        };
+       int ret;
 
        lockdep_assert_held(&ctx->mutex);
 
@@ -1721,7 +1763,11 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group
                 * already called __perf_remove_from_context from
                 * perf_event_exit_cpu.
                 */
-               cpu_function_call(event->cpu, __perf_remove_from_context, &re);
+               ret = cpu_function_call(event->cpu, __perf_remove_from_context,
+                                       &re);
+               if (ret == -ENXIO)
+                       perf_retry_remove(event, &re);
+
                return;
        }
 
@@ -1917,8 +1963,13 @@ event_sched_in(struct perf_event *event,
        if (event->state <= PERF_EVENT_STATE_OFF)
                return 0;
 
-       event->state = PERF_EVENT_STATE_ACTIVE;
-       event->oncpu = smp_processor_id();
+       WRITE_ONCE(event->oncpu, smp_processor_id());
+       /*
+        * Order event::oncpu write to happen before the ACTIVE state
+        * is visible.
+        */
+       smp_wmb();
+       WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
 
        /*
         * Unthrottle events, since we scheduled we might have missed several
@@ -2399,6 +2450,29 @@ void perf_event_enable(struct perf_event *event)
 }
 EXPORT_SYMBOL_GPL(perf_event_enable);
 
+static int __perf_event_stop(void *info)
+{
+       struct perf_event *event = info;
+
+       /* for AUX events, our job is done if the event is already inactive */
+       if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+               return 0;
+
+       /* matches smp_wmb() in event_sched_in() */
+       smp_rmb();
+
+       /*
+        * There is a window with interrupts enabled before we get here,
+        * so we need to check again lest we try to stop another CPU's event.
+        */
+       if (READ_ONCE(event->oncpu) != smp_processor_id())
+               return -EAGAIN;
+
+       event->pmu->stop(event, PERF_EF_UPDATE);
+
+       return 0;
+}
+
 static int _perf_event_refresh(struct perf_event *event, int refresh)
 {
        /*
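
The smp_wmb() added to event_sched_in() pairs with the smp_rmb() in __perf_event_stop(): the scheduling side publishes event->oncpu before event->state becomes ACTIVE, and the stop side reads state before trusting oncpu, so observing ACTIVE implies oncpu is current. The same publish/consume shape in plain userspace C11 fences, as a sketch of the pattern rather than of the kernel internals:

#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

/* "oncpu" and "state" stand in for event->oncpu / event->state. */
static atomic_int oncpu = -1;
static atomic_int state;		/* 0 = inactive, 1 = ACTIVE */

static void *writer(void *arg)
{
	(void)arg;
	atomic_store_explicit(&oncpu, 3, memory_order_relaxed);
	/* counterpart of smp_wmb(): order the oncpu store before state */
	atomic_thread_fence(memory_order_release);
	atomic_store_explicit(&state, 1, memory_order_relaxed);
	return NULL;
}

static void *reader(void *arg)
{
	(void)arg;
	if (atomic_load_explicit(&state, memory_order_relaxed) != 1)
		return NULL;			/* nothing to stop */
	/* counterpart of smp_rmb(): order the state load before oncpu */
	atomic_thread_fence(memory_order_acquire);
	printf("ACTIVE on cpu %d\n",
	       atomic_load_explicit(&oncpu, memory_order_relaxed));
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;
}
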
@@ -3374,21 +3448,30 @@ u64 perf_event_read_local(struct perf_event *event)
 
 static int perf_event_read(struct perf_event *event, bool group)
 {
-       int ret = 0;
+       int event_cpu, ret = 0;
 
        /*
         * If event is enabled and currently active on a CPU, update the
         * value in the event structure:
         */
-       if (event->state == PERF_EVENT_STATE_ACTIVE) {
+       event_cpu = READ_ONCE(event->oncpu);
+
+       if (event->state == PERF_EVENT_STATE_ACTIVE &&
+                                               !cpu_isolated(event_cpu)) {
                struct perf_read_data data = {
                        .event = event,
                        .group = group,
                        .ret = 0,
                };
-               smp_call_function_single(event->oncpu,
-                                        __perf_event_read, &data, 1);
-               ret = data.ret;
+
+               if ((unsigned int)event_cpu >= nr_cpu_ids)
+                       return 0;
+               if (!event->attr.exclude_idle ||
+                                       !per_cpu(is_idle, event_cpu)) {
+                       smp_call_function_single(event_cpu,
+                               __perf_event_read, &data, 1);
+                       ret = data.ret;
+               }
        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;
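
perf_event_read() is what backs an ordinary read() on a perf fd: if the event is ACTIVE on another CPU it sends an IPI there for a fresh count, and with this change it skips isolated CPUs and, for exclude_idle events, CPUs the idle notifier has flagged. From userspace all of that sits behind perf_event_open() plus read(); a minimal self-monitoring counter (assuming perf support is built in and the paranoid level allows it):

#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <stdio.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu,
			   int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_TASK_CLOCK;
	attr.disabled = 1;
	attr.exclude_kernel = 1;	/* stay clear of paranoid >= 2 */

	fd = perf_event_open(&attr, 0 /* this task */, -1 /* any cpu */, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	for (volatile int i = 0; i < 10000000; i++)
		;			/* burn some CPU time */
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("task clock: %llu ns\n", (unsigned long long)count);
	close(fd);
	return 0;
}
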
@@ -3483,7 +3566,8 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
 
        if (!task) {
                /* Must be root to operate on a CPU event: */
-               if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+               if (event->owner != EVENT_OWNER_KERNEL && perf_paranoid_cpu() &&
+                       !capable(CAP_SYS_ADMIN))
                        return ERR_PTR(-EACCES);
 
                /*
@@ -3724,6 +3808,9 @@ static void __free_event(struct perf_event *event)
        if (event->destroy)
                event->destroy(event);
 
+       if (event->pmu->free_drv_configs)
+               event->pmu->free_drv_configs(event);
+
        if (event->ctx)
                put_ctx(event->ctx);
 
@@ -3867,6 +3954,15 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
  */
 static int perf_release(struct inode *inode, struct file *file)
 {
+       struct perf_event *event = file->private_data;
+
+       /*
+        * Event can be in state OFF because of a constraint check.
+        * Change to ACTIVE so that it gets cleaned up correctly.
+        */
+       if ((event->state == PERF_EVENT_STATE_OFF) &&
+           event->attr.constraint_duplicate)
+               event->state = PERF_EVENT_STATE_ACTIVE;
        put_event(file->private_data);
        return 0;
 }
@@ -4276,6 +4372,8 @@ static int perf_event_set_output(struct perf_event *event,
                                 struct perf_event *output_event);
 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
+static int perf_event_drv_configs(struct perf_event *event,
+                                 void __user *arg);
 
 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
 {
@@ -4332,6 +4430,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
        case PERF_EVENT_IOC_SET_BPF:
                return perf_event_set_bpf_prog(event, arg);
 
+       case PERF_EVENT_IOC_SET_DRV_CONFIGS:
+               return perf_event_drv_configs(event, (void __user *)arg);
+
        default:
                return -ENOTTY;
        }
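
PERF_EVENT_IOC_SET_DRV_CONFIGS hands an opaque, driver-defined argument to the PMU behind the event; the kernel side simply forwards it to pmu->get_drv_configs() and returns -EINVAL when the PMU has no such callback. The ioctl number comes from the uapi header change that accompanies this patch (not shown in this diff), and the payload format is whatever the driver expects, so treat the sketch below as an assumption about the interface, with a made-up helper name.

/* Sketch: forward a driver-specific configuration to the PMU behind an
 * event fd. PERF_EVENT_IOC_SET_DRV_CONFIGS is defined by the companion
 * uapi change (not part of this diff); the string payload is illustrative
 * only -- the kernel passes it through to the PMU's get_drv_configs()
 * callback untouched. */
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <stdio.h>

static int set_drv_configs(int perf_fd, const char *cfg)
{
#ifdef PERF_EVENT_IOC_SET_DRV_CONFIGS
	if (ioctl(perf_fd, PERF_EVENT_IOC_SET_DRV_CONFIGS, cfg) < 0) {
		perror("PERF_EVENT_IOC_SET_DRV_CONFIGS");
		return -1;	/* -EINVAL if the PMU lacks get_drv_configs() */
	}
	return 0;
#else
	(void)perf_fd;
	(void)cfg;
	fprintf(stderr, "headers do not define PERF_EVENT_IOC_SET_DRV_CONFIGS\n");
	return -1;
#endif
}
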
@@ -4364,6 +4465,7 @@ static long perf_compat_ioctl(struct file *file, unsigned int cmd,
        switch (_IOC_NR(cmd)) {
        case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
        case _IOC_NR(PERF_EVENT_IOC_ID):
+       case _IOC_NR(PERF_EVENT_IOC_SET_DRV_CONFIGS):
                /* Fix up pointer size (usually 4 -> 8 in 32-on-64-bit case) */
                if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
                        cmd &= ~IOCSIZE_MASK;
@@ -4648,6 +4750,8 @@ static void perf_mmap_open(struct vm_area_struct *vma)
                event->pmu->event_mapped(event);
 }
 
+static void perf_pmu_output_stop(struct perf_event *event);
+
 /*
  * A buffer can be mmap()ed multiple times; either directly through the same
  * event, or through other events by use of perf_event_set_output().
@@ -4675,10 +4779,22 @@ static void perf_mmap_close(struct vm_area_struct *vma)
         */
        if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
            atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+               /*
+                * Stop all AUX events that are writing to this buffer,
+                * so that we can free its AUX pages and corresponding PMU
+                * data. Note that after rb::aux_mmap_count dropped to zero,
+                * they won't start any more (see perf_aux_output_begin()).
+                */
+               perf_pmu_output_stop(event);
+
+               /* now it's safe to free the pages */
                atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
                vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
 
+               /* this has to be the last one */
                rb_free_aux(rb);
+               WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
+
                mutex_unlock(&event->mmap_mutex);
        }
 
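
This path runs when the last mapping of the AUX area goes away. On the userspace side the AUX area is the optional second mmap() region described by aux_offset/aux_size in the perf_event_mmap_page, and munmap()ing it is what eventually drives the teardown above. The sketch assumes an already-open event fd for a PMU that actually produces AUX data (Intel PT, CoreSight ETM and the like); the helper name and buffer sizes are arbitrary.

/* Hedged sketch of the userspace side of an AUX buffer: the second mmap()
 * below only succeeds for PMUs that produce AUX data; "fd" is an
 * already-opened perf event fd for such a PMU. */
#include <linux/perf_event.h>
#include <sys/mman.h>
#include <unistd.h>

#define DATA_PAGES	8
#define AUX_PAGES	32

static int map_and_unmap_aux(int fd)
{
	long ps = sysconf(_SC_PAGESIZE);
	struct perf_event_mmap_page *meta;
	void *base, *aux;

	/* user page + data buffer */
	base = mmap(NULL, (DATA_PAGES + 1) * ps, PROT_READ | PROT_WRITE,
		    MAP_SHARED, fd, 0);
	if (base == MAP_FAILED)
		return -1;
	meta = base;

	/* tell the kernel where the AUX area lives, then map it */
	meta->aux_offset = (DATA_PAGES + 1) * ps;
	meta->aux_size   = AUX_PAGES * ps;
	aux = mmap(NULL, meta->aux_size, PROT_READ | PROT_WRITE,
		   MAP_SHARED, fd, meta->aux_offset);
	if (aux == MAP_FAILED) {
		munmap(base, (DATA_PAGES + 1) * ps);
		return -1;
	}

	/* ... trace ... */

	/* Dropping the last AUX mapping is what lands in perf_mmap_close()
	 * above: writers are stopped via perf_pmu_output_stop() before the
	 * AUX pages are freed. */
	munmap(aux, AUX_PAGES * ps);
	munmap(base, (DATA_PAGES + 1) * ps);
	return 0;
}
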
@@ -5762,6 +5878,80 @@ next:
        rcu_read_unlock();
 }
 
+struct remote_output {
+       struct ring_buffer      *rb;
+       int                     err;
+};
+
+static void __perf_event_output_stop(struct perf_event *event, void *data)
+{
+       struct perf_event *parent = event->parent;
+       struct remote_output *ro = data;
+       struct ring_buffer *rb = ro->rb;
+
+       if (!has_aux(event))
+               return;
+
+       if (!parent)
+               parent = event;
+
+       /*
+        * In case of inheritance, it will be the parent that links to the
+        * ring-buffer, but it will be the child that's actually using it:
+        */
+       if (rcu_dereference(parent->rb) == rb)
+               ro->err = __perf_event_stop(event);
+}
+
+static int __perf_pmu_output_stop(void *info)
+{
+       struct perf_event *event = info;
+       struct pmu *pmu = event->pmu;
+       struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+       struct remote_output ro = {
+               .rb     = event->rb,
+       };
+
+       rcu_read_lock();
+       perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro);
+       if (cpuctx->task_ctx)
+               perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+                                  &ro);
+       rcu_read_unlock();
+
+       return ro.err;
+}
+
+static void perf_pmu_output_stop(struct perf_event *event)
+{
+       struct perf_event *iter;
+       int err, cpu;
+
+restart:
+       rcu_read_lock();
+       list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
+               /*
+                * For per-CPU events, we need to make sure that neither they
+                * nor their children are running; for cpu==-1 events it's
+                * sufficient to stop the event itself if it's active, since
+                * it can't have children.
+                */
+               cpu = iter->cpu;
+               if (cpu == -1)
+                       cpu = READ_ONCE(iter->oncpu);
+
+               if (cpu == -1)
+                       continue;
+
+               err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
+               if (err == -EAGAIN) {
+                       rcu_read_unlock();
+                       goto restart;
+               }
+       }
+       rcu_read_unlock();
+}
+
 /*
  * task tracking -- fork/exit
  *
@@ -6958,6 +7148,8 @@ static struct pmu perf_swevent = {
        .start          = perf_swevent_start,
        .stop           = perf_swevent_stop,
        .read           = perf_swevent_read,
+
+       .events_across_hotplug = 1,
 };
 
 #ifdef CONFIG_EVENT_TRACING
@@ -7081,6 +7273,8 @@ static struct pmu perf_tracepoint = {
        .start          = perf_swevent_start,
        .stop           = perf_swevent_stop,
        .read           = perf_swevent_read,
+
+       .events_across_hotplug = 1,
 };
 
 static inline void perf_tp_register(void)
@@ -7193,6 +7387,15 @@ void perf_bp_event(struct perf_event *bp, void *data)
 }
 #endif
 
+static int perf_event_drv_configs(struct perf_event *event,
+                                 void __user *arg)
+{
+       if (!event->pmu->get_drv_configs)
+               return -EINVAL;
+
+       return event->pmu->get_drv_configs(event, arg);
+}
+
 /*
  * hrtimer based swevent callback
  */
@@ -7360,6 +7563,8 @@ static struct pmu perf_cpu_clock = {
        .start          = cpu_clock_event_start,
        .stop           = cpu_clock_event_stop,
        .read           = cpu_clock_event_read,
+
+       .events_across_hotplug = 1,
 };
 
 /*
@@ -7441,6 +7646,8 @@ static struct pmu perf_task_clock = {
        .start          = task_clock_event_start,
        .stop           = task_clock_event_stop,
        .read           = task_clock_event_read,
+
+       .events_across_hotplug = 1,
 };
 
 static void perf_pmu_nop_void(struct pmu *pmu)
@@ -7921,6 +8128,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        if (!group_leader)
                group_leader = event;
 
+       mutex_init(&event->group_leader_mutex);
        mutex_init(&event->child_mutex);
        INIT_LIST_HEAD(&event->child_list);
 
@@ -7929,6 +8137,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        INIT_LIST_HEAD(&event->sibling_list);
        INIT_LIST_HEAD(&event->rb_entry);
        INIT_LIST_HEAD(&event->active_entry);
+       INIT_LIST_HEAD(&event->drv_configs);
        INIT_HLIST_NODE(&event->hlist_entry);
 
 
@@ -8341,10 +8550,16 @@ SYSCALL_DEFINE5(perf_event_open,
        if (flags & ~PERF_FLAG_ALL)
                return -EINVAL;
 
+       if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
+               return -EACCES;
+
        err = perf_copy_attr(attr_uptr, &attr);
        if (err)
                return err;
 
+       if (attr.constraint_duplicate || attr.__reserved_1)
+               return -EINVAL;
+
        if (!attr.exclude_kernel) {
                if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
                        return -EACCES;
@@ -8385,6 +8600,16 @@ SYSCALL_DEFINE5(perf_event_open,
                        group_leader = NULL;
        }
 
+       /*
+        * Take the group_leader's group_leader_mutex before observing
+        * anything in the group leader that leads to changes in ctx,
+        * many of which may be changing on another thread.
+        * In particular, we want to take this lock before deciding
+        * whether we need to move_group.
+        */
+       if (group_leader)
+               mutex_lock(&group_leader->group_leader_mutex);
+
        if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
                task = find_lively_task_by_vpid(pid);
                if (IS_ERR(task)) {
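
group_leader_mutex closes a race between perf_event_open() calls that name the same leader via group_fd, which is where the move_group decision is made further down. The userspace pattern being protected is simply attaching siblings to an existing leader fd; a minimal group open (hardware cycles plus instructions; the helper name is made up for the example):

/* Opening a group: a leader plus one sibling attached via group_fd. Two
 * threads racing to do the second call against the same leader is the
 * situation group_leader_mutex now serialises. */
#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>

static int open_counter(__u64 config, int group_fd)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = config;
	attr.exclude_kernel = 1;
	attr.disabled = (group_fd == -1);	/* only the leader starts disabled */

	return syscall(__NR_perf_event_open, &attr, 0, -1, group_fd, 0);
}

int main(void)
{
	int leader = open_counter(PERF_COUNT_HW_CPU_CYCLES, -1);
	int member = open_counter(PERF_COUNT_HW_INSTRUCTIONS, leader);

	/* ... enable the leader, run the workload, read both counters ... */
	close(member);
	close(leader);
	return (leader >= 0 && member >= 0) ? 0 : 1;
}
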
@@ -8663,6 +8888,8 @@ SYSCALL_DEFINE5(perf_event_open,
        if (move_group)
                perf_event_ctx_unlock(group_leader, gctx);
        mutex_unlock(&ctx->mutex);
+       if (group_leader)
+               mutex_unlock(&group_leader->group_leader_mutex);
 
        if (task) {
                mutex_unlock(&task->signal->cred_guard_mutex);
@@ -8712,6 +8939,8 @@ err_task:
        if (task)
                put_task_struct(task);
 err_group_fd:
+       if (group_leader)
+               mutex_unlock(&group_leader->group_leader_mutex);
        fdput(group);
 err_fd:
        put_unused_fd(event_fd);
@@ -9423,29 +9652,90 @@ static void __perf_event_exit_context(void *__info)
        rcu_read_unlock();
 }
 
+static void __perf_event_stop_swclock(void *__info)
+{
+       struct perf_event_context *ctx = __info;
+       struct perf_event *event, *tmp;
+
+       list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
+               if (event->attr.config == PERF_COUNT_SW_CPU_CLOCK &&
+                   event->attr.type == PERF_TYPE_SOFTWARE)
+                       cpu_clock_event_stop(event, 0);
+       }
+}
+
 static void perf_event_exit_cpu_context(int cpu)
 {
+       struct perf_cpu_context *cpuctx;
        struct perf_event_context *ctx;
+       unsigned long flags;
        struct pmu *pmu;
        int idx;
 
        idx = srcu_read_lock(&pmus_srcu);
        list_for_each_entry_rcu(pmu, &pmus, entry) {
-               ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+               cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+               ctx = &cpuctx->ctx;
+
+               /* Cancel the mux hrtimer to avoid CPU migration */
+               if (pmu->task_ctx_nr != perf_sw_context) {
+                       raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+                       hrtimer_cancel(&cpuctx->hrtimer);
+                       cpuctx->hrtimer_active = 0;
+                       raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock,
+                                                       flags);
+               }
 
                mutex_lock(&ctx->mutex);
-               smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+               /*
+                * If keeping events across hotplugging is supported, do not
+                * remove the event list, but keep it alive across CPU hotplug.
+                * The context is exited via an fd close path when userspace
+                * is done and the target CPU is online. If a software clock
+                * event is active, stop the hrtimer associated with it and
+                * restart the timer when the CPU comes back online.
+                */
+               if (!pmu->events_across_hotplug)
+                       smp_call_function_single(cpu, __perf_event_exit_context,
+                                                ctx, 1);
+               else
+                       smp_call_function_single(cpu, __perf_event_stop_swclock,
+                                                ctx, 1);
                mutex_unlock(&ctx->mutex);
        }
        srcu_read_unlock(&pmus_srcu, idx);
 }
 
+static void perf_event_start_swclock(int cpu)
+{
+       struct perf_event_context *ctx;
+       struct pmu *pmu;
+       int idx;
+       struct perf_event *event, *tmp;
+
+       idx = srcu_read_lock(&pmus_srcu);
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               if (pmu->events_across_hotplug) {
+                       ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+                       list_for_each_entry_safe(event, tmp, &ctx->event_list,
+                                                event_entry) {
+                               if (event->attr.config ==
+                                   PERF_COUNT_SW_CPU_CLOCK &&
+                                   event->attr.type == PERF_TYPE_SOFTWARE)
+                                       cpu_clock_event_start(event, 0);
+                       }
+               }
+       }
+       srcu_read_unlock(&pmus_srcu, idx);
+}
+
 static void perf_event_exit_cpu(int cpu)
 {
        perf_event_exit_cpu_context(cpu);
 }
 #else
 static inline void perf_event_exit_cpu(int cpu) { }
+static inline void perf_event_start_swclock(int cpu) { }
 #endif
 
 static int
@@ -9484,6 +9774,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
        case CPU_DOWN_PREPARE:
                perf_event_exit_cpu(cpu);
                break;
+
+       case CPU_STARTING:
+               perf_event_start_swclock(cpu);
+               break;
+
        default:
                break;
        }
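
Together with the events_across_hotplug flag set on the software PMUs above, the CPU_DOWN_PREPARE/CPU_STARTING handling means a CPU-bound software clock event keeps its context (and the fd userspace holds) across an offline/online cycle: only its hrtimer is stopped and later restarted. From userspace the event in question looks like this; note that a CPU-wide event needs CAP_SYS_ADMIN or a permissive perf_event_paranoid, per the checks earlier in this patch.

/* A CPU-bound software clock event: exactly the kind of event the swclock
 * stop/start handling above keeps alive across an offline/online cycle of
 * the chosen CPU. Assumes that CPU is online at open time. */
#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t ns;
	int cpu = 1, fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = PERF_COUNT_SW_CPU_CLOCK;

	/* pid = -1, cpu = 1: count on that CPU regardless of task */
	fd = syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	sleep(1);	/* the CPU could be hotplugged out and back in here */

	if (read(fd, &ns, sizeof(ns)) == sizeof(ns))
		printf("cpu%d clock advanced by %llu ns\n",
		       cpu, (unsigned long long)ns);
	close(fd);
	return 0;
}
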
@@ -9491,6 +9786,25 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
        return NOTIFY_OK;
 }
 
+static int event_idle_notif(struct notifier_block *nb, unsigned long action,
+                                                       void *data)
+{
+       switch (action) {
+       case IDLE_START:
+               __this_cpu_write(is_idle, true);
+               break;
+       case IDLE_END:
+               __this_cpu_write(is_idle, false);
+               break;
+       }
+
+       return NOTIFY_OK;
+}
+
+static struct notifier_block perf_event_idle_nb = {
+       .notifier_call = event_idle_notif,
+};
+
 void __init perf_event_init(void)
 {
        int ret;
@@ -9504,6 +9818,7 @@ void __init perf_event_init(void)
        perf_pmu_register(&perf_task_clock, NULL, -1);
        perf_tp_register();
        perf_cpu_notifier(perf_cpu_notify);
+       idle_notifier_register(&perf_event_idle_nb);
        register_reboot_notifier(&perf_reboot_notifier);
 
        ret = init_hw_breakpoint();