perf: Fix data race between pin_count increment/decrement
[android-x86/kernel.git] kernel/events/core.c
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5a97f34..dd740f9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -94,11 +94,11 @@ static void remote_function(void *data)
  * @info:      the function call argument
  *
  * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
+ * be on the current CPU, which just calls the function directly.  This will
+ * retry due to any failures in smp_call_function_single(), such as if the
+ * task_cpu() goes offline concurrently.
  *
- * returns: @func return value, or
- *         -ESRCH  - when the process isn't running
- *         -EAGAIN - when the process moved away
+ * returns @func return value or -ESRCH or -ENXIO when the process isn't running
  */
 static int
 task_function_call(struct task_struct *p, remote_function_f func, void *info)
@@ -111,11 +111,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
        };
        int ret;
 
-       do {
-               ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+       for (;;) {
+               ret = smp_call_function_single(task_cpu(p), remote_function,
+                                              &data, 1);
                if (!ret)
                        ret = data.ret;
-       } while (ret == -EAGAIN);
+
+               if (ret != -EAGAIN)
+                       break;
+
+               cond_resched();
+       }
 
        return ret;
 }
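
For reference, the new loop is the classic retry-until-not-EAGAIN shape with a voluntary reschedule between attempts. A minimal userspace sketch of that shape, assuming invented stand-ins: remote_op() plays the role of smp_call_function_single() and sched_yield() that of cond_resched():

#define _POSIX_C_SOURCE 200809L
#include <errno.h>
#include <sched.h>
#include <stdio.h>

/* Stand-in for smp_call_function_single(): reports -EAGAIN a couple of
 * times before succeeding, just to exercise the retry path. */
static int remote_op(void *arg)
{
	static int attempts;

	(void)arg;
	return (++attempts < 3) ? -EAGAIN : 0;
}

static int call_with_retry(void *arg)
{
	int ret;

	for (;;) {
		ret = remote_op(arg);
		if (ret != -EAGAIN)
			break;		/* success or a permanent error */

		sched_yield();		/* userspace analogue of cond_resched() */
	}

	return ret;
}

int main(void)
{
	printf("ret = %d\n", call_with_retry(NULL));
	return 0;
}
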
@@ -436,18 +442,18 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
-       int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-
-       if (ret || !write)
-               return ret;
-
+       int ret;
+       int perf_cpu = sysctl_perf_cpu_time_max_percent;
        /*
         * If throttling is disabled don't allow the write:
         */
-       if (sysctl_perf_cpu_time_max_percent == 100 ||
-           sysctl_perf_cpu_time_max_percent == 0)
+       if (write && (perf_cpu == 100 || perf_cpu == 0))
                return -EINVAL;
 
+       ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+       if (ret || !write)
+               return ret;
+
        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
        perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
        update_perf_cpu_limits();
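
The point of the reordering is that the gate is evaluated before anything is stored: a write is rejected while throttling is disabled (gate at 0 or 100) instead of letting proc_dointvec_minmax() commit the new value first and checking only afterwards. A tiny check-before-commit sketch under that reading; gate and sample_rate are invented stand-ins for the two sysctls involved:

#include <errno.h>

static int gate = 50;		 /* stands in for sysctl_perf_cpu_time_max_percent */
static int sample_rate = 100000; /* stands in for the value the handler writes     */

/* Reject the write while throttling is disabled, and do so before storing
 * anything, mirroring the reordering in the hunk above. */
int write_sample_rate(int new_rate)
{
	if (gate == 100 || gate == 0)
		return -EINVAL;

	sample_rate = new_rate;
	return 0;
}
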
@@ -1254,6 +1260,7 @@ static void put_ctx(struct perf_event_context *ctx)
  *           perf_event_context::lock
  *         perf_event::mmap_mutex
  *         mmap_sem
+ *           perf_addr_filters_head::lock
  *
  *    cpu_hotplug_lock
  *      pmus_lock
@@ -2007,8 +2014,8 @@ event_sched_out(struct perf_event *event,
        event->pmu->del(event, 0);
        event->oncpu = -1;
 
-       if (event->pending_disable) {
-               event->pending_disable = 0;
+       if (READ_ONCE(event->pending_disable) >= 0) {
+               WRITE_ONCE(event->pending_disable, -1);
                state = PERF_EVENT_STATE_OFF;
        }
        perf_event_set_state(event, state);
@@ -2079,6 +2086,7 @@ __perf_remove_from_context(struct perf_event *event,
 
        if (!ctx->nr_events && ctx->is_active) {
                ctx->is_active = 0;
+               ctx->rotate_necessary = 0;
                if (ctx->task) {
                        WARN_ON_ONCE(cpuctx->task_ctx != ctx);
                        cpuctx->task_ctx = NULL;
@@ -2196,7 +2204,8 @@ EXPORT_SYMBOL_GPL(perf_event_disable);
 
 void perf_event_disable_inatomic(struct perf_event *event)
 {
-       event->pending_disable = 1;
+       WRITE_ONCE(event->pending_disable, smp_processor_id());
+       /* can fail, see perf_pending_event_disable() */
        irq_work_queue(&event->pending);
 }
 
@@ -2540,6 +2549,9 @@ unlock:
        return ret;
 }
 
+static bool exclusive_event_installable(struct perf_event *event,
+                                       struct perf_event_context *ctx);
+
 /*
  * Attach a performance event to a context.
  *
@@ -2554,6 +2566,8 @@ perf_install_in_context(struct perf_event_context *ctx,
 
        lockdep_assert_held(&ctx->mutex);
 
+       WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
+
        if (event->cpu != -1)
                event->cpu = cpu;
 
@@ -2797,7 +2811,7 @@ static int perf_event_stop(struct perf_event *event, int restart)
  *
  * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
  *      we update the addresses of corresponding vmas in
- *     event::addr_filters_offs array and bump the event::addr_filters_gen;
+ *     event::addr_filter_ranges array and bump the event::addr_filters_gen;
  * (p2) when an event is scheduled in (pmu::add), it calls
  *      perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
  *      if the generation has changed since the previous call.
@@ -2948,6 +2962,13 @@ static void ctx_sched_out(struct perf_event_context *ctx,
        if (is_active & EVENT_FLEXIBLE) {
                list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
                        group_sched_out(event, cpuctx, ctx);
+
+               /*
+                * Since we cleared EVENT_FLEXIBLE, also clear
+                * rotate_necessary; it will be reset by
+                * ctx_flexible_sched_in() when needed.
+                */
+               ctx->rotate_necessary = 0;
        }
        perf_pmu_enable(ctx->pmu);
 }
@@ -3306,10 +3327,13 @@ static int flexible_sched_in(struct perf_event *event, void *data)
                return 0;
 
        if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
-               if (!group_sched_in(event, sid->cpuctx, sid->ctx))
-                       list_add_tail(&event->active_list, &sid->ctx->flexible_active);
-               else
+               int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
+               if (ret) {
                        sid->can_add_hw = 0;
+                       sid->ctx->rotate_necessary = 1;
+                       return 0;
+               }
+               list_add_tail(&event->active_list, &sid->ctx->flexible_active);
        }
 
        return 0;
@@ -3667,34 +3691,45 @@ static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
        perf_event_groups_insert(&ctx->flexible_groups, event);
 }
 
+/* pick an event from the flexible_groups to rotate */
 static inline struct perf_event *
-ctx_first_active(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_context *ctx)
 {
-       return list_first_entry_or_null(&ctx->flexible_active,
-                                       struct perf_event, active_list);
+       struct perf_event *event;
+
+       /* pick the first active flexible event */
+       event = list_first_entry_or_null(&ctx->flexible_active,
+                                        struct perf_event, active_list);
+
+       /* if no active flexible event, pick the first event */
+       if (!event) {
+               event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
+                                     typeof(*event), group_node);
+       }
+
+       /*
+        * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
+        * finds there are unschedulable events, it will set it again.
+        */
+       ctx->rotate_necessary = 0;
+
+       return event;
 }
 
 static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
        struct perf_event *cpu_event = NULL, *task_event = NULL;
-       bool cpu_rotate = false, task_rotate = false;
-       struct perf_event_context *ctx = NULL;
+       struct perf_event_context *task_ctx = NULL;
+       int cpu_rotate, task_rotate;
 
        /*
         * Since we run this from IRQ context, nobody can install new
         * events, thus the event count values are stable.
         */
 
-       if (cpuctx->ctx.nr_events) {
-               if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
-                       cpu_rotate = true;
-       }
-
-       ctx = cpuctx->task_ctx;
-       if (ctx && ctx->nr_events) {
-               if (ctx->nr_events != ctx->nr_active)
-                       task_rotate = true;
-       }
+       cpu_rotate = cpuctx->ctx.rotate_necessary;
+       task_ctx = cpuctx->task_ctx;
+       task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
 
        if (!(cpu_rotate || task_rotate))
                return false;
@@ -3703,25 +3738,25 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
        perf_pmu_disable(cpuctx->ctx.pmu);
 
        if (task_rotate)
-               task_event = ctx_first_active(ctx);
+               task_event = ctx_event_to_rotate(task_ctx);
        if (cpu_rotate)
-               cpu_event = ctx_first_active(&cpuctx->ctx);
+               cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
 
        /*
         * As per the order given at ctx_resched() first 'pop' task flexible
         * and then, if needed CPU flexible.
         */
-       if (task_event || (ctx && cpu_event))
-               ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+       if (task_event || (task_ctx && cpu_event))
+               ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
        if (cpu_event)
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
        if (task_event)
-               rotate_ctx(ctx, task_event);
+               rotate_ctx(task_ctx, task_event);
        if (cpu_event)
                rotate_ctx(&cpuctx->ctx, cpu_event);
 
-       perf_event_sched_in(cpuctx, ctx, current);
+       perf_event_sched_in(cpuctx, task_ctx, current);
 
        perf_pmu_enable(cpuctx->ctx.pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
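
Rotation is now driven by the per-context rotate_necessary flag rather than by comparing nr_events with nr_active: the flag is set when flexible_sched_in() above fails to fit a group, cleared when the flexible events are scheduled out or a rotation candidate is picked, and consumed here. A rough userspace sketch of that set/clear/consume lifecycle, all names invented:

#include <stdbool.h>

struct ctx {
	bool rotate_necessary;	/* set when some flexible event could not run */
};

/* Scheduling side: remember that this context has a starved flexible event. */
void flexible_sched_in(struct ctx *c, bool group_fits)
{
	if (!group_fits)
		c->rotate_necessary = true;
}

/* Timer-tick side: rotate only the contexts that asked for it, and clear the
 * flag so the next sched-in pass decides afresh whether rotation is needed. */
bool rotate_context(struct ctx *c)
{
	if (!c->rotate_necessary)
		return false;

	c->rotate_necessary = false;
	/* ... pick an event and rotate the flexible groups here ... */
	return true;
}
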
@@ -4120,7 +4155,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                ctx = &cpuctx->ctx;
                get_ctx(ctx);
+               raw_spin_lock_irqsave(&ctx->lock, flags);
                ++ctx->pin_count;
+               raw_spin_unlock_irqrestore(&ctx->lock, flags);
 
                return ctx;
        }
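
This hunk is the race named in the subject line: ++ctx->pin_count is a plain read-modify-write, while the decrement in perf_unpin_context() already runs under ctx->lock, so an unlocked increment here can be lost. A minimal userspace illustration of why the lock is needed, with a pthread mutex standing in for the raw spinlock:

#include <pthread.h>

struct context {
	pthread_mutex_t lock;	/* plays the role of ctx->lock */
	int pin_count;
};

/* Unsafe: ++ is a read-modify-write, so two CPUs doing this (or racing
 * with the locked decrement on the unpin side) can lose an update. */
void pin_unlocked(struct context *ctx)
{
	++ctx->pin_count;
}

/* Safe: matches how the decrement side already serializes on the lock. */
void pin_locked(struct context *ctx)
{
	pthread_mutex_lock(&ctx->lock);
	++ctx->pin_count;
	pthread_mutex_unlock(&ctx->lock);
}
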
@@ -4340,7 +4377,7 @@ static int exclusive_event_init(struct perf_event *event)
 {
        struct pmu *pmu = event->pmu;
 
-       if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+       if (!is_exclusive_pmu(pmu))
                return 0;
 
        /*
@@ -4371,7 +4408,7 @@ static void exclusive_event_destroy(struct perf_event *event)
 {
        struct pmu *pmu = event->pmu;
 
-       if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+       if (!is_exclusive_pmu(pmu))
                return;
 
        /* see comment in exclusive_event_init() */
@@ -4391,14 +4428,15 @@ static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
        return false;
 }
 
-/* Called under the same ctx::mutex as perf_install_in_context() */
 static bool exclusive_event_installable(struct perf_event *event,
                                        struct perf_event_context *ctx)
 {
        struct perf_event *iter_event;
        struct pmu *pmu = event->pmu;
 
-       if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+       lockdep_assert_held(&ctx->mutex);
+
+       if (!is_exclusive_pmu(pmu))
                return true;
 
        list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
@@ -4440,17 +4478,25 @@ static void _free_event(struct perf_event *event)
 
        perf_event_free_bpf_prog(event);
        perf_addr_filters_splice(event, NULL);
-       kfree(event->addr_filters_offs);
+       kfree(event->addr_filter_ranges);
 
        if (event->destroy)
                event->destroy(event);
 
-       if (event->ctx)
-               put_ctx(event->ctx);
-
+       /*
+        * Must be after ->destroy(), due to uprobe_perf_close() using
+        * hw.target.
+        */
        if (event->hw.target)
                put_task_struct(event->hw.target);
 
+       /*
+        * perf_event_free_task() relies on put_ctx() being 'last', in particular
+        * all task references must be cleaned up.
+        */
+       if (event->ctx)
+               put_ctx(event->ctx);
+
        exclusive_event_destroy(event);
        module_put(event->pmu->module);
 
@@ -4630,8 +4676,17 @@ again:
        mutex_unlock(&event->child_mutex);
 
        list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+               void *var = &child->ctx->refcount;
+
                list_del(&child->child_list);
                free_event(child);
+
+               /*
+                * Wake any perf_event_free_task() waiting for this event to be
+                * freed.
+                */
+               smp_mb(); /* pairs with wait_var_event() */
+               wake_up_var(var);
        }
 
 no_ctx:
@@ -4963,6 +5018,11 @@ static void __perf_event_period(struct perf_event *event,
        }
 }
 
+static int perf_event_check_period(struct perf_event *event, u64 value)
+{
+       return event->pmu->check_period(event, value);
+}
+
 static int perf_event_period(struct perf_event *event, u64 __user *arg)
 {
        u64 value;
@@ -4979,6 +5039,12 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
        if (event->attr.freq && value > sysctl_perf_event_sample_rate)
                return -EINVAL;
 
+       if (perf_event_check_period(event, value))
+               return -EINVAL;
+
+       if (!event->attr.freq && (value & (1ULL << 63)))
+               return -EINVAL;
+
        event_function_call(event, __perf_event_period, &value);
 
        return 0;
@@ -5433,11 +5499,11 @@ static void perf_pmu_output_stop(struct perf_event *event);
 static void perf_mmap_close(struct vm_area_struct *vma)
 {
        struct perf_event *event = vma->vm_file->private_data;
-
        struct ring_buffer *rb = ring_buffer_get(event);
        struct user_struct *mmap_user = rb->mmap_user;
        int mmap_locked = rb->mmap_locked;
        unsigned long size = perf_data_size(rb);
+       bool detach_rest = false;
 
        if (event->pmu->event_unmapped)
                event->pmu->event_unmapped(event, vma->vm_mm);
@@ -5468,7 +5534,8 @@ static void perf_mmap_close(struct vm_area_struct *vma)
                mutex_unlock(&event->mmap_mutex);
        }
 
-       atomic_dec(&rb->mmap_count);
+       if (atomic_dec_and_test(&rb->mmap_count))
+               detach_rest = true;
 
        if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
                goto out_put;
@@ -5477,7 +5544,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
        mutex_unlock(&event->mmap_mutex);
 
        /* If there's still other mmap()s of this buffer, we're done. */
-       if (atomic_read(&rb->mmap_count))
+       if (!detach_rest)
                goto out_put;
 
        /*
@@ -5673,7 +5740,15 @@ accounting:
         */
        user_lock_limit *= num_online_cpus();
 
-       user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+       user_locked = atomic_long_read(&user->locked_vm);
+
+       /*
+        * sysctl_perf_event_mlock may have changed, so that
+        *     user->locked_vm > user_lock_limit
+        */
+       if (user_locked > user_lock_limit)
+               user_locked = user_lock_limit;
+       user_locked += user_extra;
 
        if (user_locked > user_lock_limit)
                extra = user_locked - user_lock_limit;
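
A worked example of the clamp, with invented numbers: with user_lock_limit = 512 pages, user->locked_vm = 600 (the sysctl was lowered after earlier mmaps) and user_extra = 100, the old code computed extra = 188 pages for this mapping, more than it actually adds; clamping first keeps extra at 100, i.e. never more than user_extra, which matters because extra is what perf_mmap() goes on to check against RLIMIT_MEMLOCK:

#include <stdio.h>

int main(void)
{
	long user_lock_limit = 512;	/* per-user allowance, in pages (invented) */
	long locked_vm       = 600;	/* already above the shrunken allowance    */
	long user_extra      = 100;	/* pages this new mapping wants            */

	/* Old calculation: the pre-existing excess gets billed to this mapping. */
	long old_extra = (locked_vm + user_extra) - user_lock_limit;	/* 188 */

	/* New calculation: clamp first, so extra can never exceed user_extra. */
	long clamped   = locked_vm > user_lock_limit ? user_lock_limit : locked_vm;
	long new_extra = (clamped + user_extra) - user_lock_limit;	/* 100 */

	printf("old extra = %ld, new extra = %ld\n", old_extra, new_extra);
	return 0;
}
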
@@ -5795,10 +5870,45 @@ void perf_event_wakeup(struct perf_event *event)
        }
 }
 
+static void perf_pending_event_disable(struct perf_event *event)
+{
+       int cpu = READ_ONCE(event->pending_disable);
+
+       if (cpu < 0)
+               return;
+
+       if (cpu == smp_processor_id()) {
+               WRITE_ONCE(event->pending_disable, -1);
+               perf_event_disable_local(event);
+               return;
+       }
+
+       /*
+        *  CPU-A                       CPU-B
+        *
+        *  perf_event_disable_inatomic()
+        *    @pending_disable = CPU-A;
+        *    irq_work_queue();
+        *
+        *  sched-out
+        *    @pending_disable = -1;
+        *
+        *                              sched-in
+        *                              perf_event_disable_inatomic()
+        *                                @pending_disable = CPU-B;
+        *                                irq_work_queue(); // FAILS
+        *
+        *  irq_work_run()
+        *    perf_pending_event()
+        *
+        * But the event runs on CPU-B and wants disabling there.
+        */
+       irq_work_queue_on(&event->pending, cpu);
+}
+
 static void perf_pending_event(struct irq_work *entry)
 {
-       struct perf_event *event = container_of(entry,
-                       struct perf_event, pending);
+       struct perf_event *event = container_of(entry, struct perf_event, pending);
        int rctx;
 
        rctx = perf_swevent_get_recursion_context();
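
pending_disable is now "the CPU that requested the disable, or -1 for none", so when the IRQ work happens to run on another CPU it can forward itself with irq_work_queue_on() instead of silently dropping the request (the CPU-A/CPU-B interleaving sketched above). A rough userspace analogue of that sentinel handshake, using C11 atomics in place of READ_ONCE()/WRITE_ONCE(); the queueing and disable helpers are invented and left as comments:

#include <stdatomic.h>

#define NO_REQUEST (-1)

struct event {
	atomic_int pending_disable;	/* id of the requesting CPU, or NO_REQUEST */
};

/* Requester: record who asked, then kick the deferred work (the queueing
 * call may be a no-op if the work is already pending, as noted above). */
void request_disable(struct event *e, int self)
{
	atomic_store(&e->pending_disable, self);
	/* queue_deferred_work(e); */
}

/* Deferred work: act only if the request was made for this CPU; otherwise
 * forward it, which is what irq_work_queue_on() does in the real code. */
void pending_disable_work(struct event *e, int self)
{
	int who = atomic_load(&e->pending_disable);

	if (who == NO_REQUEST)
		return;			/* already consumed */

	if (who == self) {
		atomic_store(&e->pending_disable, NO_REQUEST);
		/* disable_event_locally(e); */
	}
	/* else: requeue_work_on(e, who); */
}
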
@@ -5807,10 +5917,7 @@ static void perf_pending_event(struct irq_work *entry)
         * and we won't recurse 'further'.
         */
 
-       if (event->pending_disable) {
-               event->pending_disable = 0;
-               perf_event_disable_local(event);
-       }
+       perf_pending_event_disable(event);
 
        if (event->pending_wakeup) {
                event->pending_wakeup = 0;
@@ -5865,7 +5972,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user,
        if (user_mode(regs)) {
                regs_user->abi = perf_reg_abi(current);
                regs_user->regs = regs;
-       } else if (current->mm) {
+       } else if (!(current->flags & PF_KTHREAD)) {
                perf_get_regs_user(regs_user, regs, regs_user_copy);
        } else {
                regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
@@ -6335,9 +6442,12 @@ static u64 perf_virt_to_phys(u64 virt)
                 * Try IRQ-safe __get_user_pages_fast first.
                 * If failed, leave phys_addr as 0.
                 */
-               if ((current->mm != NULL) &&
-                   (__get_user_pages_fast(virt, 1, 0, &p) == 1))
-                       phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+               if (current->mm != NULL) {
+                       pagefault_disable();
+                       if (__get_user_pages_fast(virt, 1, 0, &p) == 1)
+                               phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+                       pagefault_enable();
+               }
 
                if (p)
                        put_page(p);
@@ -6678,7 +6788,8 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
                if (filter->path.dentry) {
-                       event->addr_filters_offs[count] = 0;
+                       event->addr_filter_ranges[count].start = 0;
+                       event->addr_filter_ranges[count].size = 0;
                        restart++;
                }
 
@@ -6749,7 +6860,7 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
 static int __perf_pmu_output_stop(void *info)
 {
        struct perf_event *event = info;
-       struct pmu *pmu = event->pmu;
+       struct pmu *pmu = event->ctx->pmu;
        struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
        struct remote_output ro = {
                .rb     = event->rb,
@@ -6843,10 +6954,17 @@ static void perf_event_task_output(struct perf_event *event,
                goto out;
 
        task_event->event_id.pid = perf_event_pid(event, task);
-       task_event->event_id.ppid = perf_event_pid(event, current);
-
        task_event->event_id.tid = perf_event_tid(event, task);
-       task_event->event_id.ptid = perf_event_tid(event, current);
+
+       if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
+               task_event->event_id.ppid = perf_event_pid(event,
+                                                       task->real_parent);
+               task_event->event_id.ptid = perf_event_pid(event,
+                                                       task->real_parent);
+       } else {  /* PERF_RECORD_FORK */
+               task_event->event_id.ppid = perf_event_pid(event, current);
+               task_event->event_id.ptid = perf_event_tid(event, current);
+       }
 
        task_event->event_id.time = perf_event_clock(event);
 
@@ -7170,6 +7288,7 @@ static void perf_event_mmap_output(struct perf_event *event,
        struct perf_output_handle handle;
        struct perf_sample_data sample;
        int size = mmap_event->event_id.header.size;
+       u32 type = mmap_event->event_id.header.type;
        int ret;
 
        if (!perf_event_mmap_match(event, data))
@@ -7213,6 +7332,7 @@ static void perf_event_mmap_output(struct perf_event *event,
        perf_output_end(&handle);
 out:
        mmap_event->event_id.header.size = size;
+       mmap_event->event_id.header.type = type;
 }
 
 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -7358,28 +7478,47 @@ static bool perf_addr_filter_match(struct perf_addr_filter *filter,
        return true;
 }
 
+static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
+                                       struct vm_area_struct *vma,
+                                       struct perf_addr_filter_range *fr)
+{
+       unsigned long vma_size = vma->vm_end - vma->vm_start;
+       unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+       struct file *file = vma->vm_file;
+
+       if (!perf_addr_filter_match(filter, file, off, vma_size))
+               return false;
+
+       if (filter->offset < off) {
+               fr->start = vma->vm_start;
+               fr->size = min(vma_size, filter->size - (off - filter->offset));
+       } else {
+               fr->start = vma->vm_start + filter->offset - off;
+               fr->size = min(vma->vm_end - fr->start, filter->size);
+       }
+
+       return true;
+}
+
 static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
 {
        struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
        struct vm_area_struct *vma = data;
-       unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
-       struct file *file = vma->vm_file;
        struct perf_addr_filter *filter;
        unsigned int restart = 0, count = 0;
+       unsigned long flags;
 
        if (!has_addr_filter(event))
                return;
 
-       if (!file)
+       if (!vma->vm_file)
                return;
 
        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
-               if (perf_addr_filter_match(filter, file, off,
-                                            vma->vm_end - vma->vm_start)) {
-                       event->addr_filters_offs[count] = vma->vm_start;
+               if (perf_addr_filter_vma_adjust(filter, vma,
+                                               &event->addr_filter_ranges[count]))
                        restart++;
-               }
 
                count++;
        }
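
perf_addr_filter_vma_adjust() intersects the filter's file range [offset, offset + size) with the part of the file this vma maps and reports the overlap as a virtual address range. A standalone sketch of just that arithmetic; the inode/path matching done by perf_addr_filter_match() is omitted and the struct names are invented:

#include <stdbool.h>
#include <stdio.h>

struct filter { unsigned long offset, size; };		/* range within the object file */
struct map    { unsigned long vm_start, vm_end, off; };	/* vma span + its file offset   */
struct range  { unsigned long start, size; };		/* result, in virtual addresses */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

bool vma_adjust(const struct filter *f, const struct map *m, struct range *r)
{
	unsigned long vma_size = m->vm_end - m->vm_start;

	/* no overlap between the filter and the file range this vma maps */
	if (f->offset >= m->off + vma_size || f->offset + f->size <= m->off)
		return false;

	if (f->offset < m->off) {
		r->start = m->vm_start;
		r->size  = min_ul(vma_size, f->size - (m->off - f->offset));
	} else {
		r->start = m->vm_start + f->offset - m->off;
		r->size  = min_ul(m->vm_end - r->start, f->size);
	}
	return true;
}

int main(void)
{
	struct filter f = { .offset = 0x2000, .size = 0x1000 };
	struct map    m = { .vm_start = 0x400000, .vm_end = 0x405000, .off = 0x1000 };
	struct range  r;

	if (vma_adjust(&f, &m, &r))
		printf("start = 0x%lx, size = 0x%lx\n", r.start, r.size);
	return 0;
}
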
@@ -8739,26 +8878,19 @@ static void perf_addr_filters_splice(struct perf_event *event,
  * @filter; if so, adjust filter's address range.
  * Called with mm::mmap_sem down for reading.
  */
-static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
-                                           struct mm_struct *mm)
+static void perf_addr_filter_apply(struct perf_addr_filter *filter,
+                                  struct mm_struct *mm,
+                                  struct perf_addr_filter_range *fr)
 {
        struct vm_area_struct *vma;
 
        for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               struct file *file = vma->vm_file;
-               unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
-               unsigned long vma_size = vma->vm_end - vma->vm_start;
-
-               if (!file)
-                       continue;
-
-               if (!perf_addr_filter_match(filter, file, off, vma_size))
+               if (!vma->vm_file)
                        continue;
 
-               return vma->vm_start;
+               if (perf_addr_filter_vma_adjust(filter, vma, fr))
+                       return;
        }
-
-       return 0;
 }
 
 /*
@@ -8781,26 +8913,29 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
        if (task == TASK_TOMBSTONE)
                return;
 
-       if (!ifh->nr_file_filters)
-               return;
-
-       mm = get_task_mm(event->ctx->task);
-       if (!mm)
-               goto restart;
+       if (ifh->nr_file_filters) {
+               mm = get_task_mm(event->ctx->task);
+               if (!mm)
+                       goto restart;
 
-       down_read(&mm->mmap_sem);
+               down_read(&mm->mmap_sem);
+       }
 
        raw_spin_lock_irqsave(&ifh->lock, flags);
        list_for_each_entry(filter, &ifh->list, entry) {
-               event->addr_filters_offs[count] = 0;
+               if (filter->path.dentry) {
+                       /*
+                        * Adjust base offset if the filter is associated with a
+                        * binary that needs to be mapped:
+                        */
+                       event->addr_filter_ranges[count].start = 0;
+                       event->addr_filter_ranges[count].size = 0;
 
-               /*
-                * Adjust base offset if the filter is associated to a binary
-                * that needs to be mapped:
-                */
-               if (filter->path.dentry)
-                       event->addr_filters_offs[count] =
-                               perf_addr_filter_apply(filter, mm);
+                       perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
+               } else {
+                       event->addr_filter_ranges[count].start = filter->offset;
+                       event->addr_filter_ranges[count].size  = filter->size;
+               }
 
                count++;
        }
@@ -8808,9 +8943,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
        event->addr_filters_gen++;
        raw_spin_unlock_irqrestore(&ifh->lock, flags);
 
-       up_read(&mm->mmap_sem);
+       if (ifh->nr_file_filters) {
+               up_read(&mm->mmap_sem);
 
-       mmput(mm);
+               mmput(mm);
+       }
 
 restart:
        perf_event_stop(event, 1);
@@ -8935,6 +9072,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                        if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
                                int fpos = token == IF_SRC_FILE ? 2 : 1;
 
+                               kfree(filename);
                                filename = match_strdup(&args[fpos]);
                                if (!filename) {
                                        ret = -ENOMEM;
@@ -8981,16 +9119,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
                                 */
                                ret = -EOPNOTSUPP;
                                if (!event->ctx->task)
-                                       goto fail_free_name;
+                                       goto fail;
 
                                /* look up the path and grab its inode */
                                ret = kern_path(filename, LOOKUP_FOLLOW,
                                                &filter->path);
                                if (ret)
-                                       goto fail_free_name;
-
-                               kfree(filename);
-                               filename = NULL;
+                                       goto fail;
 
                                ret = -EINVAL;
                                if (!filter->path.dentry ||
@@ -9010,13 +9145,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
        if (state != IF_STATE_ACTION)
                goto fail;
 
+       kfree(filename);
        kfree(orig);
 
        return 0;
 
-fail_free_name:
-       kfree(filename);
 fail:
+       kfree(filename);
        free_filters_list(filters);
        kfree(orig);
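
The filename handling now follows the usual single-cleanup-label idiom: kfree(NULL) is a no-op, so both the success path and the fail label can free it unconditionally, and the loop frees any copy left over from a previous iteration before match_strdup() replaces it. A generic userspace sketch of that error-handling shape, with a hypothetical parser and free() in place of kfree():

#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <string.h>

/* Hypothetical parser: one cleanup label, free(NULL) is harmless. */
int parse_tokens(const char *const *tokens, int n)
{
	char *filename = NULL;
	int i, ret;

	for (i = 0; i < n; i++) {
		free(filename);		/* drop the previous iteration's copy */
		filename = strdup(tokens[i]);
		if (!filename) {
			ret = -1;
			goto fail;
		}
		/* ... validation that may 'goto fail' ... */
	}

	free(filename);			/* success path frees it too */
	return 0;

fail:
	free(filename);
	return ret;
}
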
 
@@ -9362,6 +9497,11 @@ static int perf_pmu_nop_int(struct pmu *pmu)
        return 0;
 }
 
+static int perf_event_nop_int(struct perf_event *event, u64 value)
+{
+       return 0;
+}
+
 static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
 
 static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
@@ -9662,6 +9802,9 @@ got_cpu_context:
                pmu->pmu_disable = perf_pmu_nop_void;
        }
 
+       if (!pmu->check_period)
+               pmu->check_period = perf_event_nop_int;
+
        if (!pmu->event_idx)
                pmu->event_idx = perf_event_idx_default;
 
@@ -9951,6 +10094,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 
        init_waitqueue_head(&event->waitq);
+       event->pending_disable = -1;
        init_irq_work(&event->pending, perf_pending_event);
 
        mutex_init(&event->mmap_mutex);
@@ -10053,14 +10197,28 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
                goto err_pmu;
 
        if (has_addr_filter(event)) {
-               event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
-                                                  sizeof(unsigned long),
-                                                  GFP_KERNEL);
-               if (!event->addr_filters_offs) {
+               event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
+                                                   sizeof(struct perf_addr_filter_range),
+                                                   GFP_KERNEL);
+               if (!event->addr_filter_ranges) {
                        err = -ENOMEM;
                        goto err_per_task;
                }
 
+               /*
+                * Clone the parent's vma offsets: they are valid until exec()
+                * even if the mm is not shared with the parent.
+                */
+               if (event->parent) {
+                       struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+
+                       raw_spin_lock_irq(&ifh->lock);
+                       memcpy(event->addr_filter_ranges,
+                              event->parent->addr_filter_ranges,
+                              pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
+                       raw_spin_unlock_irq(&ifh->lock);
+               }
+
                /* force hw sync on the address filters */
                event->addr_filters_gen = 1;
        }
@@ -10079,7 +10237,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
        return event;
 
 err_addr_filters:
-       kfree(event->addr_filters_offs);
+       kfree(event->addr_filter_ranges);
 
 err_per_task:
        exclusive_event_destroy(event);
@@ -10561,11 +10719,6 @@ SYSCALL_DEFINE5(perf_event_open,
                goto err_alloc;
        }
 
-       if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
-               err = -EBUSY;
-               goto err_context;
-       }
-
        /*
         * Look up the group leader (we will attach this event to it):
         */
@@ -10653,6 +10806,18 @@ SYSCALL_DEFINE5(perf_event_open,
                                move_group = 0;
                        }
                }
+
+               /*
+                * Failure to create exclusive events returns -EBUSY.
+                */
+               err = -EBUSY;
+               if (!exclusive_event_installable(group_leader, ctx))
+                       goto err_locked;
+
+               for_each_sibling_event(sibling, group_leader) {
+                       if (!exclusive_event_installable(sibling, ctx))
+                               goto err_locked;
+               }
        } else {
                mutex_lock(&ctx->mutex);
        }
@@ -10689,9 +10854,6 @@ SYSCALL_DEFINE5(perf_event_open,
         * because we need to serialize with concurrent event creation.
         */
        if (!exclusive_event_installable(event, ctx)) {
-               /* exclusive and group stuff are assumed mutually exclusive */
-               WARN_ON_ONCE(move_group);
-
                err = -EBUSY;
                goto err_locked;
        }
@@ -10878,7 +11040,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
                goto err_unlock;
        }
 
-       perf_install_in_context(ctx, event, cpu);
+       perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
        mutex_unlock(&ctx->mutex);
 
@@ -11158,11 +11320,11 @@ static void perf_free_event(struct perf_event *event,
 }
 
 /*
- * Free an unexposed, unused context as created by inheritance by
- * perf_event_init_task below, used by fork() in case of fail.
+ * Free a context as created by inheritance by perf_event_init_task() below,
+ * used by fork() in case of fail.
  *
- * Not all locks are strictly required, but take them anyway to be nice and
- * help out with the lockdep assertions.
+ * Even though the task has never lived, the context and events have been
+ * exposed through the child_list, so we must take care tearing it all down.
  */
 void perf_event_free_task(struct task_struct *task)
 {
@@ -11192,7 +11354,23 @@ void perf_event_free_task(struct task_struct *task)
                        perf_free_event(event, ctx);
 
                mutex_unlock(&ctx->mutex);
-               put_ctx(ctx);
+
+               /*
+                * perf_event_release_kernel() could've stolen some of our
+                * child events and still have them on its free_list. In that
+                * case we must wait for these events to have been freed (in
+                * particular all their references to this task must've been
+                * dropped).
+                *
+                * Without this copy_process() will unconditionally free this
+                * task (irrespective of its reference count) and
+                * _free_event()'s put_task_struct(event->hw.target) will be a
+                * use-after-free.
+                *
+                * Wait for all events to drop their context reference.
+                */
+               wait_var_event(&ctx->refcount, atomic_read(&ctx->refcount) == 1);
+               put_ctx(ctx); /* must be last */
        }
 }
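
The two halves of this fix pair up: the free_list loop in perf_event_release_kernel() earlier in the diff issues smp_mb() plus wake_up_var(&ctx->refcount) after freeing each stolen child, and perf_event_free_task() waits here for the context's refcount to drop back to 1 before the final put_ctx(). A rough userspace analogue of that wait-for-last-reference handshake with a mutex and condition variable; the real wait_var_event()/wake_up_var() hash the variable's address onto a waitqueue instead, and all names here are invented:

#include <pthread.h>

struct ctx {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	int             refcount;
};

/* Releaser side: drop a reference and wake anyone waiting on the count. */
void put_ref(struct ctx *c)
{
	pthread_mutex_lock(&c->lock);
	c->refcount--;
	pthread_cond_broadcast(&c->cond);	/* plays the role of wake_up_var() */
	pthread_mutex_unlock(&c->lock);
}

/* Teardown side: wait until only our own reference remains, then free. */
void wait_last_ref(struct ctx *c)
{
	pthread_mutex_lock(&c->lock);
	while (c->refcount != 1)		/* plays the role of wait_var_event() */
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
	/* now the final put/free cannot race with a concurrent release */
}
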
 
@@ -11282,7 +11460,7 @@ inherit_event(struct perf_event *parent_event,
                                                   GFP_KERNEL);
                if (!child_ctx->task_ctx_data) {
                        free_event(child_event);
-                       return NULL;
+                       return ERR_PTR(-ENOMEM);
                }
        }