* @info: the function call argument
*
* Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
+ * be on the current CPU, which just calls the function directly. This will
+ * retry due to any failures in smp_call_function_single(), such as if the
+ * task_cpu() goes offline concurrently.
*
- * returns: @func return value, or
- * -ESRCH - when the process isn't running
- * -EAGAIN - when the process moved away
+ * returns @func return value or -ESRCH or -ENXIO when the process isn't running
*/
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
};
int ret;
- do {
- ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+ for (;;) {
+ ret = smp_call_function_single(task_cpu(p), remote_function,
+ &data, 1);
if (!ret)
ret = data.ret;
- } while (ret == -EAGAIN);
+
+ if (ret != -EAGAIN)
+ break;
+
+ cond_resched();
+ }
return ret;
}
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-
- if (ret || !write)
- return ret;
-
+ int ret;
+ int perf_cpu = sysctl_perf_cpu_time_max_percent;
/*
* If throttling is disabled don't allow the write:
*/
- if (sysctl_perf_cpu_time_max_percent == 100 ||
- sysctl_perf_cpu_time_max_percent == 0)
+ if (write && (perf_cpu == 100 || perf_cpu == 0))
return -EINVAL;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
update_perf_cpu_limits();
* perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
+ * perf_addr_filters_head::lock
*
* cpu_hotplug_lock
* pmus_lock
event->pmu->del(event, 0);
event->oncpu = -1;
- if (event->pending_disable) {
- event->pending_disable = 0;
+ if (READ_ONCE(event->pending_disable) >= 0) {
+ WRITE_ONCE(event->pending_disable, -1);
state = PERF_EVENT_STATE_OFF;
}
perf_event_set_state(event, state);
if (!ctx->nr_events && ctx->is_active) {
ctx->is_active = 0;
+ ctx->rotate_necessary = 0;
if (ctx->task) {
WARN_ON_ONCE(cpuctx->task_ctx != ctx);
cpuctx->task_ctx = NULL;
void perf_event_disable_inatomic(struct perf_event *event)
{
- event->pending_disable = 1;
+ WRITE_ONCE(event->pending_disable, smp_processor_id());
+ /* can fail, see perf_pending_event_disable() */
irq_work_queue(&event->pending);
}
return ret;
}
+static bool exclusive_event_installable(struct perf_event *event,
+ struct perf_event_context *ctx);
+
/*
* Attach a performance event to a context.
*
lockdep_assert_held(&ctx->mutex);
+ WARN_ON_ONCE(!exclusive_event_installable(event, ctx));
+
if (event->cpu != -1)
event->cpu = cpu;
*
* (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
* we update the addresses of corresponding vmas in
- * event::addr_filters_offs array and bump the event::addr_filters_gen;
+ * event::addr_filter_ranges array and bump the event::addr_filters_gen;
* (p2) when an event is scheduled in (pmu::add), it calls
* perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
* if the generation has changed since the previous call.
if (is_active & EVENT_FLEXIBLE) {
list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
group_sched_out(event, cpuctx, ctx);
+
+ /*
+ * Since we cleared EVENT_FLEXIBLE, also clear
+ * rotate_necessary, is will be reset by
+ * ctx_flexible_sched_in() when needed.
+ */
+ ctx->rotate_necessary = 0;
}
perf_pmu_enable(ctx->pmu);
}
return 0;
if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
- if (!group_sched_in(event, sid->cpuctx, sid->ctx))
- list_add_tail(&event->active_list, &sid->ctx->flexible_active);
- else
+ int ret = group_sched_in(event, sid->cpuctx, sid->ctx);
+ if (ret) {
sid->can_add_hw = 0;
+ sid->ctx->rotate_necessary = 1;
+ return 0;
+ }
+ list_add_tail(&event->active_list, &sid->ctx->flexible_active);
}
return 0;
perf_event_groups_insert(&ctx->flexible_groups, event);
}
+/* pick an event from the flexible_groups to rotate */
static inline struct perf_event *
-ctx_first_active(struct perf_event_context *ctx)
+ctx_event_to_rotate(struct perf_event_context *ctx)
{
- return list_first_entry_or_null(&ctx->flexible_active,
- struct perf_event, active_list);
+ struct perf_event *event;
+
+ /* pick the first active flexible event */
+ event = list_first_entry_or_null(&ctx->flexible_active,
+ struct perf_event, active_list);
+
+ /* if no active flexible event, pick the first event */
+ if (!event) {
+ event = rb_entry_safe(rb_first(&ctx->flexible_groups.tree),
+ typeof(*event), group_node);
+ }
+
+ /*
+ * Unconditionally clear rotate_necessary; if ctx_flexible_sched_in()
+ * finds there are unschedulable events, it will set it again.
+ */
+ ctx->rotate_necessary = 0;
+
+ return event;
}
static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
{
struct perf_event *cpu_event = NULL, *task_event = NULL;
- bool cpu_rotate = false, task_rotate = false;
- struct perf_event_context *ctx = NULL;
+ struct perf_event_context *task_ctx = NULL;
+ int cpu_rotate, task_rotate;
/*
* Since we run this from IRQ context, nobody can install new
* events, thus the event count values are stable.
*/
- if (cpuctx->ctx.nr_events) {
- if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
- cpu_rotate = true;
- }
-
- ctx = cpuctx->task_ctx;
- if (ctx && ctx->nr_events) {
- if (ctx->nr_events != ctx->nr_active)
- task_rotate = true;
- }
+ cpu_rotate = cpuctx->ctx.rotate_necessary;
+ task_ctx = cpuctx->task_ctx;
+ task_rotate = task_ctx ? task_ctx->rotate_necessary : 0;
if (!(cpu_rotate || task_rotate))
return false;
perf_pmu_disable(cpuctx->ctx.pmu);
if (task_rotate)
- task_event = ctx_first_active(ctx);
+ task_event = ctx_event_to_rotate(task_ctx);
if (cpu_rotate)
- cpu_event = ctx_first_active(&cpuctx->ctx);
+ cpu_event = ctx_event_to_rotate(&cpuctx->ctx);
/*
* As per the order given at ctx_resched() first 'pop' task flexible
* and then, if needed CPU flexible.
*/
- if (task_event || (ctx && cpu_event))
- ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
+ if (task_event || (task_ctx && cpu_event))
+ ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE);
if (cpu_event)
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
if (task_event)
- rotate_ctx(ctx, task_event);
+ rotate_ctx(task_ctx, task_event);
if (cpu_event)
rotate_ctx(&cpuctx->ctx, cpu_event);
- perf_event_sched_in(cpuctx, ctx, current);
+ perf_event_sched_in(cpuctx, task_ctx, current);
perf_pmu_enable(cpuctx->ctx.pmu);
perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
+ raw_spin_lock_irqsave(&ctx->lock, flags);
++ctx->pin_count;
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
return ctx;
}
{
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ if (!is_exclusive_pmu(pmu))
return 0;
/*
{
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ if (!is_exclusive_pmu(pmu))
return;
/* see comment in exclusive_event_init() */
return false;
}
-/* Called under the same ctx::mutex as perf_install_in_context() */
static bool exclusive_event_installable(struct perf_event *event,
struct perf_event_context *ctx)
{
struct perf_event *iter_event;
struct pmu *pmu = event->pmu;
- if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
+ lockdep_assert_held(&ctx->mutex);
+
+ if (!is_exclusive_pmu(pmu))
return true;
list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
perf_event_free_bpf_prog(event);
perf_addr_filters_splice(event, NULL);
- kfree(event->addr_filters_offs);
+ kfree(event->addr_filter_ranges);
if (event->destroy)
event->destroy(event);
- if (event->ctx)
- put_ctx(event->ctx);
-
+ /*
+ * Must be after ->destroy(), due to uprobe_perf_close() using
+ * hw.target.
+ */
if (event->hw.target)
put_task_struct(event->hw.target);
+ /*
+ * perf_event_free_task() relies on put_ctx() being 'last', in particular
+ * all task references must be cleaned up.
+ */
+ if (event->ctx)
+ put_ctx(event->ctx);
+
exclusive_event_destroy(event);
module_put(event->pmu->module);
mutex_unlock(&event->child_mutex);
list_for_each_entry_safe(child, tmp, &free_list, child_list) {
+ void *var = &child->ctx->refcount;
+
list_del(&child->child_list);
free_event(child);
+
+ /*
+ * Wake any perf_event_free_task() waiting for this event to be
+ * freed.
+ */
+ smp_mb(); /* pairs with wait_var_event() */
+ wake_up_var(var);
}
no_ctx:
}
}
+static int perf_event_check_period(struct perf_event *event, u64 value)
+{
+ return event->pmu->check_period(event, value);
+}
+
static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
u64 value;
if (event->attr.freq && value > sysctl_perf_event_sample_rate)
return -EINVAL;
+ if (perf_event_check_period(event, value))
+ return -EINVAL;
+
+ if (!event->attr.freq && (value & (1ULL << 63)))
+ return -EINVAL;
+
event_function_call(event, __perf_event_period, &value);
return 0;
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
-
struct ring_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
+ bool detach_rest = false;
if (event->pmu->event_unmapped)
event->pmu->event_unmapped(event, vma->vm_mm);
mutex_unlock(&event->mmap_mutex);
}
- atomic_dec(&rb->mmap_count);
+ if (atomic_dec_and_test(&rb->mmap_count))
+ detach_rest = true;
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
goto out_put;
mutex_unlock(&event->mmap_mutex);
/* If there's still other mmap()s of this buffer, we're done. */
- if (atomic_read(&rb->mmap_count))
+ if (!detach_rest)
goto out_put;
/*
*/
user_lock_limit *= num_online_cpus();
- user_locked = atomic_long_read(&user->locked_vm) + user_extra;
+ user_locked = atomic_long_read(&user->locked_vm);
+
+ /*
+ * sysctl_perf_event_mlock may have changed, so that
+ * user->locked_vm > user_lock_limit
+ */
+ if (user_locked > user_lock_limit)
+ user_locked = user_lock_limit;
+ user_locked += user_extra;
if (user_locked > user_lock_limit)
extra = user_locked - user_lock_limit;
}
}
+static void perf_pending_event_disable(struct perf_event *event)
+{
+ int cpu = READ_ONCE(event->pending_disable);
+
+ if (cpu < 0)
+ return;
+
+ if (cpu == smp_processor_id()) {
+ WRITE_ONCE(event->pending_disable, -1);
+ perf_event_disable_local(event);
+ return;
+ }
+
+ /*
+ * CPU-A CPU-B
+ *
+ * perf_event_disable_inatomic()
+ * @pending_disable = CPU-A;
+ * irq_work_queue();
+ *
+ * sched-out
+ * @pending_disable = -1;
+ *
+ * sched-in
+ * perf_event_disable_inatomic()
+ * @pending_disable = CPU-B;
+ * irq_work_queue(); // FAILS
+ *
+ * irq_work_run()
+ * perf_pending_event()
+ *
+ * But the event runs on CPU-B and wants disabling there.
+ */
+ irq_work_queue_on(&event->pending, cpu);
+}
+
static void perf_pending_event(struct irq_work *entry)
{
- struct perf_event *event = container_of(entry,
- struct perf_event, pending);
+ struct perf_event *event = container_of(entry, struct perf_event, pending);
int rctx;
rctx = perf_swevent_get_recursion_context();
* and we won't recurse 'further'.
*/
- if (event->pending_disable) {
- event->pending_disable = 0;
- perf_event_disable_local(event);
- }
+ perf_pending_event_disable(event);
if (event->pending_wakeup) {
event->pending_wakeup = 0;
if (user_mode(regs)) {
regs_user->abi = perf_reg_abi(current);
regs_user->regs = regs;
- } else if (current->mm) {
+ } else if (!(current->flags & PF_KTHREAD)) {
perf_get_regs_user(regs_user, regs, regs_user_copy);
} else {
regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
* Try IRQ-safe __get_user_pages_fast first.
* If failed, leave phys_addr as 0.
*/
- if ((current->mm != NULL) &&
- (__get_user_pages_fast(virt, 1, 0, &p) == 1))
- phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ if (current->mm != NULL) {
+ pagefault_disable();
+ if (__get_user_pages_fast(virt, 1, 0, &p) == 1)
+ phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ pagefault_enable();
+ }
if (p)
put_page(p);
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
if (filter->path.dentry) {
- event->addr_filters_offs[count] = 0;
+ event->addr_filter_ranges[count].start = 0;
+ event->addr_filter_ranges[count].size = 0;
restart++;
}
static int __perf_pmu_output_stop(void *info)
{
struct perf_event *event = info;
- struct pmu *pmu = event->pmu;
+ struct pmu *pmu = event->ctx->pmu;
struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
struct remote_output ro = {
.rb = event->rb,
goto out;
task_event->event_id.pid = perf_event_pid(event, task);
- task_event->event_id.ppid = perf_event_pid(event, current);
-
task_event->event_id.tid = perf_event_tid(event, task);
- task_event->event_id.ptid = perf_event_tid(event, current);
+
+ if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
+ task_event->event_id.ppid = perf_event_pid(event,
+ task->real_parent);
+ task_event->event_id.ptid = perf_event_pid(event,
+ task->real_parent);
+ } else { /* PERF_RECORD_FORK */
+ task_event->event_id.ppid = perf_event_pid(event, current);
+ task_event->event_id.ptid = perf_event_tid(event, current);
+ }
task_event->event_id.time = perf_event_clock(event);
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
+ u32 type = mmap_event->event_id.header.type;
int ret;
if (!perf_event_mmap_match(event, data))
perf_output_end(&handle);
out:
mmap_event->event_id.header.size = size;
+ mmap_event->event_id.header.type = type;
}
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
return true;
}
+static bool perf_addr_filter_vma_adjust(struct perf_addr_filter *filter,
+ struct vm_area_struct *vma,
+ struct perf_addr_filter_range *fr)
+{
+ unsigned long vma_size = vma->vm_end - vma->vm_start;
+ unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
+ struct file *file = vma->vm_file;
+
+ if (!perf_addr_filter_match(filter, file, off, vma_size))
+ return false;
+
+ if (filter->offset < off) {
+ fr->start = vma->vm_start;
+ fr->size = min(vma_size, filter->size - (off - filter->offset));
+ } else {
+ fr->start = vma->vm_start + filter->offset - off;
+ fr->size = min(vma->vm_end - fr->start, filter->size);
+ }
+
+ return true;
+}
+
static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
{
struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
struct vm_area_struct *vma = data;
- unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
- struct file *file = vma->vm_file;
struct perf_addr_filter *filter;
unsigned int restart = 0, count = 0;
+ unsigned long flags;
if (!has_addr_filter(event))
return;
- if (!file)
+ if (!vma->vm_file)
return;
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- if (perf_addr_filter_match(filter, file, off,
- vma->vm_end - vma->vm_start)) {
- event->addr_filters_offs[count] = vma->vm_start;
+ if (perf_addr_filter_vma_adjust(filter, vma,
+ &event->addr_filter_ranges[count]))
restart++;
- }
count++;
}
* @filter; if so, adjust filter's address range.
* Called with mm::mmap_sem down for reading.
*/
-static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
- struct mm_struct *mm)
+static void perf_addr_filter_apply(struct perf_addr_filter *filter,
+ struct mm_struct *mm,
+ struct perf_addr_filter_range *fr)
{
struct vm_area_struct *vma;
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- struct file *file = vma->vm_file;
- unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
- unsigned long vma_size = vma->vm_end - vma->vm_start;
-
- if (!file)
- continue;
-
- if (!perf_addr_filter_match(filter, file, off, vma_size))
+ if (!vma->vm_file)
continue;
- return vma->vm_start;
+ if (perf_addr_filter_vma_adjust(filter, vma, fr))
+ return;
}
-
- return 0;
}
/*
if (task == TASK_TOMBSTONE)
return;
- if (!ifh->nr_file_filters)
- return;
-
- mm = get_task_mm(event->ctx->task);
- if (!mm)
- goto restart;
+ if (ifh->nr_file_filters) {
+ mm = get_task_mm(event->ctx->task);
+ if (!mm)
+ goto restart;
- down_read(&mm->mmap_sem);
+ down_read(&mm->mmap_sem);
+ }
raw_spin_lock_irqsave(&ifh->lock, flags);
list_for_each_entry(filter, &ifh->list, entry) {
- event->addr_filters_offs[count] = 0;
+ if (filter->path.dentry) {
+ /*
+ * Adjust base offset if the filter is associated to a
+ * binary that needs to be mapped:
+ */
+ event->addr_filter_ranges[count].start = 0;
+ event->addr_filter_ranges[count].size = 0;
- /*
- * Adjust base offset if the filter is associated to a binary
- * that needs to be mapped:
- */
- if (filter->path.dentry)
- event->addr_filters_offs[count] =
- perf_addr_filter_apply(filter, mm);
+ perf_addr_filter_apply(filter, mm, &event->addr_filter_ranges[count]);
+ } else {
+ event->addr_filter_ranges[count].start = filter->offset;
+ event->addr_filter_ranges[count].size = filter->size;
+ }
count++;
}
event->addr_filters_gen++;
raw_spin_unlock_irqrestore(&ifh->lock, flags);
- up_read(&mm->mmap_sem);
+ if (ifh->nr_file_filters) {
+ up_read(&mm->mmap_sem);
- mmput(mm);
+ mmput(mm);
+ }
restart:
perf_event_stop(event, 1);
if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
int fpos = token == IF_SRC_FILE ? 2 : 1;
+ kfree(filename);
filename = match_strdup(&args[fpos]);
if (!filename) {
ret = -ENOMEM;
*/
ret = -EOPNOTSUPP;
if (!event->ctx->task)
- goto fail_free_name;
+ goto fail;
/* look up the path and grab its inode */
ret = kern_path(filename, LOOKUP_FOLLOW,
&filter->path);
if (ret)
- goto fail_free_name;
-
- kfree(filename);
- filename = NULL;
+ goto fail;
ret = -EINVAL;
if (!filter->path.dentry ||
if (state != IF_STATE_ACTION)
goto fail;
+ kfree(filename);
kfree(orig);
return 0;
-fail_free_name:
- kfree(filename);
fail:
+ kfree(filename);
free_filters_list(filters);
kfree(orig);
return 0;
}
+static int perf_event_nop_int(struct perf_event *event, u64 value)
+{
+ return 0;
+}
+
static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
pmu->pmu_disable = perf_pmu_nop_void;
}
+ if (!pmu->check_period)
+ pmu->check_period = perf_event_nop_int;
+
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
init_waitqueue_head(&event->waitq);
+ event->pending_disable = -1;
init_irq_work(&event->pending, perf_pending_event);
mutex_init(&event->mmap_mutex);
goto err_pmu;
if (has_addr_filter(event)) {
- event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
- sizeof(unsigned long),
- GFP_KERNEL);
- if (!event->addr_filters_offs) {
+ event->addr_filter_ranges = kcalloc(pmu->nr_addr_filters,
+ sizeof(struct perf_addr_filter_range),
+ GFP_KERNEL);
+ if (!event->addr_filter_ranges) {
err = -ENOMEM;
goto err_per_task;
}
+ /*
+ * Clone the parent's vma offsets: they are valid until exec()
+ * even if the mm is not shared with the parent.
+ */
+ if (event->parent) {
+ struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
+
+ raw_spin_lock_irq(&ifh->lock);
+ memcpy(event->addr_filter_ranges,
+ event->parent->addr_filter_ranges,
+ pmu->nr_addr_filters * sizeof(struct perf_addr_filter_range));
+ raw_spin_unlock_irq(&ifh->lock);
+ }
+
/* force hw sync on the address filters */
event->addr_filters_gen = 1;
}
return event;
err_addr_filters:
- kfree(event->addr_filters_offs);
+ kfree(event->addr_filter_ranges);
err_per_task:
exclusive_event_destroy(event);
goto err_alloc;
}
- if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
- err = -EBUSY;
- goto err_context;
- }
-
/*
* Look up the group leader (we will attach this event to it):
*/
move_group = 0;
}
}
+
+ /*
+ * Failure to create exclusive events returns -EBUSY.
+ */
+ err = -EBUSY;
+ if (!exclusive_event_installable(group_leader, ctx))
+ goto err_locked;
+
+ for_each_sibling_event(sibling, group_leader) {
+ if (!exclusive_event_installable(sibling, ctx))
+ goto err_locked;
+ }
} else {
mutex_lock(&ctx->mutex);
}
* because we need to serialize with concurrent event creation.
*/
if (!exclusive_event_installable(event, ctx)) {
- /* exclusive and group stuff are assumed mutually exclusive */
- WARN_ON_ONCE(move_group);
-
err = -EBUSY;
goto err_locked;
}
goto err_unlock;
}
- perf_install_in_context(ctx, event, cpu);
+ perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
}
/*
- * Free an unexposed, unused context as created by inheritance by
- * perf_event_init_task below, used by fork() in case of fail.
+ * Free a context as created by inheritance by perf_event_init_task() below,
+ * used by fork() in case of fail.
*
- * Not all locks are strictly required, but take them anyway to be nice and
- * help out with the lockdep assertions.
+ * Even though the task has never lived, the context and events have been
+ * exposed through the child_list, so we must take care tearing it all down.
*/
void perf_event_free_task(struct task_struct *task)
{
perf_free_event(event, ctx);
mutex_unlock(&ctx->mutex);
- put_ctx(ctx);
+
+ /*
+ * perf_event_release_kernel() could've stolen some of our
+ * child events and still have them on its free_list. In that
+ * case we must wait for these events to have been freed (in
+ * particular all their references to this task must've been
+ * dropped).
+ *
+ * Without this copy_process() will unconditionally free this
+ * task (irrespective of its reference count) and
+ * _free_event()'s put_task_struct(event->hw.target) will be a
+ * use-after-free.
+ *
+ * Wait for all events to drop their context reference.
+ */
+ wait_var_event(&ctx->refcount, atomic_read(&ctx->refcount) == 1);
+ put_ctx(ctx); /* must be last */
}
}
GFP_KERNEL);
if (!child_ctx->task_ctx_data) {
free_event(child_event);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
}