struct static_key_deferred perf_sched_events __read_mostly;
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
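+/* Set while the CPU is in the idle loop (see event_idle_notif()); read by perf_event_read(). */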
+static DEFINE_PER_CPU(bool, is_idle);
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
* 0 - disallow raw tracepoint access for unpriv
* 1 - disallow cpu events for unpriv
* 2 - disallow kernel profiling for unpriv
+ * 3 - disallow all unpriv perf event use
*/
+#ifdef CONFIG_PERF_EVENTS_USERMODE
+int sysctl_perf_event_paranoid __read_mostly = -1;
+#elif defined CONFIG_SECURITY_PERF_EVENTS_RESTRICT
+int sysctl_perf_event_paranoid __read_mostly = 3;
+#else
int sysctl_perf_event_paranoid __read_mostly = 1;
+#endif
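+/*
+ * Level 3 is enforced in perf_event_open() via a perf_paranoid_any() check;
+ * that helper is not part of this hunk and is presumably added next to
+ * perf_paranoid_cpu()/perf_paranoid_kernel() in <linux/perf_event.h>, e.g.:
+ *
+ *	static inline bool perf_paranoid_any(void)
+ *	{
+ *		return sysctl_perf_event_paranoid > 2;
+ *	}
+ */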
/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
* If this was a group event with sibling events then
* upgrade the siblings to singleton events by adding them
* to whatever list we are on.
+ * If this isn't on a list, make sure we still remove the sibling's
+ * group_entry from this sibling_list; otherwise, when that sibling
+ * is later deallocated, it will try to remove itself from this
+ * sibling_list, which may well have been deallocated already,
+ * resulting in a use-after-free.
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
if (list)
list_move_tail(&sibling->group_entry, list);
+ else
+ list_del_init(&sibling->group_entry);
sibling->group_leader = sibling;
/* Inherit group flags from the previous leader */
}
-/*
+#ifdef CONFIG_SMP
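+/*
+ * The target CPU went offline before the cross-call could run; bring it back
+ * online and retry __perf_remove_from_context() so the event is detached on
+ * its own CPU.
+ */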
+static void perf_retry_remove(struct perf_event *event,
+ struct remove_event *rep)
+{
+ int up_ret;
+ /*
+ * CPU was offline. Bring it online so we can
+ * gracefully exit a perf context.
+ */
+ up_ret = cpu_up(event->cpu);
+ if (!up_ret)
+ /* Try the remove call once again. */
+ cpu_function_call(event->cpu, __perf_remove_from_context,
+ rep);
+ else
+ pr_err("Failed to bring up CPU: %d, ret: %d\n",
+ event->cpu, up_ret);
+}
+#else
+static void perf_retry_remove(struct perf_event *event,
+ struct remove_event *rep)
+{
+}
+#endif
+
+/*
* Remove the event from a task's (or a CPU's) list of events.
*
* CPU events are removed with a smp call. For task events we only
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void __ref perf_remove_from_context(struct perf_event *event,
+ bool detach_group)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
.event = event,
.detach_group = detach_group,
};
+ int ret;
lockdep_assert_held(&ctx->mutex);
* already called __perf_remove_from_context from
* perf_event_exit_cpu.
*/
- cpu_function_call(event->cpu, __perf_remove_from_context, &re);
+ ret = cpu_function_call(event->cpu, __perf_remove_from_context,
+ &re);
+ if (ret == -ENXIO)
+ perf_retry_remove(event, &re);
+
return;
}
if (event->state <= PERF_EVENT_STATE_OFF)
return 0;
- event->state = PERF_EVENT_STATE_ACTIVE;
- event->oncpu = smp_processor_id();
+ WRITE_ONCE(event->oncpu, smp_processor_id());
+ /*
+ * Order event::oncpu write to happen before the ACTIVE state
+ * is visible.
+ */
+ smp_wmb();
+ WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
/*
* Unthrottle events, since we scheduled we might have missed several
}
EXPORT_SYMBOL_GPL(perf_event_enable);
+static int __perf_event_stop(void *info)
+{
+ struct perf_event *event = info;
+
+ /* for AUX events, our job is done if the event is already inactive */
+ if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
+ return 0;
+
+ /* matches smp_wmb() in event_sched_in() */
+ smp_rmb();
+
+ /*
+ * There is a window with interrupts enabled before we get here,
+ * so we need to check again lest we try to stop another CPU's event.
+ */
+ if (READ_ONCE(event->oncpu) != smp_processor_id())
+ return -EAGAIN;
+
+ event->pmu->stop(event, PERF_EF_UPDATE);
+
+ return 0;
+}
+
static int _perf_event_refresh(struct perf_event *event, int refresh)
{
/*
static int perf_event_read(struct perf_event *event, bool group)
{
- int ret = 0;
+ int event_cpu, ret = 0;
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE) {
+ event_cpu = READ_ONCE(event->oncpu);
+
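+	/*
+	 * Skip the cross-CPU read when the event's CPU is isolated, or when
+	 * the event excludes idle and that CPU is currently idle, so those
+	 * CPUs are not disturbed by the IPI. (cpu_isolated() is assumed to
+	 * come from the vendor CPU-isolation support.)
+	 */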
+ if (event->state == PERF_EVENT_STATE_ACTIVE &&
+ !cpu_isolated(event_cpu)) {
struct perf_read_data data = {
.event = event,
.group = group,
.ret = 0,
};
- smp_call_function_single(event->oncpu,
- __perf_event_read, &data, 1);
- ret = data.ret;
+
+ if ((unsigned int)event_cpu >= nr_cpu_ids)
+ return 0;
+ if (!event->attr.exclude_idle ||
+ !per_cpu(is_idle, event_cpu)) {
+ smp_call_function_single(event_cpu,
+ __perf_event_read, &data, 1);
+ ret = data.ret;
+ }
} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
unsigned long flags;
if (!task) {
/* Must be root to operate on a CPU event: */
- if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ if (event->owner != EVENT_OWNER_KERNEL && perf_paranoid_cpu() &&
+ !capable(CAP_SYS_ADMIN))
return ERR_PTR(-EACCES);
/*
if (event->destroy)
event->destroy(event);
+ if (event->pmu->free_drv_configs)
+ event->pmu->free_drv_configs(event);
+
if (event->ctx)
put_ctx(event->ctx);
*/
static int perf_release(struct inode *inode, struct file *file)
{
+ struct perf_event *event = file->private_data;
+
+ /*
+ * Event can be in state OFF because of a constraint check.
+ * Change to ACTIVE so that it gets cleaned up correctly.
+ */
+ if ((event->state == PERF_EVENT_STATE_OFF) &&
+ event->attr.constraint_duplicate)
+ event->state = PERF_EVENT_STATE_ACTIVE;
put_event(file->private_data);
return 0;
}
struct perf_event *output_event);
static int perf_event_set_filter(struct perf_event *event, void __user *arg);
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
+static int perf_event_drv_configs(struct perf_event *event,
+ void __user *arg);
static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
{
case PERF_EVENT_IOC_SET_BPF:
return perf_event_set_bpf_prog(event, arg);
+ case PERF_EVENT_IOC_SET_DRV_CONFIGS:
+ return perf_event_drv_configs(event, (void __user *)arg);
+
default:
return -ENOTTY;
}
switch (_IOC_NR(cmd)) {
case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
case _IOC_NR(PERF_EVENT_IOC_ID):
+ case _IOC_NR(PERF_EVENT_IOC_SET_DRV_CONFIGS):
/* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
cmd &= ~IOCSIZE_MASK;
event->pmu->event_mapped(event);
}
+static void perf_pmu_output_stop(struct perf_event *event);
+
/*
* A buffer can be mmap()ed multiple times; either directly through the same
* event, or through other events by use of perf_event_set_output().
*/
if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
+ /*
+ * Stop all AUX events that are writing to this buffer,
+ * so that we can free its AUX pages and corresponding PMU
+ * data. Note that after rb::aux_mmap_count dropped to zero,
+ * they won't start any more (see perf_aux_output_begin()).
+ */
+ perf_pmu_output_stop(event);
+
+ /* now it's safe to free the pages */
atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
+ /* this has to be the last one */
rb_free_aux(rb);
+ WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
+
mutex_unlock(&event->mmap_mutex);
}
rcu_read_unlock();
}
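+/*
+ * Carries the AUX ring buffer being torn down and the resulting error code
+ * between perf_pmu_output_stop() and the callbacks it runs on remote CPUs.
+ */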
+struct remote_output {
+ struct ring_buffer *rb;
+ int err;
+};
+
+static void __perf_event_output_stop(struct perf_event *event, void *data)
+{
+ struct perf_event *parent = event->parent;
+ struct remote_output *ro = data;
+ struct ring_buffer *rb = ro->rb;
+
+ if (!has_aux(event))
+ return;
+
+ if (!parent)
+ parent = event;
+
+ /*
+ * In case of inheritance, it will be the parent that links to the
+ * ring-buffer, but it will be the child that's actually using it:
+ */
+ if (rcu_dereference(parent->rb) == rb)
+ ro->err = __perf_event_stop(event);
+}
+
+static int __perf_pmu_output_stop(void *info)
+{
+ struct perf_event *event = info;
+ struct pmu *pmu = event->pmu;
+ struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ struct remote_output ro = {
+ .rb = event->rb,
+ };
+
+ rcu_read_lock();
+ perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro);
+ if (cpuctx->task_ctx)
+ perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop,
+ &ro);
+ rcu_read_unlock();
+
+ return ro.err;
+}
+
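+/*
+ * Stop every event that is writing to this event's AUX ring buffer.
+ * __perf_event_stop() returns -EAGAIN when the target event has moved to
+ * another CPU by the time the IPI runs, in which case the scan is restarted.
+ */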
+static void perf_pmu_output_stop(struct perf_event *event)
+{
+ struct perf_event *iter;
+ int err, cpu;
+
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
+ /*
+ * For per-CPU events, we need to make sure that neither they
+ * nor their children are running; for cpu==-1 events it's
+ * sufficient to stop the event itself if it's active, since
+ * it can't have children.
+ */
+ cpu = iter->cpu;
+ if (cpu == -1)
+ cpu = READ_ONCE(iter->oncpu);
+
+ if (cpu == -1)
+ continue;
+
+ err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
+ if (err == -EAGAIN) {
+ rcu_read_unlock();
+ goto restart;
+ }
+ }
+ rcu_read_unlock();
+}
+
/*
* task tracking -- fork/exit
*
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
+
+ .events_across_hotplug = 1,
};
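+/*
+ * events_across_hotplug is assumed to be a new flag in struct pmu (declared
+ * elsewhere in this series); PMUs that set it keep their events installed
+ * when their CPU goes offline, see perf_event_exit_cpu_context().
+ */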
#ifdef CONFIG_EVENT_TRACING
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
+
+ .events_across_hotplug = 1,
};
static inline void perf_tp_register(void)
}
#endif
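+/*
+ * Hand PERF_EVENT_IOC_SET_DRV_CONFIGS straight to the PMU. get_drv_configs()
+ * and the matching free_drv_configs() used on the teardown path are assumed
+ * to be new optional struct pmu callbacks added elsewhere in this series for
+ * PMUs that accept driver-specific configuration (e.g. CoreSight).
+ */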
+static int perf_event_drv_configs(struct perf_event *event,
+ void __user *arg)
+{
+ if (!event->pmu->get_drv_configs)
+ return -EINVAL;
+
+ return event->pmu->get_drv_configs(event, arg);
+}
+
/*
* hrtimer based swevent callback
*/
.start = cpu_clock_event_start,
.stop = cpu_clock_event_stop,
.read = cpu_clock_event_read,
+
+ .events_across_hotplug = 1,
};
/*
.start = task_clock_event_start,
.stop = task_clock_event_stop,
.read = task_clock_event_read,
+
+ .events_across_hotplug = 1,
};
static void perf_pmu_nop_void(struct pmu *pmu)
if (!group_leader)
group_leader = event;
+ mutex_init(&event->group_leader_mutex);
mutex_init(&event->child_mutex);
INIT_LIST_HEAD(&event->child_list);
INIT_LIST_HEAD(&event->sibling_list);
INIT_LIST_HEAD(&event->rb_entry);
INIT_LIST_HEAD(&event->active_entry);
+ INIT_LIST_HEAD(&event->drv_configs);
INIT_HLIST_NODE(&event->hlist_entry);
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
+ if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
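+	/*
+	 * constraint_duplicate is assumed to be a kernel-internal attr bit
+	 * (consumed in perf_release()); like __reserved_1, reject it if
+	 * userspace tries to set it.
+	 */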
+ if (attr.constraint_duplicate || attr.__reserved_1)
+ return -EINVAL;
+
if (!attr.exclude_kernel) {
if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
return -EACCES;
group_leader = NULL;
}
+ /*
+ * Take the group_leader's group_leader_mutex before observing
+ * anything in the group leader that leads to changes in ctx,
+ * many of which may be changing on another thread.
+ * In particular, we want to take this lock before deciding
+ * whether we need to move_group.
+ */
+ if (group_leader)
+ mutex_lock(&group_leader->group_leader_mutex);
+
if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
task = find_lively_task_by_vpid(pid);
if (IS_ERR(task)) {
if (move_group)
perf_event_ctx_unlock(group_leader, gctx);
mutex_unlock(&ctx->mutex);
+ if (group_leader)
+ mutex_unlock(&group_leader->group_leader_mutex);
if (task) {
mutex_unlock(&task->signal->cred_guard_mutex);
if (task)
put_task_struct(task);
err_group_fd:
+ if (group_leader)
+ mutex_unlock(&group_leader->group_leader_mutex);
fdput(group);
err_fd:
put_unused_fd(event_fd);
rcu_read_unlock();
}
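+/*
+ * Runs on the CPU going offline: stop the software cpu-clock events in this
+ * context so their hrtimers are cancelled before the CPU goes down.
+ */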
+static void __perf_event_stop_swclock(void *__info)
+{
+ struct perf_event_context *ctx = __info;
+ struct perf_event *event, *tmp;
+
+ list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
+ if (event->attr.config == PERF_COUNT_SW_CPU_CLOCK &&
+ event->attr.type == PERF_TYPE_SOFTWARE)
+ cpu_clock_event_stop(event, 0);
+ }
+}
+
static void perf_event_exit_cpu_context(int cpu)
{
+ struct perf_cpu_context *cpuctx;
struct perf_event_context *ctx;
+ unsigned long flags;
struct pmu *pmu;
int idx;
idx = srcu_read_lock(&pmus_srcu);
list_for_each_entry_rcu(pmu, &pmus, entry) {
- ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+ cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+ ctx = &cpuctx->ctx;
+
+ /* Cancel the mux hrtimer to avoid CPU migration */
+ if (pmu->task_ctx_nr != perf_sw_context) {
+ raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+ hrtimer_cancel(&cpuctx->hrtimer);
+ cpuctx->hrtimer_active = 0;
+ raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock,
+ flags);
+ }
mutex_lock(&ctx->mutex);
- smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
+ /*
+ * If the PMU supports keeping events across hotplug, do not
+ * remove the events from the context; keep the context alive so
+ * they survive CPU hotplug. The context is torn down via the fd
+ * close path once userspace is done and the target CPU is back
+ * online. If a software clock event is active, stop the hrtimer
+ * associated with it here and restart it when the CPU comes back
+ * online (see perf_event_start_swclock()).
+ */
+ if (!pmu->events_across_hotplug)
+ smp_call_function_single(cpu, __perf_event_exit_context,
+ ctx, 1);
+ else
+ smp_call_function_single(cpu, __perf_event_stop_swclock,
+ ctx, 1);
mutex_unlock(&ctx->mutex);
}
srcu_read_unlock(&pmus_srcu, idx);
}
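+/*
+ * Counterpart of __perf_event_stop_swclock(): when the CPU comes back online
+ * (CPU_STARTING), restart the software cpu-clock events left in its context
+ * so their hrtimers run on this CPU again.
+ */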
+static void perf_event_start_swclock(int cpu)
+{
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+ int idx;
+ struct perf_event *event, *tmp;
+
+ idx = srcu_read_lock(&pmus_srcu);
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ if (pmu->events_across_hotplug) {
+ ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
+ list_for_each_entry_safe(event, tmp, &ctx->event_list,
+ event_entry) {
+ if (event->attr.config ==
+ PERF_COUNT_SW_CPU_CLOCK &&
+ event->attr.type == PERF_TYPE_SOFTWARE)
+ cpu_clock_event_start(event, 0);
+ }
+ }
+ }
+ srcu_read_unlock(&pmus_srcu, idx);
+}
+
static void perf_event_exit_cpu(int cpu)
{
perf_event_exit_cpu_context(cpu);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
+static inline void perf_event_start_swclock(int cpu) { }
#endif
static int
case CPU_DOWN_PREPARE:
perf_event_exit_cpu(cpu);
break;
+
+ case CPU_STARTING:
+ perf_event_start_swclock(cpu);
+ break;
+
default:
break;
}
return NOTIFY_OK;
}
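+/*
+ * Track per-CPU idle state so perf_event_read() can avoid sending IPIs to
+ * idle CPUs on behalf of events that set attr.exclude_idle.
+ */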
+static int event_idle_notif(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ switch (action) {
+ case IDLE_START:
+ __this_cpu_write(is_idle, true);
+ break;
+ case IDLE_END:
+ __this_cpu_write(is_idle, false);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block perf_event_idle_nb = {
+ .notifier_call = event_idle_notif,
+};
+
void __init perf_event_init(void)
{
int ret;
perf_pmu_register(&perf_task_clock, NULL, -1);
perf_tp_register();
perf_cpu_notifier(perf_cpu_notify);
+ idle_notifier_register(&perf_event_idle_nb);
register_reboot_notifier(&perf_reboot_notifier);
ret = init_hw_breakpoint();