
Merge tag 'v4.16-rc6' into perf/core, to pick up fixes
author Ingo Molnar <mingo@kernel.org>
Mon, 19 Mar 2018 19:37:35 +0000 (20:37 +0100)
committer Ingo Molnar <mingo@kernel.org>
Mon, 19 Mar 2018 19:37:35 +0000 (20:37 +0100)
Signed-off-by: Ingo Molnar <mingo@kernel.org>
drivers/perf/arm_pmu.c
kernel/events/core.c

diff --combined drivers/perf/arm_pmu.c
@@@ -311,7 -311,7 +311,7 @@@ validate_group(struct perf_event *event
        if (!validate_event(event->pmu, &fake_pmu, leader))
                return -EINVAL;
  
 -      list_for_each_entry(sibling, &leader->sibling_list, group_entry) {
 +      for_each_sibling_event(sibling, leader) {
                if (!validate_event(event->pmu, &fake_pmu, sibling))
                        return -EINVAL;
        }
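
For reference, the open-coded sibling_list walks that this merge replaces all funnel through the new for_each_sibling_event() iterator. In mainline include/linux/perf_event.h it is defined as below; treat the exact form at this point in the series as an assumption:

#define for_each_sibling_event(sibling, event)			\
	if ((event)->group_leader == (event))			\
		list_for_each_entry((sibling), &(event)->sibling_list, sibling_list)

The leading if () restricts the walk to group leaders, so iterating over a non-leader sibling is a no-op now that sibling_list doubles as both list head and list entry.
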
@@@ -638,7 -638,7 +638,7 @@@ static int arm_perf_teardown_cpu(unsign
                if (irq_is_percpu_devid(irq))
                        disable_percpu_irq(irq);
                else
-                       disable_irq(irq);
+                       disable_irq_nosync(irq);
        }
  
        per_cpu(cpu_armpmu, cpu) = NULL;
diff --combined kernel/events/core.c
@@@ -430,7 -430,7 +430,7 @@@ static void update_perf_cpu_limits(void
        WRITE_ONCE(perf_sample_allowed_ns, tmp);
  }
  
 -static int perf_rotate_context(struct perf_cpu_context *cpuctx);
 +static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
  
  int perf_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
@@@ -643,7 -643,7 +643,7 @@@ static void perf_event_update_sibling_t
  {
        struct perf_event *sibling;
  
 -      list_for_each_entry(sibling, &leader->sibling_list, group_entry)
 +      for_each_sibling_event(sibling, leader)
                perf_event_update_time(sibling);
  }
  
@@@ -937,39 -937,27 +937,39 @@@ list_update_cgroup_event(struct perf_ev
        if (!is_cgroup_event(event))
                return;
  
 -      if (add && ctx->nr_cgroups++)
 -              return;
 -      else if (!add && --ctx->nr_cgroups)
 -              return;
        /*
         * Because cgroup events are always per-cpu events,
         * this will always be called from the right CPU.
         */
        cpuctx = __get_cpu_context(ctx);
 -      cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
 -      /* cpuctx->cgrp is NULL unless a cgroup event is active in this CPU .*/
 -      if (add) {
 +
 +      /*
 +       * Since setting cpuctx->cgrp is conditional on the current @cgrp
 +       * matching the event's cgroup, we must do this for every new event,
 +       * because if the first would mismatch, the second would not try again
 +       * and we would leave cpuctx->cgrp unset.
 +       */
 +      if (add && !cpuctx->cgrp) {
                struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
  
 -              list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
                if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup))
                        cpuctx->cgrp = cgrp;
 -      } else {
 -              list_del(cpuctx_entry);
 -              cpuctx->cgrp = NULL;
        }
 +
 +      if (add && ctx->nr_cgroups++)
 +              return;
 +      else if (!add && --ctx->nr_cgroups)
 +              return;
 +
 +      /* no cgroup running */
 +      if (!add)
 +              cpuctx->cgrp = NULL;
 +
 +      cpuctx_entry = &cpuctx->cgrp_cpuctx_entry;
 +      if (add)
 +              list_add(cpuctx_entry, this_cpu_ptr(&cgrp_cpuctx_list));
 +      else
 +              list_del(cpuctx_entry);
  }
  
  #else /* !CONFIG_CGROUP_PERF */
@@@ -1053,7 -1041,7 +1053,7 @@@ list_update_cgroup_event(struct perf_ev
  static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
  {
        struct perf_cpu_context *cpuctx;
 -      int rotations = 0;
 +      bool rotations;
  
        lockdep_assert_irqs_disabled();
  
@@@ -1472,21 -1460,8 +1472,21 @@@ static enum event_type_t get_event_type
        return event_type;
  }
  
 -static struct list_head *
 -ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 +/*
 + * Helper function to initialize event group nodes.
 + */
 +static void init_event_group(struct perf_event *event)
 +{
 +      RB_CLEAR_NODE(&event->group_node);
 +      event->group_index = 0;
 +}
 +
 +/*
 + * Extract pinned or flexible groups from the context
 + * based on event attrs bits.
 + */
 +static struct perf_event_groups *
 +get_event_groups(struct perf_event *event, struct perf_event_context *ctx)
  {
        if (event->attr.pinned)
                return &ctx->pinned_groups;
  }
  
  /*
 + * Helper function to initialize perf_event_group trees.
 + */
 +static void perf_event_groups_init(struct perf_event_groups *groups)
 +{
 +      groups->tree = RB_ROOT;
 +      groups->index = 0;
 +}
 +
 +/*
 + * Compare function for event groups;
 + *
 + * Implements complex key that first sorts by CPU and then by virtual index
 + * which provides ordering when rotating groups for the same CPU.
 + */
 +static bool
 +perf_event_groups_less(struct perf_event *left, struct perf_event *right)
 +{
 +      if (left->cpu < right->cpu)
 +              return true;
 +      if (left->cpu > right->cpu)
 +              return false;
 +
 +      if (left->group_index < right->group_index)
 +              return true;
 +      if (left->group_index > right->group_index)
 +              return false;
 +
 +      return false;
 +}
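
A worked example of the comparator, with hypothetical key values (illustration only, not part of the patch):

/*
 * A = { .cpu = 0, .group_index = 3 }
 * B = { .cpu = 0, .group_index = 7 }
 * C = { .cpu = 2, .group_index = 5 }
 *
 * perf_event_groups_less() orders these A < B < C: cpu is the primary
 * key, and within one CPU the monotonically increasing group_index puts
 * later insertions further to the right, which is what rotation relies on.
 */
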
 +
 +/*
 + * Insert @event into @groups' tree; using {@event->cpu, ++@groups->index} for
 + * key (see perf_event_groups_less). This places it last inside the CPU
 + * subtree.
 + */
 +static void
 +perf_event_groups_insert(struct perf_event_groups *groups,
 +                       struct perf_event *event)
 +{
 +      struct perf_event *node_event;
 +      struct rb_node *parent;
 +      struct rb_node **node;
 +
 +      event->group_index = ++groups->index;
 +
 +      node = &groups->tree.rb_node;
 +      parent = *node;
 +
 +      while (*node) {
 +              parent = *node;
 +              node_event = container_of(*node, struct perf_event, group_node);
 +
 +              if (perf_event_groups_less(event, node_event))
 +                      node = &parent->rb_left;
 +              else
 +                      node = &parent->rb_right;
 +      }
 +
 +      rb_link_node(&event->group_node, parent, node);
 +      rb_insert_color(&event->group_node, &groups->tree);
 +}
 +
 +/*
 + * Helper function to insert event into the pinned or flexible groups.
 + */
 +static void
 +add_event_to_groups(struct perf_event *event, struct perf_event_context *ctx)
 +{
 +      struct perf_event_groups *groups;
 +
 +      groups = get_event_groups(event, ctx);
 +      perf_event_groups_insert(groups, event);
 +}
 +
 +/*
 + * Delete a group from a tree.
 + */
 +static void
 +perf_event_groups_delete(struct perf_event_groups *groups,
 +                       struct perf_event *event)
 +{
 +      WARN_ON_ONCE(RB_EMPTY_NODE(&event->group_node) ||
 +                   RB_EMPTY_ROOT(&groups->tree));
 +
 +      rb_erase(&event->group_node, &groups->tree);
 +      init_event_group(event);
 +}
 +
 +/*
 + * Helper function to delete event from its groups.
 + */
 +static void
 +del_event_from_groups(struct perf_event *event, struct perf_event_context *ctx)
 +{
 +      struct perf_event_groups *groups;
 +
 +      groups = get_event_groups(event, ctx);
 +      perf_event_groups_delete(groups, event);
 +}
 +
 +/*
 + * Get the leftmost event in the @cpu subtree.
 + */
 +static struct perf_event *
 +perf_event_groups_first(struct perf_event_groups *groups, int cpu)
 +{
 +      struct perf_event *node_event = NULL, *match = NULL;
 +      struct rb_node *node = groups->tree.rb_node;
 +
 +      while (node) {
 +              node_event = container_of(node, struct perf_event, group_node);
 +
 +              if (cpu < node_event->cpu) {
 +                      node = node->rb_left;
 +              } else if (cpu > node_event->cpu) {
 +                      node = node->rb_right;
 +              } else {
 +                      match = node_event;
 +                      node = node->rb_left;
 +              }
 +      }
 +
 +      return match;
 +}
 +
 +/*
 + * Like rb_entry_next_safe() for the @cpu subtree.
 + */
 +static struct perf_event *
 +perf_event_groups_next(struct perf_event *event)
 +{
 +      struct perf_event *next;
 +
 +      next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node);
 +      if (next && next->cpu == event->cpu)
 +              return next;
 +
 +      return NULL;
 +}
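
Taken together, perf_event_groups_first() and perf_event_groups_next() walk a single CPU's subtree in insertion order. A minimal usage sketch (hypothetical caller, not part of the patch):

	struct perf_event *event;

	for (event = perf_event_groups_first(groups, cpu);
	     event;
	     event = perf_event_groups_next(event)) {
		/* every group bound to @cpu, in group_index order */
	}
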
 +
 +/*
 + * Iterate through the whole groups tree.
 + */
 +#define perf_event_groups_for_each(event, groups)                     \
 +      for (event = rb_entry_safe(rb_first(&((groups)->tree)),         \
 +                              typeof(*event), group_node); event;     \
 +              event = rb_entry_safe(rb_next(&event->group_node),      \
 +                              typeof(*event), group_node))
 +
 +/*
   * Add an event to the lists for its context.
   * Must be called with ctx->mutex and ctx->lock held.
   */
@@@ -1664,8 -1489,12 +1664,8 @@@ list_add_event(struct perf_event *event
         * perf_group_detach can, at all times, locate all siblings.
         */
        if (event->group_leader == event) {
 -              struct list_head *list;
 -
                event->group_caps = event->event_caps;
 -
 -              list = ctx_group_list(event, ctx);
 -              list_add_tail(&event->group_entry, list);
 +              add_event_to_groups(event, ctx);
        }
  
        list_update_cgroup_event(event, ctx, true);
@@@ -1823,12 -1652,12 +1823,12 @@@ static void perf_group_attach(struct pe
  
        group_leader->group_caps &= event->event_caps;
  
 -      list_add_tail(&event->group_entry, &group_leader->sibling_list);
 +      list_add_tail(&event->sibling_list, &group_leader->sibling_list);
        group_leader->nr_siblings++;
  
        perf_event__header_size(group_leader);
  
 -      list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
 +      for_each_sibling_event(pos, group_leader)
                perf_event__header_size(pos);
  }
  
@@@ -1859,7 -1688,7 +1859,7 @@@ list_del_event(struct perf_event *event
        list_del_rcu(&event->event_entry);
  
        if (event->group_leader == event)
 -              list_del_init(&event->group_entry);
 +              del_event_from_groups(event, ctx);
  
        /*
         * If event was in error state, then keep it
  static void perf_group_detach(struct perf_event *event)
  {
        struct perf_event *sibling, *tmp;
 -      struct list_head *list = NULL;
 +      struct perf_event_context *ctx = event->ctx;
  
 -      lockdep_assert_held(&event->ctx->lock);
 +      lockdep_assert_held(&ctx->lock);
  
        /*
         * We can have double detach due to exit/hot-unplug + close.
         * If this is a sibling, remove it from its group.
         */
        if (event->group_leader != event) {
 -              list_del_init(&event->group_entry);
 +              list_del_init(&event->sibling_list);
                event->group_leader->nr_siblings--;
                goto out;
        }
  
 -      if (!list_empty(&event->group_entry))
 -              list = &event->group_entry;
 -
        /*
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
         * to whatever list we are on.
         */
 -      list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
 -              if (list)
 -                      list_move_tail(&sibling->group_entry, list);
 +      list_for_each_entry_safe(sibling, tmp, &event->sibling_list, sibling_list) {
 +
                sibling->group_leader = sibling;
 +              list_del_init(&sibling->sibling_list);
  
                /* Inherit group flags from the previous leader */
                sibling->group_caps = event->group_caps;
  
 +              if (!RB_EMPTY_NODE(&event->group_node)) {
 +                      add_event_to_groups(sibling, event->ctx);
 +
 +                      if (sibling->state == PERF_EVENT_STATE_ACTIVE) {
 +                              struct list_head *list = sibling->attr.pinned ?
 +                                      &ctx->pinned_active : &ctx->flexible_active;
 +
 +                              list_add_tail(&sibling->active_list, list);
 +                      }
 +              }
 +
                WARN_ON_ONCE(sibling->ctx != event->ctx);
        }
  
  out:
        perf_event__header_size(event->group_leader);
  
 -      list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
 +      for_each_sibling_event(tmp, event->group_leader)
                perf_event__header_size(tmp);
  }
  
@@@ -1951,13 -1772,13 +1951,13 @@@ static inline int __pmu_filter_match(st
   */
  static inline int pmu_filter_match(struct perf_event *event)
  {
 -      struct perf_event *child;
 +      struct perf_event *sibling;
  
        if (!__pmu_filter_match(event))
                return 0;
  
 -      list_for_each_entry(child, &event->sibling_list, group_entry) {
 -              if (!__pmu_filter_match(child))
 +      for_each_sibling_event(sibling, event) {
 +              if (!__pmu_filter_match(sibling))
                        return 0;
        }
  
@@@ -1984,13 -1805,6 +1984,13 @@@ event_sched_out(struct perf_event *even
        if (event->state != PERF_EVENT_STATE_ACTIVE)
                return;
  
 +      /*
 +       * Asymmetry; we only schedule events _IN_ through ctx_sched_in(), but
 +       * we can schedule events _OUT_ individually through things like
 +       * __perf_remove_from_context().
 +       */
 +      list_del_init(&event->active_list);
 +
        perf_pmu_disable(event->pmu);
  
        event->pmu->del(event, 0);
@@@ -2031,7 -1845,7 +2031,7 @@@ group_sched_out(struct perf_event *grou
        /*
         * Schedule out siblings (if any):
         */
 -      list_for_each_entry(event, &group_event->sibling_list, group_entry)
 +      for_each_sibling_event(event, group_event)
                event_sched_out(event, cpuctx, ctx);
  
        perf_pmu_enable(ctx->pmu);
@@@ -2310,7 -2124,7 +2310,7 @@@ group_sched_in(struct perf_event *group
        /*
         * Schedule in siblings as one group (if any):
         */
 -      list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 +      for_each_sibling_event(event, group_event) {
                if (event_sched_in(event, cpuctx, ctx)) {
                        partial_group = event;
                        goto group_error;
@@@ -2326,7 -2140,7 +2326,7 @@@ group_error
         * partial group before returning:
         * The events up to the failed event are scheduled out normally.
         */
 -      list_for_each_entry(event, &group_event->sibling_list, group_entry) {
 +      for_each_sibling_event(event, group_event) {
                if (event == partial_group)
                        break;
  
@@@ -2432,7 -2246,7 +2432,7 @@@ static void ctx_resched(struct perf_cpu
                        struct perf_event_context *task_ctx,
                        enum event_type_t event_type)
  {
-       enum event_type_t ctx_event_type = event_type & EVENT_ALL;
+       enum event_type_t ctx_event_type;
        bool cpu_event = !!(event_type & EVENT_CPU);
  
        /*
        if (event_type & EVENT_PINNED)
                event_type |= EVENT_FLEXIBLE;
  
+       ctx_event_type = event_type & EVENT_ALL;
        perf_pmu_disable(cpuctx->ctx.pmu);
        if (task_ctx)
                task_ctx_sched_out(cpuctx, task_ctx, event_type);
@@@ -2501,18 -2317,6 +2503,18 @@@ static int  __perf_install_in_context(v
                raw_spin_lock(&task_ctx->lock);
        }
  
 +#ifdef CONFIG_CGROUP_PERF
 +      if (is_cgroup_event(event)) {
 +              /*
 +               * If the current cgroup doesn't match the event's
 +               * cgroup, we should not try to schedule it.
 +               */
 +              struct perf_cgroup *cgrp = perf_cgroup_from_task(current, ctx);
 +              reprogram = cgroup_is_descendant(cgrp->css.cgroup,
 +                                      event->cgrp->css.cgroup);
 +      }
 +#endif
 +
        if (reprogram) {
                ctx_sched_out(ctx, cpuctx, EVENT_TIME);
                add_event_to_ctx(event, ctx);
@@@ -2846,47 -2650,12 +2848,47 @@@ int perf_event_refresh(struct perf_even
  }
  EXPORT_SYMBOL_GPL(perf_event_refresh);
  
 +static int perf_event_modify_breakpoint(struct perf_event *bp,
 +                                       struct perf_event_attr *attr)
 +{
 +      int err;
 +
 +      _perf_event_disable(bp);
 +
 +      err = modify_user_hw_breakpoint_check(bp, attr, true);
 +      if (err) {
 +              if (!bp->attr.disabled)
 +                      _perf_event_enable(bp);
 +
 +              return err;
 +      }
 +
 +      if (!attr->disabled)
 +              _perf_event_enable(bp);
 +      return 0;
 +}
 +
 +static int perf_event_modify_attr(struct perf_event *event,
 +                                struct perf_event_attr *attr)
 +{
 +      if (event->attr.type != attr->type)
 +              return -EINVAL;
 +
 +      switch (event->attr.type) {
 +      case PERF_TYPE_BREAKPOINT:
 +              return perf_event_modify_breakpoint(event, attr);
 +      default:
 +              /* Placeholder for future additions. */
 +              return -EOPNOTSUPP;
 +      }
 +}
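
Userspace reaches this path via the PERF_EVENT_IOC_MODIFY_ATTRIBUTES ioctl introduced by the same series (handled further down in _perf_ioctl()). A minimal sketch, assuming fd is an already-open PERF_TYPE_BREAKPOINT event and new_addr is a hypothetical address to watch:

#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>
#include <string.h>
#include <sys/ioctl.h>

static int move_breakpoint(int fd, unsigned long new_addr)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.type = PERF_TYPE_BREAKPOINT;
	attr.size = sizeof(attr);
	attr.bp_type = HW_BREAKPOINT_W;		/* watch writes */
	attr.bp_addr = new_addr;
	attr.bp_len = HW_BREAKPOINT_LEN_4;

	/* Re-arm the existing event with the new attributes. */
	return ioctl(fd, PERF_EVENT_IOC_MODIFY_ATTRIBUTES, &attr);
}
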
 +
  static void ctx_sched_out(struct perf_event_context *ctx,
                          struct perf_cpu_context *cpuctx,
                          enum event_type_t event_type)
  {
 +      struct perf_event *event, *tmp;
        int is_active = ctx->is_active;
 -      struct perf_event *event;
  
        lockdep_assert_held(&ctx->lock);
  
  
        perf_pmu_disable(ctx->pmu);
        if (is_active & EVENT_PINNED) {
 -              list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 +              list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list)
                        group_sched_out(event, cpuctx, ctx);
        }
  
        if (is_active & EVENT_FLEXIBLE) {
 -              list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 +              list_for_each_entry_safe(event, tmp, &ctx->flexible_active, active_list)
                        group_sched_out(event, cpuctx, ctx);
        }
        perf_pmu_enable(ctx->pmu);
@@@ -3225,116 -2994,53 +3227,116 @@@ static void cpu_ctx_sched_out(struct pe
        ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
  }
  
 -static void
 -ctx_pinned_sched_in(struct perf_event_context *ctx,
 -                  struct perf_cpu_context *cpuctx)
 +static int visit_groups_merge(struct perf_event_groups *groups, int cpu,
 +                            int (*func)(struct perf_event *, void *), void *data)
  {
 -      struct perf_event *event;
 +      struct perf_event **evt, *evt1, *evt2;
 +      int ret;
  
 -      list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
 -              if (event->state <= PERF_EVENT_STATE_OFF)
 -                      continue;
 -              if (!event_filter_match(event))
 -                      continue;
 +      evt1 = perf_event_groups_first(groups, -1);
 +      evt2 = perf_event_groups_first(groups, cpu);
 +
 +      while (evt1 || evt2) {
 +              if (evt1 && evt2) {
 +                      if (evt1->group_index < evt2->group_index)
 +                              evt = &evt1;
 +                      else
 +                              evt = &evt2;
 +              } else if (evt1) {
 +                      evt = &evt1;
 +              } else {
 +                      evt = &evt2;
 +              }
  
 -              if (group_can_go_on(event, cpuctx, 1))
 -                      group_sched_in(event, cpuctx, ctx);
 +              ret = func(*evt, data);
 +              if (ret)
 +                      return ret;
  
 -              /*
 -               * If this pinned group hasn't been scheduled,
 -               * put it in error state.
 -               */
 -              if (event->state == PERF_EVENT_STATE_INACTIVE)
 -                      perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
 +              *evt = perf_event_groups_next(*evt);
 +      }
 +
 +      return 0;
 +}
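
visit_groups_merge() interleaves two subtrees, the cpu == -1 subtree (events not bound to a single CPU) and the current CPU's subtree, always handing func() the candidate with the smaller group_index. An illustration with hypothetical indices:

/*
 * cpu == -1 subtree:  A (group_index = 2), C (group_index = 9)
 * cpu ==  n subtree:  B (group_index = 5)
 *
 * func() is invoked as A, B, C: the two per-subtree orderings are
 * merged by overall insertion order.
 */
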
 +
 +struct sched_in_data {
 +      struct perf_event_context *ctx;
 +      struct perf_cpu_context *cpuctx;
 +      int can_add_hw;
 +};
 +
 +static int pinned_sched_in(struct perf_event *event, void *data)
 +{
 +      struct sched_in_data *sid = data;
 +
 +      if (event->state <= PERF_EVENT_STATE_OFF)
 +              return 0;
 +
 +      if (!event_filter_match(event))
 +              return 0;
 +
 +      if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
 +              if (!group_sched_in(event, sid->cpuctx, sid->ctx))
 +                      list_add_tail(&event->active_list, &sid->ctx->pinned_active);
        }
 +
 +      /*
 +       * If this pinned group hasn't been scheduled,
 +       * put it in error state.
 +       */
 +      if (event->state == PERF_EVENT_STATE_INACTIVE)
 +              perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
 +
 +      return 0;
 +}
 +
 +static int flexible_sched_in(struct perf_event *event, void *data)
 +{
 +      struct sched_in_data *sid = data;
 +
 +      if (event->state <= PERF_EVENT_STATE_OFF)
 +              return 0;
 +
 +      if (!event_filter_match(event))
 +              return 0;
 +
 +      if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) {
 +              if (!group_sched_in(event, sid->cpuctx, sid->ctx))
 +                      list_add_tail(&event->active_list, &sid->ctx->flexible_active);
 +              else
 +                      sid->can_add_hw = 0;
 +      }
 +
 +      return 0;
 +}
 +
 +static void
 +ctx_pinned_sched_in(struct perf_event_context *ctx,
 +                  struct perf_cpu_context *cpuctx)
 +{
 +      struct sched_in_data sid = {
 +              .ctx = ctx,
 +              .cpuctx = cpuctx,
 +              .can_add_hw = 1,
 +      };
 +
 +      visit_groups_merge(&ctx->pinned_groups,
 +                         smp_processor_id(),
 +                         pinned_sched_in, &sid);
  }
  
  static void
  ctx_flexible_sched_in(struct perf_event_context *ctx,
                      struct perf_cpu_context *cpuctx)
  {
 -      struct perf_event *event;
 -      int can_add_hw = 1;
 -
 -      list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
 -              /* Ignore events in OFF or ERROR state */
 -              if (event->state <= PERF_EVENT_STATE_OFF)
 -                      continue;
 -              /*
 -               * Listen to the 'cpu' scheduling filter constraint
 -               * of events:
 -               */
 -              if (!event_filter_match(event))
 -                      continue;
 +      struct sched_in_data sid = {
 +              .ctx = ctx,
 +              .cpuctx = cpuctx,
 +              .can_add_hw = 1,
 +      };
  
 -              if (group_can_go_on(event, cpuctx, can_add_hw)) {
 -                      if (group_sched_in(event, cpuctx, ctx))
 -                              can_add_hw = 0;
 -              }
 -      }
 +      visit_groups_merge(&ctx->flexible_groups,
 +                         smp_processor_id(),
 +                         flexible_sched_in, &sid);
  }
  
  static void
@@@ -3415,7 -3121,7 +3417,7 @@@ static void perf_event_context_sched_in
         * However, if task's ctx is not carrying any pinned
         * events, no need to flip the cpuctx's events around.
         */
 -      if (!list_empty(&ctx->pinned_groups))
 +      if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree))
                cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
        perf_event_sched_in(cpuctx, ctx, task);
        perf_pmu_enable(ctx->pmu);
@@@ -3644,81 -3350,55 +3646,81 @@@ static void perf_adjust_freq_unthr_cont
  }
  
  /*
 - * Round-robin a context's events:
 + * Move @event to the tail of the @ctx's eligible events.
   */
 -static void rotate_ctx(struct perf_event_context *ctx)
 +static void rotate_ctx(struct perf_event_context *ctx, struct perf_event *event)
  {
        /*
         * Rotate the first entry last of non-pinned groups. Rotation might be
         * disabled by the inheritance code.
         */
 -      if (!ctx->rotate_disable)
 -              list_rotate_left(&ctx->flexible_groups);
 +      if (ctx->rotate_disable)
 +              return;
 +
 +      perf_event_groups_delete(&ctx->flexible_groups, event);
 +      perf_event_groups_insert(&ctx->flexible_groups, event);
  }
  
 -static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 +static inline struct perf_event *
 +ctx_first_active(struct perf_event_context *ctx)
  {
 +      return list_first_entry_or_null(&ctx->flexible_active,
 +                                      struct perf_event, active_list);
 +}
 +
 +static bool perf_rotate_context(struct perf_cpu_context *cpuctx)
 +{
 +      struct perf_event *cpu_event = NULL, *task_event = NULL;
 +      bool cpu_rotate = false, task_rotate = false;
        struct perf_event_context *ctx = NULL;
 -      int rotate = 0;
 +
 +      /*
 +       * Since we run this from IRQ context, nobody can install new
 +       * events, thus the event count values are stable.
 +       */
  
        if (cpuctx->ctx.nr_events) {
                if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
 -                      rotate = 1;
 +                      cpu_rotate = true;
        }
  
        ctx = cpuctx->task_ctx;
        if (ctx && ctx->nr_events) {
                if (ctx->nr_events != ctx->nr_active)
 -                      rotate = 1;
 +                      task_rotate = true;
        }
  
 -      if (!rotate)
 -              goto done;
 +      if (!(cpu_rotate || task_rotate))
 +              return false;
  
        perf_ctx_lock(cpuctx, cpuctx->task_ctx);
        perf_pmu_disable(cpuctx->ctx.pmu);
  
 -      cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 -      if (ctx)
 +      if (task_rotate)
 +              task_event = ctx_first_active(ctx);
 +      if (cpu_rotate)
 +              cpu_event = ctx_first_active(&cpuctx->ctx);
 +
 +      /*
 +       * As per the order given at ctx_resched(), first 'pop' task flexible
 +       * and then, if needed, CPU flexible.
 +       */
 +      if (task_event || (ctx && cpu_event))
                ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
 +      if (cpu_event)
 +              cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
  
 -      rotate_ctx(&cpuctx->ctx);
 -      if (ctx)
 -              rotate_ctx(ctx);
 +      if (task_event)
 +              rotate_ctx(ctx, task_event);
 +      if (cpu_event)
 +              rotate_ctx(&cpuctx->ctx, cpu_event);
  
        perf_event_sched_in(cpuctx, ctx, current);
  
        perf_pmu_enable(cpuctx->ctx.pmu);
        perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 -done:
  
 -      return rotate;
 +      return true;
  }
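
The rotation victim is the oldest active flexible event of each context. A hypothetical sequence (illustration only):

/*
 * flexible_active before a tick:  [ E1, E2, E3 ]  (E1 scheduled first)
 *
 * rotate_ctx() re-inserts E1 into flexible_groups with a fresh, larger
 * group_index, so the following perf_event_sched_in() considers E2 and
 * E3 ahead of E1 when hardware counters are scarce.
 */
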
  
  void perf_event_task_tick(void)
@@@ -3863,7 -3543,7 +3865,7 @@@ static void __perf_event_read(void *inf
  
        pmu->read(event);
  
 -      list_for_each_entry(sub, &event->sibling_list, group_entry) {
 +      for_each_sibling_event(sub, event) {
                if (sub->state == PERF_EVENT_STATE_ACTIVE) {
                        /*
                         * Use sibling's PMU rather than @event's since
@@@ -4037,11 -3717,9 +4039,11 @@@ static void __perf_event_init_context(s
        raw_spin_lock_init(&ctx->lock);
        mutex_init(&ctx->mutex);
        INIT_LIST_HEAD(&ctx->active_ctx_list);
 -      INIT_LIST_HEAD(&ctx->pinned_groups);
 -      INIT_LIST_HEAD(&ctx->flexible_groups);
 +      perf_event_groups_init(&ctx->pinned_groups);
 +      perf_event_groups_init(&ctx->flexible_groups);
        INIT_LIST_HEAD(&ctx->event_list);
 +      INIT_LIST_HEAD(&ctx->pinned_active);
 +      INIT_LIST_HEAD(&ctx->flexible_active);
        atomic_set(&ctx->refcount, 1);
  }
  
@@@ -4711,7 -4389,7 +4713,7 @@@ static int __perf_read_group_add(struc
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
  
 -      list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 +      for_each_sibling_event(sub, leader) {
                values[n++] += perf_event_count(sub);
                if (read_format & PERF_FORMAT_ID)
                        values[n++] = primary_event_id(sub);
@@@ -4905,7 -4583,7 +4907,7 @@@ static void perf_event_for_each(struct 
        event = event->group_leader;
  
        perf_event_for_each_child(event, func);
 -      list_for_each_entry(sibling, &event->sibling_list, group_entry)
 +      for_each_sibling_event(sibling, event)
                perf_event_for_each_child(sibling, func);
  }
  
@@@ -4987,8 -4665,6 +4989,8 @@@ static int perf_event_set_output(struc
                                 struct perf_event *output_event);
  static int perf_event_set_filter(struct perf_event *event, void __user *arg);
  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
 +static int perf_copy_attr(struct perf_event_attr __user *uattr,
 +                        struct perf_event_attr *attr);
  
  static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
  {
  
        case PERF_EVENT_IOC_QUERY_BPF:
                return perf_event_query_prog_array(event, (void __user *)arg);
 +
 +      case PERF_EVENT_IOC_MODIFY_ATTRIBUTES: {
 +              struct perf_event_attr new_attr;
 +              int err = perf_copy_attr((struct perf_event_attr __user *)arg,
 +                                       &new_attr);
 +
 +              if (err)
 +                      return err;
 +
 +              return perf_event_modify_attr(event,  &new_attr);
 +      }
        default:
                return -ENOTTY;
        }
@@@ -6067,8 -5732,7 +6069,8 @@@ static void perf_output_read_group(stru
        if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
                values[n++] = running;
  
 -      if (leader != event)
 +      if ((leader != event) &&
 +          (leader->state == PERF_EVENT_STATE_ACTIVE))
                leader->pmu->read(leader);
  
        values[n++] = perf_event_count(leader);
  
        __output_copy(handle, values, n * sizeof(u64));
  
 -      list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 +      for_each_sibling_event(sub, leader) {
                n = 0;
  
                if ((sub != event) &&
@@@ -8334,119 -7998,9 +8336,119 @@@ static struct pmu perf_tracepoint = 
        .read           = perf_swevent_read,
  };
  
 +#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
 +/*
 + * Flags in config, used by dynamic PMU kprobe and uprobe
 + * The flags should match following PMU_FORMAT_ATTR().
 + *
 + * PERF_PROBE_CONFIG_IS_RETPROBE if set, create kretprobe/uretprobe
 + *                               if not set, create kprobe/uprobe
 + */
 +enum perf_probe_config {
 +      PERF_PROBE_CONFIG_IS_RETPROBE = 1U << 0,  /* [k,u]retprobe */
 +};
 +
 +PMU_FORMAT_ATTR(retprobe, "config:0");
 +
 +static struct attribute *probe_attrs[] = {
 +      &format_attr_retprobe.attr,
 +      NULL,
 +};
 +
 +static struct attribute_group probe_format_group = {
 +      .name = "format",
 +      .attrs = probe_attrs,
 +};
 +
 +static const struct attribute_group *probe_attr_groups[] = {
 +      &probe_format_group,
 +      NULL,
 +};
 +#endif
 +
 +#ifdef CONFIG_KPROBE_EVENTS
 +static int perf_kprobe_event_init(struct perf_event *event);
 +static struct pmu perf_kprobe = {
 +      .task_ctx_nr    = perf_sw_context,
 +      .event_init     = perf_kprobe_event_init,
 +      .add            = perf_trace_add,
 +      .del            = perf_trace_del,
 +      .start          = perf_swevent_start,
 +      .stop           = perf_swevent_stop,
 +      .read           = perf_swevent_read,
 +      .attr_groups    = probe_attr_groups,
 +};
 +
 +static int perf_kprobe_event_init(struct perf_event *event)
 +{
 +      int err;
 +      bool is_retprobe;
 +
 +      if (event->attr.type != perf_kprobe.type)
 +              return -ENOENT;
 +      /*
 +       * no branch sampling for probe events
 +       */
 +      if (has_branch_stack(event))
 +              return -EOPNOTSUPP;
 +
 +      is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
 +      err = perf_kprobe_init(event, is_retprobe);
 +      if (err)
 +              return err;
 +
 +      event->destroy = perf_kprobe_destroy;
 +
 +      return 0;
 +}
 +#endif /* CONFIG_KPROBE_EVENTS */
 +
 +#ifdef CONFIG_UPROBE_EVENTS
 +static int perf_uprobe_event_init(struct perf_event *event);
 +static struct pmu perf_uprobe = {
 +      .task_ctx_nr    = perf_sw_context,
 +      .event_init     = perf_uprobe_event_init,
 +      .add            = perf_trace_add,
 +      .del            = perf_trace_del,
 +      .start          = perf_swevent_start,
 +      .stop           = perf_swevent_stop,
 +      .read           = perf_swevent_read,
 +      .attr_groups    = probe_attr_groups,
 +};
 +
 +static int perf_uprobe_event_init(struct perf_event *event)
 +{
 +      int err;
 +      bool is_retprobe;
 +
 +      if (event->attr.type != perf_uprobe.type)
 +              return -ENOENT;
 +      /*
 +       * no branch sampling for probe events
 +       */
 +      if (has_branch_stack(event))
 +              return -EOPNOTSUPP;
 +
 +      is_retprobe = event->attr.config & PERF_PROBE_CONFIG_IS_RETPROBE;
 +      err = perf_uprobe_init(event, is_retprobe);
 +      if (err)
 +              return err;
 +
 +      event->destroy = perf_uprobe_destroy;
 +
 +      return 0;
 +}
 +#endif /* CONFIG_UPROBE_EVENTS */
 +
  static inline void perf_tp_register(void)
  {
        perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
 +#ifdef CONFIG_KPROBE_EVENTS
 +      perf_pmu_register(&perf_kprobe, "kprobe", -1);
 +#endif
 +#ifdef CONFIG_UPROBE_EVENTS
 +      perf_pmu_register(&perf_uprobe, "uprobe", -1);
 +#endif
  }
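
Userspace can target the new kprobe PMU through perf_event_open() with the dynamically assigned type. A minimal sketch; the sysfs path follows the usual dynamic-PMU convention and the kprobe_func/probe_offset attr fields come from the companion uapi patch in this series, so treat those names as assumptions:

#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_kprobe(const char *func, pid_t pid, int cpu)
{
	struct perf_event_attr attr;
	unsigned int type;
	FILE *f;

	/* Dynamic PMUs export their type id via sysfs. */
	f = fopen("/sys/bus/event_source/devices/kprobe/type", "r");
	if (!f)
		return -1;
	if (fscanf(f, "%u", &type) != 1) {
		fclose(f);
		return -1;
	}
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	attr.config = 0;	/* PERF_PROBE_CONFIG_IS_RETPROBE clear: plain kprobe */
	attr.kprobe_func = (__u64)(unsigned long)func;	/* assumed uapi field */
	attr.probe_offset = 0;

	return syscall(__NR_perf_event_open, &attr, pid, cpu,
		       -1 /* group_fd */, 0 /* flags */);
}
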
  
  static void perf_event_free_filter(struct perf_event *event)
@@@ -8523,32 -8077,13 +8525,32 @@@ static void perf_event_free_bpf_handler
  }
  #endif
  
 +/*
 + * returns true if the event is a tracepoint, or a kprobe/uprobe created
 + * with perf_event_open()
 + */
 +static inline bool perf_event_is_tracing(struct perf_event *event)
 +{
 +      if (event->pmu == &perf_tracepoint)
 +              return true;
 +#ifdef CONFIG_KPROBE_EVENTS
 +      if (event->pmu == &perf_kprobe)
 +              return true;
 +#endif
 +#ifdef CONFIG_UPROBE_EVENTS
 +      if (event->pmu == &perf_uprobe)
 +              return true;
 +#endif
 +      return false;
 +}
 +
  static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
  {
        bool is_kprobe, is_tracepoint, is_syscall_tp;
        struct bpf_prog *prog;
        int ret;
  
 -      if (event->attr.type != PERF_TYPE_TRACEPOINT)
 +      if (!perf_event_is_tracing(event))
                return perf_event_set_bpf_handler(event, prog_fd);
  
        is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
  
  static void perf_event_free_bpf_prog(struct perf_event *event)
  {
 -      if (event->attr.type != PERF_TYPE_TRACEPOINT) {
 +      if (!perf_event_is_tracing(event)) {
                perf_event_free_bpf_handler(event);
                return;
        }
@@@ -9013,36 -8548,47 +9015,36 @@@ fail_clear_files
        return ret;
  }
  
 -static int
 -perf_tracepoint_set_filter(struct perf_event *event, char *filter_str)
 -{
 -      struct perf_event_context *ctx = event->ctx;
 -      int ret;
 -
 -      /*
 -       * Beware, here be dragons!!
 -       *
 -       * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint
 -       * stuff does not actually need it. So temporarily drop ctx->mutex. As per
 -       * perf_event_ctx_lock() we already have a reference on ctx.
 -       *
 -       * This can result in event getting moved to a different ctx, but that
 -       * does not affect the tracepoint state.
 -       */
 -      mutex_unlock(&ctx->mutex);
 -      ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
 -      mutex_lock(&ctx->mutex);
 -
 -      return ret;
 -}
 -
  static int perf_event_set_filter(struct perf_event *event, void __user *arg)
  {
 -      char *filter_str;
        int ret = -EINVAL;
 -
 -      if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
 -          !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
 -          !has_addr_filter(event))
 -              return -EINVAL;
 +      char *filter_str;
  
        filter_str = strndup_user(arg, PAGE_SIZE);
        if (IS_ERR(filter_str))
                return PTR_ERR(filter_str);
  
 -      if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
 -          event->attr.type == PERF_TYPE_TRACEPOINT)
 -              ret = perf_tracepoint_set_filter(event, filter_str);
 -      else if (has_addr_filter(event))
 +#ifdef CONFIG_EVENT_TRACING
 +      if (perf_event_is_tracing(event)) {
 +              struct perf_event_context *ctx = event->ctx;
 +
 +              /*
 +               * Beware, here be dragons!!
 +               *
 +               * the tracepoint muck will deadlock against ctx->mutex, but
 +               * the tracepoint stuff does not actually need it. So
 +               * temporarily drop ctx->mutex. As per perf_event_ctx_lock() we
 +               * already have a reference on ctx.
 +               *
 +               * This can result in event getting moved to a different ctx,
 +               * but that does not affect the tracepoint state.
 +               */
 +              mutex_unlock(&ctx->mutex);
 +              ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
 +              mutex_lock(&ctx->mutex);
 +      } else
 +#endif
 +      if (has_addr_filter(event))
                ret = perf_event_set_addr_filter(event, filter_str);
  
        kfree(filter_str);
@@@ -9895,10 -9441,9 +9897,10 @@@ perf_event_alloc(struct perf_event_att
        mutex_init(&event->child_mutex);
        INIT_LIST_HEAD(&event->child_list);
  
 -      INIT_LIST_HEAD(&event->group_entry);
        INIT_LIST_HEAD(&event->event_entry);
        INIT_LIST_HEAD(&event->sibling_list);
 +      INIT_LIST_HEAD(&event->active_list);
 +      init_event_group(event);
        INIT_LIST_HEAD(&event->rb_entry);
        INIT_LIST_HEAD(&event->active_entry);
        INIT_LIST_HEAD(&event->addr_filters.list);
@@@ -10173,9 -9718,6 +10175,9 @@@ static int perf_copy_attr(struct perf_e
                        ret = -EINVAL;
        }
  
 +      if (!attr->sample_max_stack)
 +              attr->sample_max_stack = sysctl_perf_event_max_stack;
 +
        if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
                ret = perf_reg_validate(attr->sample_regs_intr);
  out:
@@@ -10389,6 -9931,9 +10391,6 @@@ SYSCALL_DEFINE5(perf_event_open
            perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
                return -EACCES;
  
 -      if (!attr.sample_max_stack)
 -              attr.sample_max_stack = sysctl_perf_event_max_stack;
 -
        /*
         * In cgroup mode, the pid argument is used to pass the fd
         * opened to the cgroup directory in cgroupfs. The cpu argument
                perf_remove_from_context(group_leader, 0);
                put_ctx(gctx);
  
 -              list_for_each_entry(sibling, &group_leader->sibling_list,
 -                                  group_entry) {
 +              for_each_sibling_event(sibling, group_leader) {
                        perf_remove_from_context(sibling, 0);
                        put_ctx(gctx);
                }
                 * By installing siblings first we NO-OP because they're not
                 * reachable through the group lists.
                 */
 -              list_for_each_entry(sibling, &group_leader->sibling_list,
 -                                  group_entry) {
 +              for_each_sibling_event(sibling, group_leader) {
                        perf_event__state_init(sibling);
                        perf_install_in_context(ctx, sibling, sibling->cpu);
                        get_ctx(ctx);
@@@ -11322,7 -10869,7 +11324,7 @@@ static int inherit_group(struct perf_ev
         * case inherit_event() will create individual events, similar to what
         * perf_group_detach() would do anyway.
         */
 -      list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
 +      for_each_sibling_event(sub, parent_event) {
                child_ctr = inherit_event(sub, parent, parent_ctx,
                                            child, leader, child_ctx);
                if (IS_ERR(child_ctr))
@@@ -11421,7 -10968,7 +11423,7 @@@ static int perf_event_init_context(stru
         * We dont have to disable NMIs - we are only looking at
         * the list, not manipulating it:
         */
 -      list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
 +      perf_event_groups_for_each(event, &parent_ctx->pinned_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, ctxn, &inherited_all);
                if (ret)
        parent_ctx->rotate_disable = 1;
        raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
  
 -      list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
 +      perf_event_groups_for_each(event, &parent_ctx->flexible_groups) {
                ret = inherit_task_group(event, parent, parent_ctx,
                                         child, ctxn, &inherited_all);
                if (ret)