
kernel: Add API to mark IRQs and kthreads as performance critical
author Sultan Alsawaf <sultan@kerneltoast.com>
Sat, 20 Jul 2019 07:10:53 +0000 (00:10 -0700)
committer 0ranko0P <ranko0p@outlook.com>
Sat, 7 Dec 2019 10:01:09 +0000 (18:01 +0800)

On devices with heterogeneous CPU cores (e.g., big.LITTLE), overall
performance can be improved by placing performance-critical IRQs and
kthreads onto the performance CPU cluster.

This commit adds the following APIs:
-kthread_run_perf_critical() to create and start a perf-critical kthread
-irq_set_perf_affinity() to mark an active IRQ as perf-critical
-IRQF_PERF_CRITICAL to schedule an IRQ and any threads it may have onto
 performance CPUs
-PF_PERF_CRITICAL to mark a process (mainly a kthread) as performance
 critical (this is used by kthread_run_perf_critical())

To accommodate these new APIs, the following changes are made:
-Performance-critical IRQs are distributed evenly among online CPUs
 available in cpu_perf_mask
-Performance-critical IRQs have their affinities reaffined upon exit
 from suspend (since the affinities are broken when non-boot CPUs are
 disabled)
-Performance-critical IRQs and their threads have their affinities reset
 upon entering suspend, so that upon immediate suspend exit (when only
 the boot CPU is online), interrupts can be processed and interrupt
 threads can be scheduled onto an online CPU (otherwise we'd hit a
 kernel BUG)
-__set_cpus_allowed_ptr() is modified to enforce a performance-critical
 kthread's affinity
-Perf-critical IRQs are marked with IRQD_AFFINITY_MANAGED so userspace
 can't mess with their affinity
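
To make the even distribution concrete, below is a small userspace model of
the round-robin placement (not kernel code). It assumes an 8-CPU system with
CPUs 4-7 in cpu_perf_mask and all CPUs online, and mimics the rotating
perf_cpu_index that affine_one_perf_irq() in kernel/irq/manage.c walks over
cpu_perf_mask & cpu_online_mask with cpumask_next_and().

#include <stdio.h>

int main(void)
{
	const unsigned int cpu_perf_mask = 0xF0;   /* assumed: CPUs 4-7 are big cores */
	const unsigned int cpu_online_mask = 0xFF; /* assumed: all 8 CPUs online */
	const int nr_cpu_ids = 8;
	int perf_cpu_index = -1; /* rotating index, as kept by the patch */
	int irq, cpu;

	for (irq = 0; irq < 6; irq++) {
		/* pick the next online perf CPU after perf_cpu_index, wrapping */
		for (;;) {
			for (cpu = perf_cpu_index + 1; cpu < nr_cpu_ids; cpu++)
				if (cpu_perf_mask & cpu_online_mask & (1u << cpu))
					break;
			if (cpu < nr_cpu_ids)
				break;
			perf_cpu_index = -1; /* ran off the end; start over */
		}
		perf_cpu_index = cpu;
		printf("perf-critical IRQ #%d -> CPU %d\n", irq, cpu);
	}
	return 0;
}

Successive perf-critical IRQs land on CPUs 4, 5, 6, 7, 4, 5, ... in this model.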

Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
include/linux/interrupt.h
include/linux/kthread.h
include/linux/sched.h
kernel/cpu.c
kernel/irq/manage.c
kernel/sched/core.c

include/linux/interrupt.h
index aa49333..c8e7c1f 100644
@@ -62,6 +62,8 @@
  *                interrupt handler after suspending interrupts. For system
  *                wakeup devices users need to implement wakeup detection in
  *                their interrupt handlers.
+ * IRQF_PERF_CRITICAL - Interrupt is critical to the overall performance of the
+ *               system and should be processed on a fast CPU.
  */
 #define IRQF_SHARED            0x00000080
 #define IRQF_PROBE_SHARED      0x00000100
@@ -75,6 +77,7 @@
 #define IRQF_NO_THREAD         0x00010000
 #define IRQF_EARLY_RESUME      0x00020000
 #define IRQF_COND_SUSPEND      0x00040000
+#define IRQF_PERF_CRITICAL     0x00080000
 
 #define IRQF_TIMER             (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD)
 
@@ -197,10 +200,13 @@ extern void disable_percpu_irq(unsigned int irq);
 extern void enable_irq(unsigned int irq);
 extern void enable_percpu_irq(unsigned int irq, unsigned int type);
 extern void irq_wake_thread(unsigned int irq, void *dev_id);
+extern void irq_set_perf_affinity(unsigned int irq);
 
 /* The following three functions are for the core kernel use only. */
 extern void suspend_device_irqs(void);
 extern void resume_device_irqs(void);
+extern void unaffine_perf_irqs(void);
+extern void reaffine_perf_irqs(void);
 
 /**
  * struct irq_affinity_notify - context for notification of IRQ affinity changes
include/linux/kthread.h
index 4289343..167276a 100644
@@ -37,6 +37,23 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
        __k;                                                               \
 })
 
+/**
+ * kthread_run_perf_critical - create and wake a performance-critical thread.
+ *
+ * Same as kthread_run(), but bound to cpu_perf_mask and marked PF_PERF_CRITICAL.
+ */
+#define kthread_run_perf_critical(threadfn, data, namefmt, ...)                   \
+({                                                                        \
+       struct task_struct *__k                                            \
+               = kthread_create(threadfn, data, namefmt, ## __VA_ARGS__); \
+       if (!IS_ERR(__k)) {                                                \
+               __k->flags |= PF_PERF_CRITICAL;                            \
+               kthread_bind_mask(__k, cpu_perf_mask);                     \
+               wake_up_process(__k);                                      \
+       }                                                                  \
+       __k;                                                               \
+})
+
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
include/linux/sched.h
index 5d8d78d..a2eee42 100644
@@ -2406,6 +2406,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 #define PF_KTHREAD     0x00200000      /* I am a kernel thread */
 #define PF_RANDOMIZE   0x00400000      /* randomize virtual address space */
 #define PF_SWAPWRITE   0x00800000      /* Allowed to write to swap */
+#define PF_PERF_CRITICAL 0x01000000    /* Thread is performance-critical */
 #define PF_NO_SETAFFINITY 0x04000000   /* Userland is not allowed to meddle with cpus_allowed */
 #define PF_MCE_EARLY    0x08000000      /* Early kill for mce process policy */
 #define PF_MUTEX_TESTER        0x20000000      /* Thread belongs to the rt mutex tester */
kernel/cpu.c
index f7f441b..4578395 100644
@@ -458,8 +458,16 @@ out_release:
 
 int cpu_down(unsigned int cpu)
 {
+       struct cpumask newmask;
        int err;
 
+       cpumask_andnot(&newmask, cpu_online_mask, cpumask_of(cpu));
+
+       /* One big cluster CPU and one little cluster CPU must remain online */
+       if (!cpumask_intersects(&newmask, cpu_perf_mask) ||
+           !cpumask_intersects(&newmask, cpu_lp_mask))
+               return -EINVAL;
+
        cpu_maps_update_begin();
 
        if (cpu_hotplug_disabled) {
@@ -641,6 +649,7 @@ int disable_nonboot_cpus(void)
        int cpu, first_cpu, error = 0;
 
        cpu_maps_update_begin();
+       unaffine_perf_irqs();
        first_cpu = cpumask_first(cpu_online_mask);
        /*
         * We take down all of the non-boot CPUs in one shot to avoid races
@@ -722,6 +731,7 @@ void enable_nonboot_cpus(void)
        arch_enable_nonboot_cpus_end();
 
        cpumask_clear(frozen_cpus);
+       reaffine_perf_irqs();
 out:
        cpu_maps_update_done();
 }
kernel/irq/manage.c
index d0193c0..68ac2d5 100644
 #include <linux/sched.h>
 #include <linux/sched/rt.h>
 #include <linux/task_work.h>
+#include <linux/cpu.h>
 
 #include "internals.h"
 
+struct irq_desc_list {
+       struct list_head list;
+       struct irq_desc *desc;
+} perf_crit_irqs = {
+       .list = LIST_HEAD_INIT(perf_crit_irqs.list)
+};
+
+static DEFINE_RAW_SPINLOCK(perf_irqs_lock);
+static int perf_cpu_index = -1;
+
 #ifdef CONFIG_IRQ_FORCED_THREADING
 __read_mostly bool force_irqthreads;
 
@@ -1124,6 +1135,112 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
        return 0;
 }
 
+static void add_desc_to_perf_list(struct irq_desc *desc)
+{
+       struct irq_desc_list *item;
+
+       item = kmalloc(sizeof(*item), GFP_ATOMIC | __GFP_NOFAIL);
+       item->desc = desc;
+
+       raw_spin_lock(&perf_irqs_lock);
+       list_add(&item->list, &perf_crit_irqs.list);
+       raw_spin_unlock(&perf_irqs_lock);
+}
+
+static void affine_one_perf_thread(struct task_struct *t)
+{
+       t->flags |= PF_PERF_CRITICAL;
+       set_cpus_allowed_ptr(t, cpu_perf_mask);
+}
+
+static void unaffine_one_perf_thread(struct task_struct *t)
+{
+       t->flags &= ~PF_PERF_CRITICAL;
+       set_cpus_allowed_ptr(t, cpu_all_mask);
+}
+
+static void affine_one_perf_irq(struct irq_desc *desc)
+{
+       int cpu;
+
+       /* Balance the performance-critical IRQs across all perf CPUs */
+       while (1) {
+               cpu = cpumask_next_and(perf_cpu_index, cpu_perf_mask,
+                                      cpu_online_mask);
+               if (cpu < nr_cpu_ids)
+                       break;
+               perf_cpu_index = -1;
+       }
+       irq_set_affinity_locked(&desc->irq_data, cpumask_of(cpu), true);
+
+       perf_cpu_index = cpu;
+}
+
+static void setup_perf_irq_locked(struct irq_desc *desc)
+{
+       add_desc_to_perf_list(desc);
+       irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED);
+       raw_spin_lock(&perf_irqs_lock);
+       affine_one_perf_irq(desc);
+       raw_spin_unlock(&perf_irqs_lock);
+}
+
+void irq_set_perf_affinity(unsigned int irq)
+{
+       struct irq_desc *desc = irq_to_desc(irq);
+       struct irqaction *action;
+       unsigned long flags;
+
+       if (!desc)
+               return;
+
+       raw_spin_lock_irqsave(&desc->lock, flags);
+       action = desc->action;
+       while (action) {
+               action->flags |= IRQF_PERF_CRITICAL;
+               action = action->next;
+       }
+       setup_perf_irq_locked(desc);
+       raw_spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+void unaffine_perf_irqs(void)
+{
+       struct irq_desc_list *data;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&perf_irqs_lock, flags);
+       list_for_each_entry(data, &perf_crit_irqs.list, list) {
+               struct irq_desc *desc = data->desc;
+
+               raw_spin_lock(&desc->lock);
+               irq_set_affinity_locked(&desc->irq_data, cpu_all_mask, true);
+               if (desc->action->thread)
+                       unaffine_one_perf_thread(desc->action->thread);
+               raw_spin_unlock(&desc->lock);
+       }
+       perf_cpu_index = -1;
+       raw_spin_unlock_irqrestore(&perf_irqs_lock, flags);
+}
+
+void reaffine_perf_irqs(void)
+{
+       struct irq_desc_list *data;
+       unsigned long flags;
+
+       raw_spin_lock_irqsave(&perf_irqs_lock, flags);
+       list_for_each_entry(data, &perf_crit_irqs.list, list) {
+               struct irq_desc *desc = data->desc;
+
+               raw_spin_lock(&desc->lock);
+               affine_one_perf_irq(desc);
+               if (desc->action->thread)
+                       affine_one_perf_thread(desc->action->thread);
+               raw_spin_unlock(&desc->lock);
+       }
+       raw_spin_unlock_irqrestore(&perf_irqs_lock, flags);
+}
+
 /*
  * Internal function to register an irqaction - typically used to
  * allocate special interrupts that are part of the architecture.
@@ -1184,6 +1301,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
                        if (ret)
                                goto out_thread;
                }
+
+               if (new->flags & IRQF_PERF_CRITICAL)
+                       affine_one_perf_thread(new->thread);
        }
 
        if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
@@ -1346,7 +1466,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
                }
 
                /* Set default affinity mask once everything is setup */
-               setup_affinity(desc, mask);
+               if (new->flags & IRQF_PERF_CRITICAL)
+                       setup_perf_irq_locked(desc);
+               else
+                       setup_affinity(desc, mask);
 
        } else if (new->flags & IRQF_TRIGGER_MASK) {
                unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
@@ -1487,6 +1610,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
                action_ptr = &action->next;
        }
 
+       if (action->flags & IRQF_PERF_CRITICAL) {
+               struct irq_desc_list *data;
+
+               raw_spin_lock(&perf_irqs_lock);
+               list_for_each_entry(data, &perf_crit_irqs.list, list) {
+                       if (data->desc == desc) {
+                               list_del(&data->list);
+                               kfree(data);
+                               break;
+                       }
+               }
+               raw_spin_unlock(&perf_irqs_lock);
+       }
+
        /* Found it - now remove it from the list of entries: */
        *action_ptr = action->next;
 
kernel/sched/core.c
index ad15780..ba55586 100644
@@ -1292,6 +1292,10 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
        int ret = 0;
        cpumask_t allowed_mask;
 
+       /* Force all performance-critical kthreads onto the big cluster */
+       if (p->flags & PF_PERF_CRITICAL)
+               new_mask = cpu_perf_mask;
+
        rq = task_rq_lock(p, &flags);
 
        /*