powerpc/xive: Implement get_irqchip_state method for XIVE to fix shutdown race

author Paul Mackerras <paulus@ozlabs.org>

Tue, 13 Aug 2019 10:06:48 +0000 (20:06 +1000)

committer Michael Ellerman <mpe@ellerman.id.au>

Fri, 16 Aug 2019 04:16:59 +0000 (14:16 +1000)
author Paul Mackerras <paulus@ozlabs.org>
Tue, 13 Aug 2019 10:06:48 +0000 (20:06 +1000)
committer Michael Ellerman <mpe@ellerman.id.au>
Fri, 16 Aug 2019 04:16:59 +0000 (14:16 +1000)
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h

index e401698..efb0e59 100644 (file)
--- a/arch/powerpc/include/asm/xive.h
+++ b/arch/powerpc/include/asm/xive.h
@@ -46,7 +46,15 @@ struct xive_irq_data {
  
         /* Setup/used by frontend */
         int target;
+       /*
+        * saved_p means that there is a queue entry for this interrupt
+        * in some CPU's queue (not including guest vcpu queues), even
+        * if P is not set in the source ESB.
+        * stale_p means that there is no queue entry for this interrupt
+        * in some CPU's queue, even if P is set in the source ESB.
+        */
         bool saved_p;
+       bool stale_p;
  };
  #define XIVE_IRQ_FLAG_STORE_EOI        0x01
  #define XIVE_IRQ_FLAG_LSI      0x02
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c

index 586867e..591bfb4 100644 (file)
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -166,6 +166,9 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
          */
         vcpu->arch.xive_esc_on = false;
  
+       /* This orders xive_esc_on = false vs. subsequent stale_p = true */
+       smp_wmb();      /* goes with smp_mb() in cleanup_single_escalation */
+
         return IRQ_HANDLED;
  }
  
@@ -1119,6 +1122,31 @@ void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
         vcpu->arch.xive_esc_raddr = 0;
  }
  
+/*
+ * In single escalation mode, the escalation interrupt is marked so
+ * that EOI doesn't re-enable it, but just sets the stale_p flag to
+ * indicate that the P bit has already been dealt with.  However, the
+ * assembly code that enters the guest sets PQ to 00 without clearing
+ * stale_p (because it has no easy way to address it).  Hence we have
+ * to adjust stale_p before shutting down the interrupt.
+ */
+void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
+                                   struct kvmppc_xive_vcpu *xc, int irq)
+{
+       struct irq_data *d = irq_get_irq_data(irq);
+       struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+       /*
+        * This slightly odd sequence gives the right result
+        * (i.e. stale_p set if xive_esc_on is false) even if
+        * we race with xive_esc_irq() and xive_irq_eoi().
+        */
+       xd->stale_p = false;
+       smp_mb();               /* paired with smb_wmb in xive_esc_irq */
+       if (!vcpu->arch.xive_esc_on)
+               xd->stale_p = true;
+}
+
  void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
  {
         struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
@@ -1143,6 +1171,9 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
         /* Free escalations */
         for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
                 if (xc->esc_virq[i]) {
+                       if (xc->xive->single_escalation)
+                               xive_cleanup_single_escalation(vcpu, xc,
+                                                       xc->esc_virq[i]);
                         free_irq(xc->esc_virq[i], vcpu);
                         irq_dispose_mapping(xc->esc_virq[i]);
                         kfree(xc->esc_virq_names[i]);
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h

index 50494d0..955b820 100644 (file)
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -282,6 +282,8 @@ int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio);
  int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
                                   bool single_escalation);
  struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type);
+void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
+                                   struct kvmppc_xive_vcpu *xc, int irq);
  
  #endif /* CONFIG_KVM_XICS */
  #endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c

index 11b91b4..f0cab43 100644 (file)
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -71,6 +71,9 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
         for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
                 /* Free the escalation irq */
                 if (xc->esc_virq[i]) {
+                       if (xc->xive->single_escalation)
+                               xive_cleanup_single_escalation(vcpu, xc,
+                                                       xc->esc_virq[i]);
                         free_irq(xc->esc_virq[i], vcpu);
                         irq_dispose_mapping(xc->esc_virq[i]);
                         kfree(xc->esc_virq_names[i]);
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c

index 1cdb395..be86fce 100644 (file)
--- a/arch/powerpc/sysdev/xive/common.c
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -135,7 +135,7 @@ static u32 xive_read_eq(struct xive_q *q, bool just_peek)
  static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
  {
         u32 irq = 0;
-       u8 prio;
+       u8 prio = 0;
  
         /* Find highest pending priority */
         while (xc->pending_prio != 0) {
@@ -148,8 +148,19 @@ static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
                 irq = xive_read_eq(&xc->queue[prio], just_peek);
  
                 /* Found something ? That's it */
-               if (irq)
-                       break;
+               if (irq) {
+                       if (just_peek || irq_to_desc(irq))
+                               break;
+                       /*
+                        * We should never get here; if we do then we must
+                        * have failed to synchronize the interrupt properly
+                        * when shutting it down.
+                        */
+                       pr_crit("xive: got interrupt %d without descriptor, dropping\n",
+                               irq);
+                       WARN_ON(1);
+                       continue;
+               }
  
                 /* Clear pending bits */
                 xc->pending_prio &= ~(1 << prio);
@@ -307,6 +318,7 @@ static void xive_do_queue_eoi(struct xive_cpu *xc)
   */
  static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
  {
+       xd->stale_p = false;
         /* If the XIVE supports the new "store EOI facility, use it */
         if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
                 xive_esb_write(xd, XIVE_ESB_STORE_EOI, 0);
@@ -350,7 +362,7 @@ static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
         }
  }
  
-/* irq_chip eoi callback */
+/* irq_chip eoi callback, called with irq descriptor lock held */
  static void xive_irq_eoi(struct irq_data *d)
  {
         struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
@@ -366,6 +378,8 @@ static void xive_irq_eoi(struct irq_data *d)
         if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d) &&
             !(xd->flags & XIVE_IRQ_NO_EOI))
                 xive_do_source_eoi(irqd_to_hwirq(d), xd);
+       else
+               xd->stale_p = true;
  
         /*
          * Clear saved_p to indicate that it's no longer occupying
@@ -397,11 +411,16 @@ static void xive_do_source_set_mask(struct xive_irq_data *xd,
          */
         if (mask) {
                 val = xive_esb_read(xd, XIVE_ESB_SET_PQ_01);
-               xd->saved_p = !!(val & XIVE_ESB_VAL_P);
-       } else if (xd->saved_p)
+               if (!xd->stale_p && !!(val & XIVE_ESB_VAL_P))
+                       xd->saved_p = true;
+               xd->stale_p = false;
+       } else if (xd->saved_p) {
                 xive_esb_read(xd, XIVE_ESB_SET_PQ_10);
-       else
+               xd->saved_p = false;
+       } else {
                 xive_esb_read(xd, XIVE_ESB_SET_PQ_00);
+               xd->stale_p = false;
+       }
  }
  
  /*
@@ -541,6 +560,8 @@ static unsigned int xive_irq_startup(struct irq_data *d)
         unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
         int target, rc;
  
+       xd->saved_p = false;
+       xd->stale_p = false;
         pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n",
                  d->irq, hw_irq, d);
  
@@ -587,6 +608,7 @@ static unsigned int xive_irq_startup(struct irq_data *d)
         return 0;
  }
  
+/* called with irq descriptor lock held */
  static void xive_irq_shutdown(struct irq_data *d)
  {
         struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
@@ -602,16 +624,6 @@ static void xive_irq_shutdown(struct irq_data *d)
         xive_do_source_set_mask(xd, true);
  
         /*
-        * The above may have set saved_p. We clear it otherwise it
-        * will prevent re-enabling later on. It is ok to forget the
-        * fact that the interrupt might be in a queue because we are
-        * accounting that already in xive_dec_target_count() and will
-        * be re-routing it to a new queue with proper accounting when
-        * it's started up again
-        */
-       xd->saved_p = false;
-
-       /*
          * Mask the interrupt in HW in the IVT/EAS and set the number
          * to be the "bad" IRQ number
          */
@@ -797,6 +809,10 @@ static int xive_irq_retrigger(struct irq_data *d)
         return 1;
  }
  
+/*
+ * Caller holds the irq descriptor lock, so this won't be called
+ * concurrently with xive_get_irqchip_state on the same interrupt.
+ */
  static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
  {
         struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
@@ -820,6 +836,10 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
  
                 /* Set it to PQ=10 state to prevent further sends */
                 pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_10);
+               if (!xd->stale_p) {
+                       xd->saved_p = !!(pq & XIVE_ESB_VAL_P);
+                       xd->stale_p = !xd->saved_p;
+               }
  
                 /* No target ? nothing to do */
                 if (xd->target == XIVE_INVALID_TARGET) {
@@ -827,7 +847,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
                          * An untargetted interrupt should have been
                          * also masked at the source
                          */
-                       WARN_ON(pq & 2);
+                       WARN_ON(xd->saved_p);
  
                         return 0;
                 }
@@ -847,9 +867,8 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
                  * This saved_p is cleared by the host EOI, when we know
                  * for sure the queue slot is no longer in use.
                  */
-               if (pq & 2) {
-                       pq = xive_esb_read(xd, XIVE_ESB_SET_PQ_11);
-                       xd->saved_p = true;
+               if (xd->saved_p) {
+                       xive_esb_read(xd, XIVE_ESB_SET_PQ_11);
  
                         /*
                          * Sync the XIVE source HW to ensure the interrupt
@@ -862,8 +881,7 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
                          */
                         if (xive_ops->sync_source)
                                 xive_ops->sync_source(hw_irq);
-               } else
-                       xd->saved_p = false;
+               }
         } else {
                 irqd_clr_forwarded_to_vcpu(d);
  
@@ -914,6 +932,23 @@ static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
         return 0;
  }
  
+/* Called with irq descriptor lock held. */
+static int xive_get_irqchip_state(struct irq_data *data,
+                                 enum irqchip_irq_state which, bool *state)
+{
+       struct xive_irq_data *xd = irq_data_get_irq_handler_data(data);
+
+       switch (which) {
+       case IRQCHIP_STATE_ACTIVE:
+               *state = !xd->stale_p &&
+                        (xd->saved_p ||
+                         !!(xive_esb_read(xd, XIVE_ESB_GET) & XIVE_ESB_VAL_P));
+               return 0;
+       default:
+               return -EINVAL;
+       }
+}
+
  static struct irq_chip xive_irq_chip = {
         .name = "XIVE-IRQ",
         .irq_startup = xive_irq_startup,
@@ -925,6 +960,7 @@ static struct irq_chip xive_irq_chip = {
         .irq_set_type = xive_irq_set_type,
         .irq_retrigger = xive_irq_retrigger,
         .irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity,
+       .irq_get_irqchip_state = xive_get_irqchip_state,
  };
  
  bool is_xive_irq(struct irq_chip *chip)
@@ -1338,6 +1374,11 @@ static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc)
                 xd = irq_desc_get_handler_data(desc);
  
                 /*
+                * Clear saved_p to indicate that it's no longer pending
+                */
+               xd->saved_p = false;
+
+               /*
                  * For LSIs, we EOI, this will cause a resend if it's
                  * still asserted. Otherwise do an MSI retrigger.
                  */
author	Paul Mackerras <paulus@ozlabs.org>
	Tue, 13 Aug 2019 10:06:48 +0000 (20:06 +1000)
committer	Michael Ellerman <mpe@ellerman.id.au>
	Fri, 16 Aug 2019 04:16:59 +0000 (14:16 +1000)
arch/powerpc/include/asm/xive.h		patch \| blob \| history
arch/powerpc/kvm/book3s_xive.c		patch \| blob \| history
arch/powerpc/kvm/book3s_xive.h		patch \| blob \| history
arch/powerpc/kvm/book3s_xive_native.c		patch \| blob \| history
arch/powerpc/sysdev/xive/common.c		patch \| blob \| history