aboutsummaryrefslogtreecommitdiffstatshomepage
diff options
context:
space:
mode:
-rw-r--r-- arch/x86/include/asm/posted_intr.h | 78
-rw-r--r-- arch/x86/kernel/irq.c | 63
-rw-r--r-- arch/x86/kvm/lapic.c | 20
-rw-r--r-- arch/x86/kvm/lapic.h | 4
-rw-r--r-- arch/x86/kvm/vmx/posted_intr.h | 2
5 files changed, 95 insertions, 72 deletions
diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h
index bb107ebbe713..a5f761fbf45b 100644
--- a/arch/x86/include/asm/posted_intr.h
+++ b/arch/x86/include/asm/posted_intr.h
@@ -1,19 +1,24 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _X86_POSTED_INTR_H
#define _X86_POSTED_INTR_H
+
+#include <asm/cmpxchg.h>
+#include <asm/rwonce.h>
#include <asm/irq_vectors.h>
+#include <linux/bitmap.h>
+
#define POSTED_INTR_ON 0
#define POSTED_INTR_SN 1
#define PID_TABLE_ENTRY_VALID 1
+#define NR_PIR_VECTORS 256
+#define NR_PIR_WORDS (NR_PIR_VECTORS / BITS_PER_LONG)
+
/* Posted-Interrupt Descriptor */
struct pi_desc {
- union {
- u32 pir[8]; /* Posted interrupt requested */
- u64 pir64[4];
- };
+ unsigned long pir[NR_PIR_WORDS]; /* Posted interrupt requested */
union {
struct {
u16 notifications; /* Suppress and outstanding bits */
@@ -26,6 +31,65 @@ struct pi_desc {
u32 rsvd[6];
} __aligned(64);
+/*
+ * De-multiplexing posted interrupts is on the performance path, the code
+ * below is written to optimize the cache performance based on the following
+ * considerations:
+ * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
+ * accessed by both CPU and IOMMU.
+ * 2.During software processing of posted interrupts, the CPU needs to do
+ * natural width read and xchg for checking and clearing posted interrupt
+ * request (PIR), a 256 bit field within the PID.
+ * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
+ * line when posting interrupts and setting control bits.
+ * 4.The CPU can access the cache line an order of magnitude faster than the IOMMU.
+ * 5.Each time the IOMMU posts an interrupt to the PIR, it evicts the PID
+ * cache line. The cache line states after each operation are as follows,
+ * assuming a 64-bit kernel:
+ *   CPU             IOMMU                   PID Cache line state
+ *   ---------------------------------------------------------------
+ *...read64                                  exclusive
+ *...lock xchg64                             modified
+ *...                post/atomic swap        invalid
+ *...-------------------------------------------------------------
+ *
+ * To reduce L1 data cache misses, it is important to avoid contention with
+ * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
+ * when processing posted interrupts in software, e.g. to dispatch interrupt
+ * handlers for posted MSIs, or to move interrupts from the PIR to the vIRR
+ * in KVM.
+ *
+ * In addition, the code is trying to keep the cache line state consistent
+ * as much as possible, e.g. when making a copy and clearing the PIR
+ * (assuming non-zero PIR bits are present in the entire PIR), it does:
+ * read, read, read, read, xchg, xchg, xchg, xchg
+ * instead of:
+ * read, xchg, read, xchg, read, xchg, read, xchg
+ */
+static __always_inline bool pi_harvest_pir(unsigned long *pir,
+ unsigned long *pir_vals)
+{
+ unsigned long pending = 0;
+ int i;
+
+ for (i = 0; i < NR_PIR_WORDS; i++) {
+ pir_vals[i] = READ_ONCE(pir[i]);
+ pending |= pir_vals[i];
+ }
+
+ if (!pending)
+ return false;
+
+ for (i = 0; i < NR_PIR_WORDS; i++) {
+ if (!pir_vals[i])
+ continue;
+
+ pir_vals[i] = arch_xchg(&pir[i], 0);
+ }
+
+ return true;
+}
+
static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
{
return test_and_set_bit(POSTED_INTR_ON, (unsigned long *)&pi_desc->control);
@@ -43,12 +107,12 @@ static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
- return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+ return test_and_set_bit(vector, pi_desc->pir);
}
static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
{
- return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+ return bitmap_empty(pi_desc->pir, NR_VECTORS);
}
static inline void pi_set_sn(struct pi_desc *pi_desc)
@@ -110,7 +174,7 @@ static inline bool pi_pending_this_cpu(unsigned int vector)
if (WARN_ON_ONCE(vector > NR_VECTORS || vector < FIRST_EXTERNAL_VECTOR))
return false;
- return test_bit(vector, (unsigned long *)pid->pir);
+ return test_bit(vector, pid->pir);
}
extern void intel_posted_msi_init(void);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 81f9b78e0f7b..9ed29ff10e59 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -380,61 +380,18 @@ void intel_posted_msi_init(void)
this_cpu_write(posted_msi_pi_desc.ndst, destination);
}
-/*
- * De-multiplexing posted interrupts is on the performance path, the code
- * below is written to optimize the cache performance based on the following
- * considerations:
- * 1.Posted interrupt descriptor (PID) fits in a cache line that is frequently
- * accessed by both CPU and IOMMU.
- * 2.During posted MSI processing, the CPU needs to do 64-bit read and xchg
- * for checking and clearing posted interrupt request (PIR), a 256 bit field
- * within the PID.
- * 3.On the other side, the IOMMU does atomic swaps of the entire PID cache
- * line when posting interrupts and setting control bits.
- * 4.The CPU can access the cache line a magnitude faster than the IOMMU.
- * 5.Each time the IOMMU does interrupt posting to the PIR will evict the PID
- * cache line. The cache line states after each operation are as follows:
- * CPU IOMMU PID Cache line state
- * ---------------------------------------------------------------
- *...read64 exclusive
- *...lock xchg64 modified
- *... post/atomic swap invalid
- *...-------------------------------------------------------------
- *
- * To reduce L1 data cache miss, it is important to avoid contention with
- * IOMMU's interrupt posting/atomic swap. Therefore, a copy of PIR is used
- * to dispatch interrupt handlers.
- *
- * In addition, the code is trying to keep the cache line state consistent
- * as much as possible. e.g. when making a copy and clearing the PIR
- * (assuming non-zero PIR bits are present in the entire PIR), it does:
- * read, read, read, read, xchg, xchg, xchg, xchg
- * instead of:
- * read, xchg, read, xchg, read, xchg, read, xchg
- */
-static __always_inline bool handle_pending_pir(u64 *pir, struct pt_regs *regs)
+static __always_inline bool handle_pending_pir(unsigned long *pir, struct pt_regs *regs)
{
- int i, vec = FIRST_EXTERNAL_VECTOR;
- unsigned long pir_copy[4];
- bool handled = false;
+ unsigned long pir_copy[NR_PIR_WORDS];
+ int vec = FIRST_EXTERNAL_VECTOR;
- for (i = 0; i < 4; i++)
- pir_copy[i] = pir[i];
-
- for (i = 0; i < 4; i++) {
- if (!pir_copy[i])
- continue;
+ if (!pi_harvest_pir(pir, pir_copy))
+ return false;
- pir_copy[i] = arch_xchg(&pir[i], 0);
- handled = true;
- }
-
- if (handled) {
- for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
- call_irq_handler(vec, regs);
- }
+ for_each_set_bit_from(vec, pir_copy, FIRST_SYSTEM_VECTOR)
+ call_irq_handler(vec, regs);
- return handled;
+ return true;
}
/*
@@ -464,7 +421,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
* MAX_POSTED_MSI_COALESCING_LOOP - 1 loops are executed here.
*/
while (++i < MAX_POSTED_MSI_COALESCING_LOOP) {
- if (!handle_pending_pir(pid->pir64, regs))
+ if (!handle_pending_pir(pid->pir, regs))
break;
}
@@ -479,7 +436,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_posted_msi_notification)
* process PIR bits one last time such that handling the new interrupts
* are not delayed until the next IRQ.
*/
- handle_pending_pir(pid->pir64, regs);
+ handle_pending_pir(pid->pir, regs);
apic_eoi();
irq_exit();
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3defbe520aed..73418dc0ebb2 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -655,27 +655,29 @@ static u8 count_vectors(void *bitmap)
return count;
}
-bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
+bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr)
{
+ unsigned long pir_vals[NR_PIR_WORDS];
+ u32 *__pir = (void *)pir_vals;
u32 i, vec;
- u32 pir_val, irr_val, prev_irr_val;
+ u32 irr_val, prev_irr_val;
int max_updated_irr;
max_updated_irr = -1;
*max_irr = -1;
+ if (!pi_harvest_pir(pir, pir_vals))
+ return false;
+
for (i = vec = 0; i <= 7; i++, vec += 32) {
u32 *p_irr = (u32 *)(regs + APIC_IRR + i * 0x10);
- irr_val = *p_irr;
- pir_val = READ_ONCE(pir[i]);
-
- if (pir_val) {
- pir_val = xchg(&pir[i], 0);
+ irr_val = READ_ONCE(*p_irr);
+ if (__pir[i]) {
prev_irr_val = irr_val;
do {
- irr_val = prev_irr_val | pir_val;
+ irr_val = prev_irr_val | __pir[i];
} while (prev_irr_val != irr_val &&
!try_cmpxchg(p_irr, &prev_irr_val, irr_val));
@@ -691,7 +693,7 @@ bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr)
}
EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
-bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr)
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr)
{
struct kvm_lapic *apic = vcpu->arch.apic;
bool irr_updated = __kvm_apic_update_irr(pir, apic->regs, max_irr);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e33c969439f7..4ce30db65828 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -103,8 +103,8 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int shorthand, unsigned int dest, int dest_mode);
int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
void kvm_apic_clear_irr(struct kvm_vcpu *vcpu, int vec);
-bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr);
-bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr);
+bool __kvm_apic_update_irr(unsigned long *pir, void *regs, int *max_irr);
+bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, unsigned long *pir, int *max_irr);
void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 68605ca7ef68..8d183983cd91 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -20,7 +20,7 @@ static inline int pi_find_highest_vector(struct pi_desc *pi_desc)
{
int vec;
- vec = find_last_bit((unsigned long *)pi_desc->pir, 256);
+ vec = find_last_bit(pi_desc->pir, 256);
return vec < 256 ? vec : -1;
}