diff options
author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2018-04-04 16:11:49 -0700 |
---|---|---|
committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2018-04-04 16:11:49 -0700 |
commit | 664b0bae0b87f69bc9deb098f5e0158b9cf18e04 (patch) | |
tree | d5841492b396ff483723b9339c7c11dc33b67688 /virt | |
parent | Input: ALPS - fix TrackStick detection on Thinkpad L570 and Latitude 7370 (diff) | |
parent | Input: i8042 - enable MUX on Sony VAIO VGN-CS series to fix touchpad (diff) | |
download | linux-dev-664b0bae0b87f69bc9deb098f5e0158b9cf18e04.tar.xz linux-dev-664b0bae0b87f69bc9deb098f5e0158b9cf18e04.zip |
Merge branch 'next' into for-linus
Prepare input updates for 4.17 merge window.
Diffstat (limited to 'virt')
-rw-r--r-- | virt/kvm/Kconfig | 3 | ||||
-rw-r--r-- | virt/kvm/arm/aarch32.c | 97 | ||||
-rw-r--r-- | virt/kvm/arm/arch_timer.c | 517 | ||||
-rw-r--r-- | virt/kvm/arm/arm.c | 295 | ||||
-rw-r--r-- | virt/kvm/arm/hyp/timer-sr.c | 40 | ||||
-rw-r--r-- | virt/kvm/arm/hyp/vgic-v2-sr.c | 5 | ||||
-rw-r--r-- | virt/kvm/arm/hyp/vgic-v3-sr.c | 9 | ||||
-rw-r--r-- | virt/kvm/arm/mmio.c | 6 | ||||
-rw-r--r-- | virt/kvm/arm/mmu.c | 88 | ||||
-rw-r--r-- | virt/kvm/arm/psci.c | 143 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic-init.c | 9 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic-irqfd.c | 3 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic-its.c | 401 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic-mmio-v2.c | 22 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic-mmio-v3.c | 22 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic-mmio.c | 157 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic-v2.c | 34 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic-v3.c | 57 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic-v4.c | 366 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic.c | 168 | ||||
-rw-r--r-- | virt/kvm/arm/vgic/vgic.h | 21 | ||||
-rw-r--r-- | virt/kvm/eventfd.c | 12 | ||||
-rw-r--r-- | virt/kvm/kvm_main.c | 161 |
23 files changed, 1939 insertions, 697 deletions
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 70691c08e1ed..cca7e065a075 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -51,3 +51,6 @@ config KVM_COMPAT config HAVE_KVM_IRQ_BYPASS bool + +config HAVE_KVM_VCPU_ASYNC_IOCTL + bool diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c index 79c7c357804b..8bc479fa37e6 100644 --- a/virt/kvm/arm/aarch32.c +++ b/virt/kvm/arm/aarch32.c @@ -25,11 +25,6 @@ #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> -#ifndef CONFIG_ARM64 -#define COMPAT_PSR_T_BIT PSR_T_BIT -#define COMPAT_PSR_IT_MASK PSR_IT_MASK -#endif - /* * stolen from arch/arm/kernel/opcodes.c * @@ -150,3 +145,95 @@ void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr) *vcpu_pc(vcpu) += 4; kvm_adjust_itstate(vcpu); } + +/* + * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. + */ +static const u8 return_offsets[8][2] = { + [0] = { 0, 0 }, /* Reset, unused */ + [1] = { 4, 2 }, /* Undefined */ + [2] = { 0, 0 }, /* SVC, unused */ + [3] = { 4, 4 }, /* Prefetch abort */ + [4] = { 8, 8 }, /* Data abort */ + [5] = { 0, 0 }, /* HVC, unused */ + [6] = { 4, 4 }, /* IRQ, unused */ + [7] = { 4, 4 }, /* FIQ, unused */ +}; + +static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) +{ + unsigned long cpsr; + unsigned long new_spsr_value = *vcpu_cpsr(vcpu); + bool is_thumb = (new_spsr_value & COMPAT_PSR_T_BIT); + u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; + u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); + + cpsr = mode | COMPAT_PSR_I_BIT; + + if (sctlr & (1 << 30)) + cpsr |= COMPAT_PSR_T_BIT; + if (sctlr & (1 << 25)) + cpsr |= COMPAT_PSR_E_BIT; + + *vcpu_cpsr(vcpu) = cpsr; + + /* Note: These now point to the banked copies */ + *vcpu_spsr(vcpu) = new_spsr_value; + *vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; + + /* Branch to exception vector */ + if (sctlr & (1 << 13)) + vect_offset += 0xffff0000; + else /* always have security exceptions */ + vect_offset += vcpu_cp15(vcpu, c12_VBAR); 
+ + *vcpu_pc(vcpu) = vect_offset; +} + +void kvm_inject_undef32(struct kvm_vcpu *vcpu) +{ + prepare_fault32(vcpu, COMPAT_PSR_MODE_UND, 4); +} + +/* + * Modelled after TakeDataAbortException() and TakePrefetchAbortException + * pseudocode. + */ +static void inject_abt32(struct kvm_vcpu *vcpu, bool is_pabt, + unsigned long addr) +{ + u32 vect_offset; + u32 *far, *fsr; + bool is_lpae; + + if (is_pabt) { + vect_offset = 12; + far = &vcpu_cp15(vcpu, c6_IFAR); + fsr = &vcpu_cp15(vcpu, c5_IFSR); + } else { /* !iabt */ + vect_offset = 16; + far = &vcpu_cp15(vcpu, c6_DFAR); + fsr = &vcpu_cp15(vcpu, c5_DFSR); + } + + prepare_fault32(vcpu, COMPAT_PSR_MODE_ABT | COMPAT_PSR_A_BIT, vect_offset); + + *far = addr; + + /* Give the guest an IMPLEMENTATION DEFINED exception */ + is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31); + if (is_lpae) + *fsr = 1 << 9 | 0x34; + else + *fsr = 0x14; +} + +void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr) +{ + inject_abt32(vcpu, false, addr); +} + +void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr) +{ + inject_abt32(vcpu, true, addr); +} diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 8e89d63005c7..70f4c30918eb 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -36,6 +36,8 @@ static struct timecounter *timecounter; static unsigned int host_vtimer_irq; static u32 host_vtimer_irq_flags; +static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); + static const struct kvm_irq_level default_ptimer_irq = { .irq = 30, .level = 1, @@ -46,49 +48,57 @@ static const struct kvm_irq_level default_vtimer_irq = { .level = 1, }; -void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) -{ - vcpu_vtimer(vcpu)->active_cleared_last = false; -} +static bool kvm_timer_irq_can_fire(struct arch_timer_context *timer_ctx); +static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, + struct arch_timer_context *timer_ctx); +static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx); u64 
kvm_phys_timer_read(void) { return timecounter->cc->read(timecounter->cc); } -static bool timer_is_armed(struct arch_timer_cpu *timer) +static inline bool userspace_irqchip(struct kvm *kvm) { - return timer->armed; + return static_branch_unlikely(&userspace_irqchip_in_use) && + unlikely(!irqchip_in_kernel(kvm)); } -/* timer_arm: as in "arm the timer", not as in ARM the company */ -static void timer_arm(struct arch_timer_cpu *timer, u64 ns) +static void soft_timer_start(struct hrtimer *hrt, u64 ns) { - timer->armed = true; - hrtimer_start(&timer->timer, ktime_add_ns(ktime_get(), ns), + hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns), HRTIMER_MODE_ABS); } -static void timer_disarm(struct arch_timer_cpu *timer) +static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work) { - if (timer_is_armed(timer)) { - hrtimer_cancel(&timer->timer); - cancel_work_sync(&timer->expired); - timer->armed = false; - } + hrtimer_cancel(hrt); + if (work) + cancel_work_sync(work); } static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) { struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; + struct arch_timer_context *vtimer; /* - * We disable the timer in the world switch and let it be - * handled by kvm_timer_sync_hwstate(). Getting a timer - * interrupt at this point is a sure sign of some major - * breakage. + * We may see a timer interrupt after vcpu_put() has been called which + * sets the CPU's vcpu pointer to NULL, because even though the timer + * has been disabled in vtimer_save_state(), the hardware interrupt + * signal may not have been retired from the interrupt controller yet. 
*/ - pr_warn("Unexpected interrupt %d on vcpu %p\n", irq, vcpu); + if (!vcpu) + return IRQ_HANDLED; + + vtimer = vcpu_vtimer(vcpu); + if (kvm_timer_should_fire(vtimer)) + kvm_timer_update_irq(vcpu, true, vtimer); + + if (userspace_irqchip(vcpu->kvm) && + !static_branch_unlikely(&has_gic_active_state)) + disable_percpu_irq(host_vtimer_irq); + return IRQ_HANDLED; } @@ -158,13 +168,13 @@ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) return min(min_virt, min_phys); } -static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) +static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) { struct arch_timer_cpu *timer; struct kvm_vcpu *vcpu; u64 ns; - timer = container_of(hrt, struct arch_timer_cpu, timer); + timer = container_of(hrt, struct arch_timer_cpu, bg_timer); vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); /* @@ -182,10 +192,46 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt) return HRTIMER_NORESTART; } -bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) +static enum hrtimer_restart kvm_phys_timer_expire(struct hrtimer *hrt) +{ + struct arch_timer_context *ptimer; + struct arch_timer_cpu *timer; + struct kvm_vcpu *vcpu; + u64 ns; + + timer = container_of(hrt, struct arch_timer_cpu, phys_timer); + vcpu = container_of(timer, struct kvm_vcpu, arch.timer_cpu); + ptimer = vcpu_ptimer(vcpu); + + /* + * Check that the timer has really expired from the guest's + * PoV (NTP on the host may have forced it to expire + * early). If not ready, schedule for a later time. 
+ */ + ns = kvm_timer_compute_delta(ptimer); + if (unlikely(ns)) { + hrtimer_forward_now(hrt, ns_to_ktime(ns)); + return HRTIMER_RESTART; + } + + kvm_timer_update_irq(vcpu, true, ptimer); + return HRTIMER_NORESTART; +} + +static bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) { u64 cval, now; + if (timer_ctx->loaded) { + u32 cnt_ctl; + + /* Only the virtual timer can be loaded so far */ + cnt_ctl = read_sysreg_el0(cntv_ctl); + return (cnt_ctl & ARCH_TIMER_CTRL_ENABLE) && + (cnt_ctl & ARCH_TIMER_CTRL_IT_STAT) && + !(cnt_ctl & ARCH_TIMER_CTRL_IT_MASK); + } + if (!kvm_timer_irq_can_fire(timer_ctx)) return false; @@ -195,6 +241,17 @@ bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) return cval <= now; } +bool kvm_timer_is_pending(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + + if (kvm_timer_should_fire(vtimer)) + return true; + + return kvm_timer_should_fire(ptimer); +} + /* * Reflect the timer output level into the kvm_run structure */ @@ -207,9 +264,9 @@ void kvm_timer_update_run(struct kvm_vcpu *vcpu) /* Populate the device bitmap with the timer states */ regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | KVM_ARM_DEV_EL1_PTIMER); - if (vtimer->irq.level) + if (kvm_timer_should_fire(vtimer)) regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER; - if (ptimer->irq.level) + if (kvm_timer_should_fire(ptimer)) regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; } @@ -218,12 +275,11 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, { int ret; - timer_ctx->active_cleared_last = false; timer_ctx->irq.level = new_level; trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, timer_ctx->irq.level); - if (likely(irqchip_in_kernel(vcpu->kvm))) { + if (!userspace_irqchip(vcpu->kvm)) { ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, timer_ctx->irq.irq, timer_ctx->irq.level, @@ -232,46 +288,79 @@ static void 
kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, } } +/* Schedule the background timer for the emulated timer. */ +static void phys_timer_emulate(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + + /* + * If the timer can fire now we have just raised the IRQ line and we + * don't need to have a soft timer scheduled for the future. If the + * timer cannot fire at all, then we also don't need a soft timer. + */ + if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) { + soft_timer_cancel(&timer->phys_timer, NULL); + return; + } + + soft_timer_start(&timer->phys_timer, kvm_timer_compute_delta(ptimer)); +} + /* - * Check if there was a change in the timer state (should we raise or lower - * the line level to the GIC). + * Check if there was a change in the timer state, so that we should either + * raise or lower the line level to the GIC or schedule a background timer to + * emulate the physical timer. */ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + bool level; - /* - * If userspace modified the timer registers via SET_ONE_REG before - * the vgic was initialized, we mustn't set the vtimer->irq.level value - * because the guest would never see the interrupt. Instead wait - * until we call this function from kvm_timer_flush_hwstate. 
- */ if (unlikely(!timer->enabled)) return; - if (kvm_timer_should_fire(vtimer) != vtimer->irq.level) - kvm_timer_update_irq(vcpu, !vtimer->irq.level, vtimer); + /* + * The vtimer virtual interrupt is a 'mapped' interrupt, meaning part + * of its lifecycle is offloaded to the hardware, and we therefore may + * not have lowered the irq.level value before having to signal a new + * interrupt, but have to signal an interrupt every time the level is + * asserted. + */ + level = kvm_timer_should_fire(vtimer); + kvm_timer_update_irq(vcpu, level, vtimer); if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); + + phys_timer_emulate(vcpu); } -/* Schedule the background timer for the emulated timer. */ -static void kvm_timer_emulate(struct kvm_vcpu *vcpu, - struct arch_timer_context *timer_ctx) +static void vtimer_save_state(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + unsigned long flags; - if (kvm_timer_should_fire(timer_ctx)) - return; + local_irq_save(flags); - if (!kvm_timer_irq_can_fire(timer_ctx)) - return; + if (!vtimer->loaded) + goto out; + + if (timer->enabled) { + vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); + vtimer->cnt_cval = read_sysreg_el0(cntv_cval); + } - /* The timer has not yet expired, schedule a background timer */ - timer_arm(timer, kvm_timer_compute_delta(timer_ctx)); + /* Disable the virtual timer */ + write_sysreg_el0(0, cntv_ctl); + isb(); + + vtimer->loaded = false; +out: + local_irq_restore(flags); } /* @@ -285,7 +374,7 @@ void kvm_timer_schedule(struct kvm_vcpu *vcpu) struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); - BUG_ON(timer_is_armed(timer)); + vtimer_save_state(vcpu); /* * No need to schedule a background timer if any guest timer has @@ -306,70 +395,113 @@ void kvm_timer_schedule(struct kvm_vcpu *vcpu) * The guest timers have not 
yet expired, schedule a background timer. * Set the earliest expiration time among the guest timers. */ - timer_arm(timer, kvm_timer_earliest_exp(vcpu)); + soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu)); +} + +static void vtimer_restore_state(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + unsigned long flags; + + local_irq_save(flags); + + if (vtimer->loaded) + goto out; + + if (timer->enabled) { + write_sysreg_el0(vtimer->cnt_cval, cntv_cval); + isb(); + write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl); + } + + vtimer->loaded = true; +out: + local_irq_restore(flags); } void kvm_timer_unschedule(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - timer_disarm(timer); + + vtimer_restore_state(vcpu); + + soft_timer_cancel(&timer->bg_timer, &timer->expired); +} + +static void set_cntvoff(u64 cntvoff) +{ + u32 low = lower_32_bits(cntvoff); + u32 high = upper_32_bits(cntvoff); + + /* + * Since kvm_call_hyp doesn't fully support the ARM PCS especially on + * 32-bit systems, but rather passes register by register shifted one + * place (we put the function address in r0/x0), we cannot simply pass + * a 64-bit value as an argument, but have to split the value in two + * 32-bit halves. 
+ */ + kvm_call_hyp(__kvm_timer_set_cntvoff, low, high); +} + +static inline void set_vtimer_irq_phys_active(struct kvm_vcpu *vcpu, bool active) +{ + int r; + r = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, active); + WARN_ON(r); } -static void kvm_timer_flush_hwstate_vgic(struct kvm_vcpu *vcpu) +static void kvm_timer_vcpu_load_gic(struct kvm_vcpu *vcpu) { struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); bool phys_active; - int ret; - /* - * If we enter the guest with the virtual input level to the VGIC - * asserted, then we have already told the VGIC what we need to, and - * we don't need to exit from the guest until the guest deactivates - * the already injected interrupt, so therefore we should set the - * hardware active state to prevent unnecessary exits from the guest. - * - * Also, if we enter the guest with the virtual timer interrupt active, - * then it must be active on the physical distributor, because we set - * the HW bit and the guest must be able to deactivate the virtual and - * physical interrupt at the same time. - * - * Conversely, if the virtual input level is deasserted and the virtual - * interrupt is not active, then always clear the hardware active state - * to ensure that hardware interrupts from the timer triggers a guest - * exit. - */ - phys_active = vtimer->irq.level || - kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); + if (irqchip_in_kernel(vcpu->kvm)) + phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); + else + phys_active = vtimer->irq.level; + set_vtimer_irq_phys_active(vcpu, phys_active); +} + +static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); /* - * We want to avoid hitting the (re)distributor as much as - * possible, as this is a potentially expensive MMIO access - * (not to mention locks in the irq layer), and a solution for - * this is to cache the "active" state in memory. 
- * - * Things to consider: we cannot cache an "active set" state, - * because the HW can change this behind our back (it becomes - * "clear" in the HW). We must then restrict the caching to - * the "clear" state. - * - * The cache is invalidated on: - * - vcpu put, indicating that the HW cannot be trusted to be - * in a sane state on the next vcpu load, - * - any change in the interrupt state - * - * Usage conditions: - * - cached value is "active clear" - * - value to be programmed is "active clear" + * When using a userspace irqchip with the architected timers and a + * host interrupt controller that doesn't support an active state, we + * must still prevent continuously exiting from the guest, and + * therefore mask the physical interrupt by disabling it on the host + * interrupt controller when the virtual level is high, such that the + * guest can make forward progress. Once we detect the output level + * being de-asserted, we unmask the interrupt again so that we exit + * from the guest when the timer fires. */ - if (vtimer->active_cleared_last && !phys_active) + if (vtimer->irq.level) + disable_percpu_irq(host_vtimer_irq); + else + enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); +} + +void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + + if (unlikely(!timer->enabled)) return; - ret = irq_set_irqchip_state(host_vtimer_irq, - IRQCHIP_STATE_ACTIVE, - phys_active); - WARN_ON(ret); + if (static_branch_likely(&has_gic_active_state)) + kvm_timer_vcpu_load_gic(vcpu); + else + kvm_timer_vcpu_load_nogic(vcpu); + + set_cntvoff(vtimer->cntvoff); + + vtimer_restore_state(vcpu); - vtimer->active_cleared_last = !phys_active; + /* Set the background timer for the physical timer emulation. 
*/ + phys_timer_emulate(vcpu); } bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) @@ -385,76 +517,66 @@ bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER; plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER; - return vtimer->irq.level != vlevel || - ptimer->irq.level != plevel; + return kvm_timer_should_fire(vtimer) != vlevel || + kvm_timer_should_fire(ptimer) != plevel; } -static void kvm_timer_flush_hwstate_user(struct kvm_vcpu *vcpu) -{ - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - - /* - * To prevent continuously exiting from the guest, we mask the - * physical interrupt such that the guest can make forward progress. - * Once we detect the output level being deasserted, we unmask the - * interrupt again so that we exit from the guest when the timer - * fires. - */ - if (vtimer->irq.level) - disable_percpu_irq(host_vtimer_irq); - else - enable_percpu_irq(host_vtimer_irq, 0); -} - -/** - * kvm_timer_flush_hwstate - prepare timers before running the vcpu - * @vcpu: The vcpu pointer - * - * Check if the virtual timer has expired while we were running in the host, - * and inject an interrupt if that was the case, making sure the timer is - * masked or disabled on the host so that we keep executing. Also schedule a - * software timer for the physical timer if it is enabled. - */ -void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) +void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; if (unlikely(!timer->enabled)) return; - kvm_timer_update_state(vcpu); + vtimer_save_state(vcpu); - /* Set the background timer for the physical timer emulation. 
*/ - kvm_timer_emulate(vcpu, vcpu_ptimer(vcpu)); + /* + * Cancel the physical timer emulation, because the only case where we + * need it after a vcpu_put is in the context of a sleeping VCPU, and + * in that case we already factor in the deadline for the physical + * timer when scheduling the bg_timer. + * + * In any case, we re-schedule the hrtimer for the physical timer when + * coming back to the VCPU thread in kvm_timer_vcpu_load(). + */ + soft_timer_cancel(&timer->phys_timer, NULL); - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) - kvm_timer_flush_hwstate_user(vcpu); - else - kvm_timer_flush_hwstate_vgic(vcpu); + /* + * The kernel may decide to run userspace after calling vcpu_put, so + * we reset cntvoff to 0 to ensure a consistent read between user + * accesses to the virtual counter and kernel access to the physical + * counter. + */ + set_cntvoff(0); } -/** - * kvm_timer_sync_hwstate - sync timer state from cpu - * @vcpu: The vcpu pointer - * - * Check if any of the timers have expired while we were running in the guest, - * and inject an interrupt if that was the case. +/* + * With a userspace irqchip we have to check if the guest de-asserted the + * timer and if so, unmask the timer irq signal on the host interrupt + * controller to ensure that we see future timer signals. */ +static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + + if (!kvm_timer_should_fire(vtimer)) { + kvm_timer_update_irq(vcpu, false, vtimer); + if (static_branch_likely(&has_gic_active_state)) + set_vtimer_irq_phys_active(vcpu, false); + else + enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); + } +} + void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - /* - * This is to cancel the background timer for the physical timer - * emulation if it is set. 
- */ - timer_disarm(timer); + if (unlikely(!timer->enabled)) + return; - /* - * The guest could have modified the timer registers or the timer - * could have expired, update the timer state. - */ - kvm_timer_update_state(vcpu); + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + unmask_vtimer_irq_user(vcpu); } int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) @@ -505,8 +627,11 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) vcpu_ptimer(vcpu)->cntvoff = 0; INIT_WORK(&timer->expired, kvm_timer_inject_irq_work); - hrtimer_init(&timer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - timer->timer.function = kvm_timer_expire; + hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + timer->bg_timer.function = kvm_bg_timer_expire; + + hrtimer_init(&timer->phys_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + timer->phys_timer.function = kvm_phys_timer_expire; vtimer->irq.irq = default_vtimer_irq.irq; ptimer->irq.irq = default_ptimer_irq.irq; @@ -520,10 +645,11 @@ static void kvm_timer_init_interrupt(void *info) int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) { struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); switch (regid) { case KVM_REG_ARM_TIMER_CTL: - vtimer->cnt_ctl = value; + vtimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT; break; case KVM_REG_ARM_TIMER_CNT: update_vtimer_cntvoff(vcpu, kvm_phys_timer_read() - value); @@ -531,6 +657,13 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) case KVM_REG_ARM_TIMER_CVAL: vtimer->cnt_cval = value; break; + case KVM_REG_ARM_PTIMER_CTL: + ptimer->cnt_ctl = value & ~ARCH_TIMER_CTRL_IT_STAT; + break; + case KVM_REG_ARM_PTIMER_CVAL: + ptimer->cnt_cval = value; + break; + default: return -1; } @@ -539,17 +672,38 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value) return 0; } +static u64 read_timer_ctl(struct arch_timer_context *timer) +{ + /* + * Set ISTATUS bit if it's expired. 
+ * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is + * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit + * regardless of ENABLE bit for our implementation convenience. + */ + if (!kvm_timer_compute_delta(timer)) + return timer->cnt_ctl | ARCH_TIMER_CTRL_IT_STAT; + else + return timer->cnt_ctl; +} + u64 kvm_arm_timer_get_reg(struct kvm_vcpu *vcpu, u64 regid) { + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); switch (regid) { case KVM_REG_ARM_TIMER_CTL: - return vtimer->cnt_ctl; + return read_timer_ctl(vtimer); case KVM_REG_ARM_TIMER_CNT: return kvm_phys_timer_read() - vtimer->cntvoff; case KVM_REG_ARM_TIMER_CVAL: return vtimer->cnt_cval; + case KVM_REG_ARM_PTIMER_CTL: + return read_timer_ctl(ptimer); + case KVM_REG_ARM_PTIMER_CVAL: + return ptimer->cnt_cval; + case KVM_REG_ARM_PTIMER_CNT: + return kvm_phys_timer_read(); } return (u64)-1; } @@ -566,7 +720,7 @@ static int kvm_timer_dying_cpu(unsigned int cpu) return 0; } -int kvm_timer_hyp_init(void) +int kvm_timer_hyp_init(bool has_gic) { struct arch_timer_kvm_info *info; int err; @@ -602,11 +756,25 @@ int kvm_timer_hyp_init(void) return err; } + if (has_gic) { + err = irq_set_vcpu_affinity(host_vtimer_irq, + kvm_get_running_vcpus()); + if (err) { + kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); + goto out_free_irq; + } + + static_branch_enable(&has_gic_active_state); + } + kvm_info("virtual timer IRQ%d\n", host_vtimer_irq); cpuhp_setup_state(CPUHP_AP_KVM_ARM_TIMER_STARTING, "kvm/arm/timer:starting", kvm_timer_starting_cpu, kvm_timer_dying_cpu); + return 0; +out_free_irq: + free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); return err; } @@ -615,7 +783,8 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu) struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - timer_disarm(timer); + soft_timer_cancel(&timer->bg_timer, &timer->expired); + 
soft_timer_cancel(&timer->phys_timer, NULL); kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq); } @@ -643,13 +812,23 @@ static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu) return true; } +bool kvm_arch_timer_get_input_level(int vintid) +{ + struct kvm_vcpu *vcpu = kvm_arm_get_running_vcpu(); + struct arch_timer_context *timer; + + if (vintid == vcpu_vtimer(vcpu)->irq.irq) + timer = vcpu_vtimer(vcpu); + else + BUG(); /* We only map the vtimer so far */ + + return kvm_timer_should_fire(timer); +} + int kvm_timer_enable(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - struct irq_desc *desc; - struct irq_data *data; - int phys_irq; int ret; if (timer->enabled) @@ -667,31 +846,17 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) return -EINVAL; } - /* - * Find the physical IRQ number corresponding to the host_vtimer_irq - */ - desc = irq_to_desc(host_vtimer_irq); - if (!desc) { - kvm_err("%s: no interrupt descriptor\n", __func__); - return -EINVAL; - } - - data = irq_desc_get_irq_data(desc); - while (data->parent_data) - data = data->parent_data; - - phys_irq = data->hwirq; - - /* - * Tell the VGIC that the virtual interrupt is tied to a - * physical interrupt. We do that once per VCPU. 
- */ - ret = kvm_vgic_map_phys_irq(vcpu, vtimer->irq.irq, phys_irq); + ret = kvm_vgic_map_phys_irq(vcpu, host_vtimer_irq, vtimer->irq.irq, + kvm_arch_timer_get_input_level); if (ret) return ret; no_vgic: + preempt_disable(); timer->enabled = 1; + kvm_timer_vcpu_load(vcpu); + preempt_enable(); + return 0; } diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 95cba0799828..86941f6181bb 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -27,8 +27,11 @@ #include <linux/mman.h> #include <linux/sched.h> #include <linux/kvm.h> +#include <linux/kvm_irqfd.h> +#include <linux/irqbypass.h> #include <trace/events/kvm.h> #include <kvm/arm_pmu.h> +#include <kvm/arm_psci.h> #define CREATE_TRACE_POINTS #include "trace.h" @@ -44,15 +47,14 @@ #include <asm/kvm_mmu.h> #include <asm/kvm_emulate.h> #include <asm/kvm_coproc.h> -#include <asm/kvm_psci.h> #include <asm/sections.h> #ifdef REQUIRES_VIRT __asm__(".arch_extension virt"); #endif +DEFINE_PER_CPU(kvm_cpu_context_t, kvm_host_cpu_state); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); -static kvm_cpu_context_t __percpu *kvm_host_cpu_state; /* Per-CPU variable containing the currently running vcpu. */ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); @@ -69,17 +71,17 @@ static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); static void kvm_arm_set_running_vcpu(struct kvm_vcpu *vcpu) { - BUG_ON(preemptible()); __this_cpu_write(kvm_arm_running_vcpu, vcpu); } +DEFINE_STATIC_KEY_FALSE(userspace_irqchip_in_use); + /** * kvm_arm_get_running_vcpu - get the vcpu running on the current CPU. 
* Must be called from non-preemptible context */ struct kvm_vcpu *kvm_arm_get_running_vcpu(void) { - BUG_ON(preemptible()); return __this_cpu_read(kvm_arm_running_vcpu); } @@ -175,6 +177,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) { int i; + kvm_vgic_destroy(kvm); + free_percpu(kvm->arch.last_vcpu_ran); kvm->arch.last_vcpu_ran = NULL; @@ -184,8 +188,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm->vcpus[i] = NULL; } } - - kvm_vgic_destroy(kvm); + atomic_set(&kvm->online_vcpus, 0); } int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) @@ -292,9 +295,11 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { + if (vcpu->arch.has_run_once && unlikely(!irqchip_in_kernel(vcpu->kvm))) + static_branch_dec(&userspace_irqchip_in_use); + kvm_mmu_free_memory_caches(vcpu); kvm_timer_vcpu_terminate(vcpu); - kvm_vgic_vcpu_destroy(vcpu); kvm_pmu_vcpu_destroy(vcpu); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vcpu); @@ -307,18 +312,19 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - return kvm_timer_should_fire(vcpu_vtimer(vcpu)) || - kvm_timer_should_fire(vcpu_ptimer(vcpu)); + return kvm_timer_is_pending(vcpu); } void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { kvm_timer_schedule(vcpu); + kvm_vgic_v4_enable_doorbell(vcpu); } void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { kvm_timer_unschedule(vcpu); + kvm_vgic_v4_disable_doorbell(vcpu); } int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) @@ -351,21 +357,21 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } vcpu->cpu = cpu; - vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state); + vcpu->arch.host_cpu_context = this_cpu_ptr(&kvm_host_cpu_state); kvm_arm_set_running_vcpu(vcpu); - kvm_vgic_load(vcpu); + kvm_timer_vcpu_load(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + kvm_timer_vcpu_put(vcpu); kvm_vgic_put(vcpu); vcpu->cpu = -1; kvm_arm_set_running_vcpu(NULL); - 
kvm_timer_vcpu_put(vcpu); } static void vcpu_power_off(struct kvm_vcpu *vcpu) @@ -378,17 +384,24 @@ static void vcpu_power_off(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + vcpu_load(vcpu); + if (vcpu->arch.power_off) mp_state->mp_state = KVM_MP_STATE_STOPPED; else mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + vcpu_put(vcpu); return 0; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + int ret = 0; + + vcpu_load(vcpu); + switch (mp_state->mp_state) { case KVM_MP_STATE_RUNNABLE: vcpu->arch.power_off = false; @@ -397,10 +410,11 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, vcpu_power_off(vcpu); break; default: - return -EINVAL; + ret = -EINVAL; } - return 0; + vcpu_put(vcpu); + return ret; } /** @@ -506,7 +520,7 @@ static void update_vttbr(struct kvm *kvm) pgd_phys = virt_to_phys(kvm->arch.pgd); BUG_ON(pgd_phys & ~VTTBR_BADDR_MASK); vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits); - kvm->arch.vttbr = pgd_phys | vmid; + kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid; spin_unlock(&kvm_vmid_lock); } @@ -521,14 +535,22 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) vcpu->arch.has_run_once = true; - /* - * Map the VGIC hardware resources before running a vcpu the first - * time on this VM. - */ - if (unlikely(irqchip_in_kernel(kvm) && !vgic_ready(kvm))) { - ret = kvm_vgic_map_resources(kvm); - if (ret) - return ret; + if (likely(irqchip_in_kernel(kvm))) { + /* + * Map the VGIC hardware resources before running a vcpu the + * first time on this VM. + */ + if (unlikely(!vgic_ready(kvm))) { + ret = kvm_vgic_map_resources(kvm); + if (ret) + return ret; + } + } else { + /* + * Tell the rest of the code that there are userspace irqchip + * VMs in the wild. 
+ */ + static_branch_inc(&userspace_irqchip_in_use); } ret = kvm_timer_enable(vcpu); @@ -612,26 +634,33 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int ret; - sigset_t sigsaved; if (unlikely(!kvm_vcpu_initialized(vcpu))) return -ENOEXEC; + vcpu_load(vcpu); + ret = kvm_vcpu_first_run_init(vcpu); if (ret) - return ret; + goto out; if (run->exit_reason == KVM_EXIT_MMIO) { ret = kvm_handle_mmio_return(vcpu, vcpu->run); if (ret) - return ret; + goto out; + if (kvm_arm_handle_step_debug(vcpu, vcpu->run)) { + ret = 0; + goto out; + } + } - if (run->immediate_exit) - return -EINTR; + if (run->immediate_exit) { + ret = -EINTR; + goto out; + } - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + kvm_sigset_activate(vcpu); ret = 1; run->exit_reason = KVM_EXIT_UNKNOWN; @@ -652,27 +681,40 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) */ preempt_disable(); - kvm_pmu_flush_hwstate(vcpu); + /* Flush FP/SIMD state that can't survive guest entry/exit */ + kvm_fpsimd_flush_cpu_state(); - kvm_timer_flush_hwstate(vcpu); - kvm_vgic_flush_hwstate(vcpu); + kvm_pmu_flush_hwstate(vcpu); local_irq_disable(); + kvm_vgic_flush_hwstate(vcpu); + /* - * If we have a singal pending, or need to notify a userspace - * irqchip about timer or PMU level changes, then we exit (and - * update the timer level state in kvm_timer_update_run - * below). + * Exit if we have a signal pending so that we can deliver the + * signal to user space. 
*/ - if (signal_pending(current) || - kvm_timer_should_notify_user(vcpu) || - kvm_pmu_should_notify_user(vcpu)) { + if (signal_pending(current)) { ret = -EINTR; run->exit_reason = KVM_EXIT_INTR; } /* + * If we're using a userspace irqchip, then check if we need + * to tell a userspace irqchip about timer or PMU level + * changes and if so, exit to userspace (the actual level + * state gets updated in kvm_timer_update_run and + * kvm_pmu_update_run below). + */ + if (static_branch_unlikely(&userspace_irqchip_in_use)) { + if (kvm_timer_should_notify_user(vcpu) || + kvm_pmu_should_notify_user(vcpu)) { + ret = -EINTR; + run->exit_reason = KVM_EXIT_INTR; + } + } + + /* * Ensure we set mode to IN_GUEST_MODE after we disable * interrupts and before the final VCPU requests check. * See the comment in kvm_vcpu_exiting_guest_mode() and @@ -683,10 +725,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) || kvm_request_pending(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; - local_irq_enable(); kvm_pmu_sync_hwstate(vcpu); - kvm_timer_sync_hwstate(vcpu); + if (static_branch_unlikely(&userspace_irqchip_in_use)) + kvm_timer_sync_hwstate(vcpu); kvm_vgic_sync_hwstate(vcpu); + local_irq_enable(); preempt_enable(); continue; } @@ -698,9 +741,13 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) */ trace_kvm_entry(*vcpu_pc(vcpu)); guest_enter_irqoff(); + if (has_vhe()) + kvm_arm_vhe_guest_enter(); ret = kvm_call_hyp(__kvm_vcpu_run, vcpu); + if (has_vhe()) + kvm_arm_vhe_guest_exit(); vcpu->mode = OUTSIDE_GUEST_MODE; vcpu->stat.exits++; /* @@ -710,6 +757,28 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_arm_clear_debug(vcpu); /* + * We must sync the PMU state before the vgic state so + * that the vgic can properly sample the updated state of the + * interrupt line. 
+ */ + kvm_pmu_sync_hwstate(vcpu); + + /* + * Sync the vgic state before syncing the timer state because + * the timer code needs to know if the virtual timer + * interrupts are active. + */ + kvm_vgic_sync_hwstate(vcpu); + + /* + * Sync the timer hardware state before enabling interrupts as + * we don't want vtimer interrupts to race with syncing the + * timer virtual interrupt state. + */ + if (static_branch_unlikely(&userspace_irqchip_in_use)) + kvm_timer_sync_hwstate(vcpu); + + /* * We may have taken a host interrupt in HYP mode (ie * while executing the guest). This interrupt is still * pending, as we haven't serviced it yet! @@ -732,15 +801,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) guest_exit(); trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); - /* - * We must sync the PMU and timer state before the vgic state so - * that the vgic can properly sample the updated state of the - * interrupt line. - */ - kvm_pmu_sync_hwstate(vcpu); - kvm_timer_sync_hwstate(vcpu); - - kvm_vgic_sync_hwstate(vcpu); + /* Exit types that need handling before we can be preempted */ + handle_exit_early(vcpu, run, ret); preempt_enable(); @@ -753,8 +815,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_pmu_update_run(vcpu); } - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); + kvm_sigset_deactivate(vcpu); + +out: + vcpu_put(vcpu); return ret; } @@ -970,66 +1034,88 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; struct kvm_device_attr attr; + long r; + + vcpu_load(vcpu); switch (ioctl) { case KVM_ARM_VCPU_INIT: { struct kvm_vcpu_init init; + r = -EFAULT; if (copy_from_user(&init, argp, sizeof(init))) - return -EFAULT; + break; - return kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init); + r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init); + break; } case KVM_SET_ONE_REG: case KVM_GET_ONE_REG: { struct 
kvm_one_reg reg; + r = -ENOEXEC; if (unlikely(!kvm_vcpu_initialized(vcpu))) - return -ENOEXEC; + break; + r = -EFAULT; if (copy_from_user(&reg, argp, sizeof(reg))) - return -EFAULT; + break; + if (ioctl == KVM_SET_ONE_REG) - return kvm_arm_set_reg(vcpu, &reg); + r = kvm_arm_set_reg(vcpu, &reg); else - return kvm_arm_get_reg(vcpu, &reg); + r = kvm_arm_get_reg(vcpu, &reg); + break; } case KVM_GET_REG_LIST: { struct kvm_reg_list __user *user_list = argp; struct kvm_reg_list reg_list; unsigned n; + r = -ENOEXEC; if (unlikely(!kvm_vcpu_initialized(vcpu))) - return -ENOEXEC; + break; + r = -EFAULT; if (copy_from_user(&reg_list, user_list, sizeof(reg_list))) - return -EFAULT; + break; n = reg_list.n; reg_list.n = kvm_arm_num_regs(vcpu); if (copy_to_user(user_list, &reg_list, sizeof(reg_list))) - return -EFAULT; + break; + r = -E2BIG; if (n < reg_list.n) - return -E2BIG; - return kvm_arm_copy_reg_indices(vcpu, user_list->reg); + break; + r = kvm_arm_copy_reg_indices(vcpu, user_list->reg); + break; } case KVM_SET_DEVICE_ATTR: { + r = -EFAULT; if (copy_from_user(&attr, argp, sizeof(attr))) - return -EFAULT; - return kvm_arm_vcpu_set_attr(vcpu, &attr); + break; + r = kvm_arm_vcpu_set_attr(vcpu, &attr); + break; } case KVM_GET_DEVICE_ATTR: { + r = -EFAULT; if (copy_from_user(&attr, argp, sizeof(attr))) - return -EFAULT; - return kvm_arm_vcpu_get_attr(vcpu, &attr); + break; + r = kvm_arm_vcpu_get_attr(vcpu, &attr); + break; } case KVM_HAS_DEVICE_ATTR: { + r = -EFAULT; if (copy_from_user(&attr, argp, sizeof(attr))) - return -EFAULT; - return kvm_arm_vcpu_has_attr(vcpu, &attr); + break; + r = kvm_arm_vcpu_has_attr(vcpu, &attr); + break; } default: - return -EINVAL; + r = -EINVAL; } + + vcpu_put(vcpu); + return r; } /** @@ -1141,7 +1227,7 @@ static void cpu_init_hyp_mode(void *dummy) pgd_ptr = kvm_mmu_get_httbr(); stack_page = __this_cpu_read(kvm_arm_hyp_stack_page); hyp_stack_ptr = stack_page + PAGE_SIZE; - vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector); + vector_ptr = (unsigned
long)kvm_get_hyp_vector(); __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr); __cpu_init_stage2(); @@ -1222,6 +1308,7 @@ static int hyp_init_cpu_pm_notifier(struct notifier_block *self, cpu_hyp_reset(); return NOTIFY_OK; + case CPU_PM_ENTER_FAILED: case CPU_PM_EXIT: if (__this_cpu_read(kvm_arm_hardware_enabled)) /* The hardware was enabled before suspend. */ @@ -1255,19 +1342,8 @@ static inline void hyp_cpu_pm_exit(void) } #endif -static void teardown_common_resources(void) -{ - free_percpu(kvm_host_cpu_state); -} - static int init_common_resources(void) { - kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t); - if (!kvm_host_cpu_state) { - kvm_err("Cannot allocate host CPU state\n"); - return -ENOMEM; - } - /* set size of VMID supported by CPU */ kvm_vmid_bits = kvm_get_vmid_bits(); kvm_info("%d-bit VMID\n", kvm_vmid_bits); @@ -1309,7 +1385,7 @@ static int init_subsystems(void) /* * Init HYP architected timer support */ - err = kvm_timer_hyp_init(); + err = kvm_timer_hyp_init(vgic_present); if (err) goto out; @@ -1386,6 +1462,12 @@ static int init_hyp_mode(void) goto out_err; } + err = kvm_map_vectors(); + if (err) { + kvm_err("Cannot map vectors\n"); + goto out_err; + } + /* * Map the Hyp stack pages */ @@ -1403,7 +1485,7 @@ static int init_hyp_mode(void) for_each_possible_cpu(cpu) { kvm_cpu_context_t *cpu_ctxt; - cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu); + cpu_ctxt = per_cpu_ptr(&kvm_host_cpu_state, cpu); err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP); if (err) { @@ -1438,6 +1520,46 @@ struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) return NULL; } +bool kvm_arch_has_irq_bypass(void) +{ + return true; +} + +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + return kvm_vgic_v4_set_forwarding(irqfd->kvm, prod->irq, + &irqfd->irq_entry); +} +void 
kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + kvm_vgic_v4_unset_forwarding(irqfd->kvm, prod->irq, + &irqfd->irq_entry); +} + +void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *cons) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + kvm_arm_halt_guest(irqfd->kvm); +} + +void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + + kvm_arm_resume_guest(irqfd->kvm); +} + /** * Initialize Hyp-mode and memory mappings on all CPUs. */ @@ -1448,7 +1570,7 @@ int kvm_arch_init(void *opaque) bool in_hyp_mode; if (!is_hyp_mode_available()) { - kvm_err("HYP mode not available\n"); + kvm_info("HYP mode not available\n"); return -ENODEV; } @@ -1487,7 +1609,6 @@ out_hyp: if (!in_hyp_mode) teardown_hyp_mode(); out_err: - teardown_common_resources(); return err; } diff --git a/virt/kvm/arm/hyp/timer-sr.c b/virt/kvm/arm/hyp/timer-sr.c index 4734915ab71f..f24404b3c8df 100644 --- a/virt/kvm/arm/hyp/timer-sr.c +++ b/virt/kvm/arm/hyp/timer-sr.c @@ -21,44 +21,33 @@ #include <asm/kvm_hyp.h> -/* vcpu is already in the HYP VA space */ -void __hyp_text __timer_save_state(struct kvm_vcpu *vcpu) +void __hyp_text __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - u64 val; - - if (timer->enabled) { - vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); - vtimer->cnt_cval = read_sysreg_el0(cntv_cval); - } - - /* Disable the virtual timer */ - write_sysreg_el0(0, cntv_ctl); + u64 cntvoff = (u64)cntvoff_high << 32 | cntvoff_low; + write_sysreg(cntvoff, cntvoff_el2); +} +void __hyp_text __timer_disable_traps(struct kvm_vcpu *vcpu) +{ /* * We don't need to do this for VHE 
since the host kernel runs in EL2 * with HCR_EL2.TGE ==1, which makes those bits have no impact. */ if (!has_vhe()) { + u64 val; + /* Allow physical timer/counter access for the host */ val = read_sysreg(cnthctl_el2); val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN; write_sysreg(val, cnthctl_el2); } - - /* Clear cntvoff for the host */ - write_sysreg(0, cntvoff_el2); } -void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu) +void __hyp_text __timer_enable_traps(struct kvm_vcpu *vcpu) { - struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - u64 val; - - /* Those bits are already configured at boot on VHE-system */ if (!has_vhe()) { + u64 val; + /* * Disallow physical timer access for the guest * Physical counter access is allowed @@ -68,11 +57,4 @@ void __hyp_text __timer_restore_state(struct kvm_vcpu *vcpu) val |= CNTHCTL_EL1PCTEN; write_sysreg(val, cnthctl_el2); } - - if (timer->enabled) { - write_sysreg(vtimer->cntvoff, cntvoff_el2); - write_sysreg_el0(vtimer->cnt_cval, cntv_cval); - isb(); - write_sysreg_el0(vtimer->cnt_ctl, cntv_ctl); - } } diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index a3f18d362366..4fe6e797e8b3 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -21,6 +21,7 @@ #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> +#include <asm/kvm_mmu.h> static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) { @@ -34,11 +35,7 @@ static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) else elrsr1 = 0; -#ifdef CONFIG_CPU_BIG_ENDIAN - cpu_if->vgic_elrsr = ((u64)elrsr0 << 32) | elrsr1; -#else cpu_if->vgic_elrsr = ((u64)elrsr1 << 32) | elrsr0; -#endif } static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 91728faa13fd..f5c3d6d7019e 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ 
b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -258,7 +258,8 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) cpu_if->vgic_ap1r[0] = __vgic_v3_read_ap1rn(0); } } else { - if (static_branch_unlikely(&vgic_v3_cpuif_trap)) + if (static_branch_unlikely(&vgic_v3_cpuif_trap) || + cpu_if->its_vpe.its_vm) write_gicreg(0, ICH_HCR_EL2); cpu_if->vgic_elrsr = 0xffff; @@ -337,9 +338,11 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) /* * If we need to trap system registers, we must write * ICH_HCR_EL2 anyway, even if no interrupts are being - * injected, + * injected. Same thing if GICv4 is used, as VLPI + * delivery is gated by ICH_HCR_EL2.En. */ - if (static_branch_unlikely(&vgic_v3_cpuif_trap)) + if (static_branch_unlikely(&vgic_v3_cpuif_trap) || + cpu_if->its_vpe.its_vm) write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); } diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c index b6e715fd3c90..dac7ceb1a677 100644 --- a/virt/kvm/arm/mmio.c +++ b/virt/kvm/arm/mmio.c @@ -112,7 +112,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) } trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, - data); + &data); data = vcpu_data_host_to_guest(vcpu, data, len); vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); } @@ -182,14 +182,14 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt), len); - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data); + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data); kvm_mmio_write_buf(data_buf, len, data); ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); } else { trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len, - fault_ipa, 0); + fault_ipa, NULL); ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index b36945d49986..ec62d1cccab7 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -509,8 +509,6 @@ static void 
unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size) */ void free_hyp_pgds(void) { - unsigned long addr; - mutex_lock(&kvm_hyp_pgd_mutex); if (boot_hyp_pgd) { @@ -521,10 +519,10 @@ void free_hyp_pgds(void) if (hyp_pgd) { unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE); - for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE) - unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE); - for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE) - unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE); + unmap_hyp_range(hyp_pgd, kern_hyp_va(PAGE_OFFSET), + (uintptr_t)high_memory - PAGE_OFFSET); + unmap_hyp_range(hyp_pgd, kern_hyp_va(VMALLOC_START), + VMALLOC_END - VMALLOC_START); free_pages((unsigned long)hyp_pgd, hyp_pgd_order); hyp_pgd = NULL; @@ -623,7 +621,7 @@ static int create_hyp_pud_mappings(pgd_t *pgd, unsigned long start, return 0; } -static int __create_hyp_mappings(pgd_t *pgdp, +static int __create_hyp_mappings(pgd_t *pgdp, unsigned long ptrs_per_pgd, unsigned long start, unsigned long end, unsigned long pfn, pgprot_t prot) { @@ -636,7 +634,7 @@ static int __create_hyp_mappings(pgd_t *pgdp, addr = start & PAGE_MASK; end = PAGE_ALIGN(end); do { - pgd = pgdp + pgd_index(addr); + pgd = pgdp + ((addr >> PGDIR_SHIFT) & (ptrs_per_pgd - 1)); if (pgd_none(*pgd)) { pud = pud_alloc_one(NULL, addr); @@ -699,8 +697,8 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot) int err; phys_addr = kvm_kaddr_to_phys(from + virt_addr - start); - err = __create_hyp_mappings(hyp_pgd, virt_addr, - virt_addr + PAGE_SIZE, + err = __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, + virt_addr, virt_addr + PAGE_SIZE, __phys_to_pfn(phys_addr), prot); if (err) @@ -731,7 +729,7 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr) if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1)) return -EINVAL; - return __create_hyp_mappings(hyp_pgd, start, end, + return __create_hyp_mappings(hyp_pgd, PTRS_PER_PGD, start, end, 
__phys_to_pfn(phys_addr), PAGE_HYP_DEVICE); } @@ -926,6 +924,25 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache return 0; } +static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr) +{ + pmd_t *pmdp; + pte_t *ptep; + + pmdp = stage2_get_pmd(kvm, NULL, addr); + if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp)) + return false; + + if (pmd_thp_or_huge(*pmdp)) + return kvm_s2pmd_exec(pmdp); + + ptep = pte_offset_kernel(pmdp, addr); + if (!ptep || pte_none(*ptep) || !pte_present(*ptep)) + return false; + + return kvm_s2pte_exec(ptep); +} + static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, phys_addr_t addr, const pte_t *new_pte, unsigned long flags) @@ -1257,10 +1274,14 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn, - unsigned long size) +static void clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) { - __coherent_cache_guest_page(vcpu, pfn, size); + __clean_dcache_guest_page(pfn, size); +} + +static void invalidate_icache_guest_page(kvm_pfn_t pfn, unsigned long size) +{ + __invalidate_icache_guest_page(pfn, size); } static void kvm_send_hwpoison_signal(unsigned long address, @@ -1286,7 +1307,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, unsigned long fault_status) { int ret; - bool write_fault, writable, hugetlb = false, force_pte = false; + bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false; unsigned long mmu_seq; gfn_t gfn = fault_ipa >> PAGE_SHIFT; struct kvm *kvm = vcpu->kvm; @@ -1298,7 +1319,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, unsigned long flags = 0; write_fault = kvm_is_write_fault(vcpu); - if (fault_status == FSC_PERM && !write_fault) { + exec_fault = kvm_vcpu_trap_is_iabt(vcpu); + VM_BUG_ON(write_fault && exec_fault); + + if 
(fault_status == FSC_PERM && !write_fault && !exec_fault) { kvm_err("Unexpected L2 read permission error\n"); return -EFAULT; } @@ -1312,7 +1336,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, return -EFAULT; } - if (is_vm_hugetlb_page(vma) && !logging_active) { + if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) { hugetlb = true; gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT; } else { @@ -1391,7 +1415,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, new_pmd = kvm_s2pmd_mkwrite(new_pmd); kvm_set_pfn_dirty(pfn); } - coherent_cache_guest_page(vcpu, pfn, PMD_SIZE); + + if (fault_status != FSC_PERM) + clean_dcache_guest_page(pfn, PMD_SIZE); + + if (exec_fault) { + new_pmd = kvm_s2pmd_mkexec(new_pmd); + invalidate_icache_guest_page(pfn, PMD_SIZE); + } else if (fault_status == FSC_PERM) { + /* Preserve execute if XN was already cleared */ + if (stage2_is_exec(kvm, fault_ipa)) + new_pmd = kvm_s2pmd_mkexec(new_pmd); + } + ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd); } else { pte_t new_pte = pfn_pte(pfn, mem_type); @@ -1401,7 +1437,19 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, kvm_set_pfn_dirty(pfn); mark_page_dirty(kvm, gfn); } - coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE); + + if (fault_status != FSC_PERM) + clean_dcache_guest_page(pfn, PAGE_SIZE); + + if (exec_fault) { + new_pte = kvm_s2pte_mkexec(new_pte); + invalidate_icache_guest_page(pfn, PAGE_SIZE); + } else if (fault_status == FSC_PERM) { + /* Preserve execute if XN was already cleared */ + if (stage2_is_exec(kvm, fault_ipa)) + new_pte = kvm_s2pte_mkexec(new_pte); + } + ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags); } @@ -1737,7 +1785,7 @@ static int kvm_map_idmap_text(pgd_t *pgd) int err; /* Create the idmap in the boot page tables */ - err = __create_hyp_mappings(pgd, + err = __create_hyp_mappings(pgd, __kvm_idmap_ptrs_per_pgd(), hyp_idmap_start, hyp_idmap_end, 
__phys_to_pfn(hyp_idmap_start), PAGE_HYP_EXEC); diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c index f1e363bab5e8..6919352cbf15 100644 --- a/virt/kvm/arm/psci.c +++ b/virt/kvm/arm/psci.c @@ -15,16 +15,16 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/arm-smccc.h> #include <linux/preempt.h> #include <linux/kvm_host.h> #include <linux/wait.h> #include <asm/cputype.h> #include <asm/kvm_emulate.h> -#include <asm/kvm_psci.h> #include <asm/kvm_host.h> -#include <uapi/linux/psci.h> +#include <kvm/arm_psci.h> /* * This is an implementation of the Power State Coordination Interface @@ -33,6 +33,38 @@ #define AFFINITY_MASK(level) ~((0x1UL << ((level) * MPIDR_LEVEL_BITS)) - 1) +static u32 smccc_get_function(struct kvm_vcpu *vcpu) +{ + return vcpu_get_reg(vcpu, 0); +} + +static unsigned long smccc_get_arg1(struct kvm_vcpu *vcpu) +{ + return vcpu_get_reg(vcpu, 1); +} + +static unsigned long smccc_get_arg2(struct kvm_vcpu *vcpu) +{ + return vcpu_get_reg(vcpu, 2); +} + +static unsigned long smccc_get_arg3(struct kvm_vcpu *vcpu) +{ + return vcpu_get_reg(vcpu, 3); +} + +static void smccc_set_retval(struct kvm_vcpu *vcpu, + unsigned long a0, + unsigned long a1, + unsigned long a2, + unsigned long a3) +{ + vcpu_set_reg(vcpu, 0, a0); + vcpu_set_reg(vcpu, 1, a1); + vcpu_set_reg(vcpu, 2, a2); + vcpu_set_reg(vcpu, 3, a3); +} + static unsigned long psci_affinity_mask(unsigned long affinity_level) { if (affinity_level <= 3) @@ -78,7 +110,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) unsigned long context_id; phys_addr_t target_pc; - cpu_id = vcpu_get_reg(source_vcpu, 1) & MPIDR_HWID_BITMASK; + cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK; if (vcpu_mode_is_32bit(source_vcpu)) cpu_id &= ~((u32) 0); @@ -91,14 +123,14 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) if (!vcpu) return PSCI_RET_INVALID_PARAMS; if (!vcpu->arch.power_off) { - if (kvm_psci_version(source_vcpu) != 
KVM_ARM_PSCI_0_1) + if (kvm_psci_version(source_vcpu, kvm) != KVM_ARM_PSCI_0_1) return PSCI_RET_ALREADY_ON; else return PSCI_RET_INVALID_PARAMS; } - target_pc = vcpu_get_reg(source_vcpu, 2); - context_id = vcpu_get_reg(source_vcpu, 3); + target_pc = smccc_get_arg2(source_vcpu); + context_id = smccc_get_arg3(source_vcpu); kvm_reset_vcpu(vcpu); @@ -117,7 +149,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) * NOTE: We always update r0 (or x0) because for PSCI v0.1 * the general puspose registers are undefined upon CPU_ON. */ - vcpu_set_reg(vcpu, 0, context_id); + smccc_set_retval(vcpu, context_id, 0, 0, 0); vcpu->arch.power_off = false; smp_mb(); /* Make sure the above is visible */ @@ -137,8 +169,8 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) struct kvm *kvm = vcpu->kvm; struct kvm_vcpu *tmp; - target_affinity = vcpu_get_reg(vcpu, 1); - lowest_affinity_level = vcpu_get_reg(vcpu, 2); + target_affinity = smccc_get_arg1(vcpu); + lowest_affinity_level = smccc_get_arg2(vcpu); /* Determine target affinity mask */ target_affinity_mask = psci_affinity_mask(lowest_affinity_level); @@ -200,18 +232,10 @@ static void kvm_psci_system_reset(struct kvm_vcpu *vcpu) kvm_prepare_system_event(vcpu, KVM_SYSTEM_EVENT_RESET); } -int kvm_psci_version(struct kvm_vcpu *vcpu) -{ - if (test_bit(KVM_ARM_VCPU_PSCI_0_2, vcpu->arch.features)) - return KVM_ARM_PSCI_0_2; - - return KVM_ARM_PSCI_0_1; -} - static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0); + u32 psci_fn = smccc_get_function(vcpu); unsigned long val; int ret = 1; @@ -221,7 +245,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) * Bits[31:16] = Major Version = 0 * Bits[15:0] = Minor Version = 2 */ - val = 2; + val = KVM_ARM_PSCI_0_2; break; case PSCI_0_2_FN_CPU_SUSPEND: case PSCI_0_2_FN64_CPU_SUSPEND: @@ -278,14 +302,56 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) break; } - 
vcpu_set_reg(vcpu, 0, val); + smccc_set_retval(vcpu, val, 0, 0, 0); + return ret; +} + +static int kvm_psci_1_0_call(struct kvm_vcpu *vcpu) +{ + u32 psci_fn = smccc_get_function(vcpu); + u32 feature; + unsigned long val; + int ret = 1; + + switch(psci_fn) { + case PSCI_0_2_FN_PSCI_VERSION: + val = KVM_ARM_PSCI_1_0; + break; + case PSCI_1_0_FN_PSCI_FEATURES: + feature = smccc_get_arg1(vcpu); + switch(feature) { + case PSCI_0_2_FN_PSCI_VERSION: + case PSCI_0_2_FN_CPU_SUSPEND: + case PSCI_0_2_FN64_CPU_SUSPEND: + case PSCI_0_2_FN_CPU_OFF: + case PSCI_0_2_FN_CPU_ON: + case PSCI_0_2_FN64_CPU_ON: + case PSCI_0_2_FN_AFFINITY_INFO: + case PSCI_0_2_FN64_AFFINITY_INFO: + case PSCI_0_2_FN_MIGRATE_INFO_TYPE: + case PSCI_0_2_FN_SYSTEM_OFF: + case PSCI_0_2_FN_SYSTEM_RESET: + case PSCI_1_0_FN_PSCI_FEATURES: + case ARM_SMCCC_VERSION_FUNC_ID: + val = 0; + break; + default: + val = PSCI_RET_NOT_SUPPORTED; + break; + } + break; + default: + return kvm_psci_0_2_call(vcpu); + } + + smccc_set_retval(vcpu, val, 0, 0, 0); return ret; } static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) { struct kvm *kvm = vcpu->kvm; - unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0); + u32 psci_fn = smccc_get_function(vcpu); unsigned long val; switch (psci_fn) { @@ -303,7 +369,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) break; } - vcpu_set_reg(vcpu, 0, val); + smccc_set_retval(vcpu, val, 0, 0, 0); return 1; } @@ -321,9 +387,11 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) * Errors: * -EINVAL: Unrecognized PSCI function */ -int kvm_psci_call(struct kvm_vcpu *vcpu) +static int kvm_psci_call(struct kvm_vcpu *vcpu) { - switch (kvm_psci_version(vcpu)) { + switch (kvm_psci_version(vcpu, vcpu->kvm)) { + case KVM_ARM_PSCI_1_0: + return kvm_psci_1_0_call(vcpu); case KVM_ARM_PSCI_0_2: return kvm_psci_0_2_call(vcpu); case KVM_ARM_PSCI_0_1: @@ -332,3 +400,30 @@ int kvm_psci_call(struct kvm_vcpu *vcpu) return -EINVAL; }; } + +int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) +{ + u32 
func_id = smccc_get_function(vcpu); + u32 val = PSCI_RET_NOT_SUPPORTED; + u32 feature; + + switch (func_id) { + case ARM_SMCCC_VERSION_FUNC_ID: + val = ARM_SMCCC_VERSION_1_1; + break; + case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: + feature = smccc_get_arg1(vcpu); + switch(feature) { + case ARM_SMCCC_ARCH_WORKAROUND_1: + if (kvm_arm_harden_branch_predictor()) + val = 0; + break; + } + break; + default: + return kvm_psci_call(vcpu); + } + + smccc_set_retval(vcpu, val, 0, 0, 0); + return 1; +} diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index 5801261f3add..743ca5cb05ef 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -285,6 +285,12 @@ int vgic_init(struct kvm *kvm) if (ret) goto out; + if (vgic_has_its(kvm)) { + ret = vgic_v4_init(kvm); + if (ret) + goto out; + } + kvm_for_each_vcpu(i, vcpu, kvm) kvm_vgic_vcpu_enable(vcpu); @@ -320,6 +326,9 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm) kfree(dist->spis); dist->nr_spis = 0; + + if (vgic_supports_direct_msis(kvm)) + vgic_v4_teardown(kvm); } void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/arm/vgic/vgic-irqfd.c b/virt/kvm/arm/vgic/vgic-irqfd.c index b7baf581611a..99e026d2dade 100644 --- a/virt/kvm/arm/vgic/vgic-irqfd.c +++ b/virt/kvm/arm/vgic/vgic-irqfd.c @@ -112,8 +112,7 @@ int kvm_vgic_setup_default_irq_routing(struct kvm *kvm) u32 nr = dist->nr_spis; int i, ret; - entries = kcalloc(nr, sizeof(struct kvm_kernel_irq_routing_entry), - GFP_KERNEL); + entries = kcalloc(nr, sizeof(*entries), GFP_KERNEL); if (!entries) return -ENOMEM; diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c index 547f12dc4d54..465095355666 100644 --- a/virt/kvm/arm/vgic/vgic-its.c +++ b/virt/kvm/arm/vgic/vgic-its.c @@ -38,7 +38,7 @@ static int vgic_its_save_tables_v0(struct vgic_its *its); static int vgic_its_restore_tables_v0(struct vgic_its *its); static int vgic_its_commit_v0(struct vgic_its *its); static int update_lpi_config(struct 
kvm *kvm, struct vgic_irq *irq, - struct kvm_vcpu *filter_vcpu); + struct kvm_vcpu *filter_vcpu, bool needs_inv); /* * Creates a new (reference to a) struct vgic_irq for a given LPI. @@ -106,7 +106,7 @@ out_unlock: * However we only have those structs for mapped IRQs, so we read in * the respective config data from memory here upon mapping the LPI. */ - ret = update_lpi_config(kvm, irq, NULL); + ret = update_lpi_config(kvm, irq, NULL, false); if (ret) return ERR_PTR(ret); @@ -273,11 +273,12 @@ static struct its_collection *find_collection(struct vgic_its *its, int coll_id) * VCPU. Unconditionally applies if filter_vcpu is NULL. */ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, - struct kvm_vcpu *filter_vcpu) + struct kvm_vcpu *filter_vcpu, bool needs_inv) { u64 propbase = GICR_PROPBASER_ADDRESS(kvm->arch.vgic.propbaser); u8 prop; int ret; + unsigned long flags; ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET, &prop, 1); @@ -285,17 +286,23 @@ static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq, if (ret) return ret; - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); if (!filter_vcpu || filter_vcpu == irq->target_vcpu) { irq->priority = LPI_PROP_PRIORITY(prop); irq->enabled = LPI_PROP_ENABLE_BIT(prop); - vgic_queue_irq_unlock(kvm, irq); - } else { - spin_unlock(&irq->irq_lock); + if (!irq->hw) { + vgic_queue_irq_unlock(kvm, irq, flags); + return 0; + } } + spin_unlock_irqrestore(&irq->irq_lock, flags); + + if (irq->hw) + return its_prop_update_vlpi(irq->host_irq, prop, needs_inv); + return 0; } @@ -335,6 +342,29 @@ static int vgic_copy_lpi_list(struct kvm_vcpu *vcpu, u32 **intid_ptr) return i; } +static int update_affinity(struct vgic_irq *irq, struct kvm_vcpu *vcpu) +{ + int ret = 0; + + spin_lock(&irq->irq_lock); + irq->target_vcpu = vcpu; + spin_unlock(&irq->irq_lock); + + if (irq->hw) { + struct its_vlpi_map map; + + ret = its_get_vlpi(irq->host_irq, &map); + if (ret) + return ret; + + 
map.vpe = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + + ret = its_map_vlpi(irq->host_irq, &map); + } + + return ret; +} + /* * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI * is targeting) to the VGIC's view, which deals with target VCPUs. @@ -349,10 +379,7 @@ static void update_affinity_ite(struct kvm *kvm, struct its_ite *ite) return; vcpu = kvm_get_vcpu(kvm, ite->collection->target_addr); - - spin_lock(&ite->irq->irq_lock); - ite->irq->target_vcpu = vcpu; - spin_unlock(&ite->irq->irq_lock); + update_affinity(ite->irq, vcpu); } /* @@ -393,6 +420,8 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) int ret = 0; u32 *intids; int nr_irqs, i; + unsigned long flags; + u8 pendmask; nr_irqs = vgic_copy_lpi_list(vcpu, &intids); if (nr_irqs < 0) @@ -400,7 +429,6 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) for (i = 0; i < nr_irqs; i++) { int byte_offset, bit_nr; - u8 pendmask; byte_offset = intids[i] / BITS_PER_BYTE; bit_nr = intids[i] % BITS_PER_BYTE; @@ -420,9 +448,9 @@ static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu) } irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); irq->pending_latch = pendmask & (1U << bit_nr); - vgic_queue_irq_unlock(vcpu->kvm, irq); + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } @@ -503,15 +531,8 @@ static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm, return 0; } -/* - * Find the target VCPU and the LPI number for a given devid/eventid pair - * and make this IRQ pending, possibly injecting it. - * Must be called with the its_lock mutex held. - * Returns 0 on success, a positive error value for any ITS mapping - * related errors and negative error values for generic errors. 
- */ -static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, - u32 devid, u32 eventid) +int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid, struct vgic_irq **irq) { struct kvm_vcpu *vcpu; struct its_ite *ite; @@ -530,26 +551,65 @@ static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, if (!vcpu->arch.vgic_cpu.lpis_enabled) return -EBUSY; - spin_lock(&ite->irq->irq_lock); - ite->irq->pending_latch = true; - vgic_queue_irq_unlock(kvm, ite->irq); - + *irq = ite->irq; return 0; } -static struct vgic_io_device *vgic_get_its_iodev(struct kvm_io_device *dev) +struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi) { + u64 address; + struct kvm_io_device *kvm_io_dev; struct vgic_io_device *iodev; - if (dev->ops != &kvm_io_gic_ops) - return NULL; + if (!vgic_has_its(kvm)) + return ERR_PTR(-ENODEV); - iodev = container_of(dev, struct vgic_io_device, dev); + if (!(msi->flags & KVM_MSI_VALID_DEVID)) + return ERR_PTR(-EINVAL); + address = (u64)msi->address_hi << 32 | msi->address_lo; + + kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address); + if (!kvm_io_dev) + return ERR_PTR(-EINVAL); + + if (kvm_io_dev->ops != &kvm_io_gic_ops) + return ERR_PTR(-EINVAL); + + iodev = container_of(kvm_io_dev, struct vgic_io_device, dev); if (iodev->iodev_type != IODEV_ITS) - return NULL; + return ERR_PTR(-EINVAL); - return iodev; + return iodev->its; +} + +/* + * Find the target VCPU and the LPI number for a given devid/eventid pair + * and make this IRQ pending, possibly injecting it. + * Must be called with the its_lock mutex held. + * Returns 0 on success, a positive error value for any ITS mapping + * related errors and negative error values for generic errors. 
+ */ +static int vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid) +{ + struct vgic_irq *irq = NULL; + unsigned long flags; + int err; + + err = vgic_its_resolve_lpi(kvm, its, devid, eventid, &irq); + if (err) + return err; + + if (irq->hw) + return irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, true); + + spin_lock_irqsave(&irq->irq_lock, flags); + irq->pending_latch = true; + vgic_queue_irq_unlock(kvm, irq, flags); + + return 0; } /* @@ -560,30 +620,16 @@ static struct vgic_io_device *vgic_get_its_iodev(struct kvm_io_device *dev) */ int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi) { - u64 address; - struct kvm_io_device *kvm_io_dev; - struct vgic_io_device *iodev; + struct vgic_its *its; int ret; - if (!vgic_has_its(kvm)) - return -ENODEV; + its = vgic_msi_to_its(kvm, msi); + if (IS_ERR(its)) + return PTR_ERR(its); - if (!(msi->flags & KVM_MSI_VALID_DEVID)) - return -EINVAL; - - address = (u64)msi->address_hi << 32 | msi->address_lo; - - kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address); - if (!kvm_io_dev) - return -EINVAL; - - iodev = vgic_get_its_iodev(kvm_io_dev); - if (!iodev) - return -EINVAL; - - mutex_lock(&iodev->its->its_lock); - ret = vgic_its_trigger_msi(kvm, iodev->its, msi->devid, msi->data); - mutex_unlock(&iodev->its->its_lock); + mutex_lock(&its->its_lock); + ret = vgic_its_trigger_msi(kvm, its, msi->devid, msi->data); + mutex_unlock(&its->its_lock); if (ret < 0) return ret; @@ -605,8 +651,12 @@ static void its_free_ite(struct kvm *kvm, struct its_ite *ite) list_del(&ite->ite_list); /* This put matches the get in vgic_add_lpi. 
*/ - if (ite->irq) + if (ite->irq) { + if (ite->irq->hw) + WARN_ON(its_unmap_vlpi(ite->irq->host_irq)); + vgic_put_irq(kvm, ite->irq); + } kfree(ite); } @@ -680,11 +730,7 @@ static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its, ite->collection = collection; vcpu = kvm_get_vcpu(kvm, collection->target_addr); - spin_lock(&ite->irq->irq_lock); - ite->irq->target_vcpu = vcpu; - spin_unlock(&ite->irq->irq_lock); - - return 0; + return update_affinity(ite->irq, vcpu); } /* @@ -775,6 +821,8 @@ static int vgic_its_alloc_collection(struct vgic_its *its, return E_ITS_MAPC_COLLECTION_OOR; collection = kzalloc(sizeof(*collection), GFP_KERNEL); + if (!collection) + return -ENOMEM; collection->collection_id = coll_id; collection->target_addr = COLLECTION_NOT_MAPPED; @@ -894,7 +942,7 @@ static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its, } /* Requires the its_lock to be held. */ -static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device) +static void vgic_its_free_device(struct kvm *kvm, struct its_device *device) { struct its_ite *ite, *temp; @@ -910,6 +958,24 @@ static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device) kfree(device); } +/* its lock must be held */ +static void vgic_its_free_device_list(struct kvm *kvm, struct vgic_its *its) +{ + struct its_device *cur, *temp; + + list_for_each_entry_safe(cur, temp, &its->device_list, dev_list) + vgic_its_free_device(kvm, cur); +} + +/* its lock must be held */ +static void vgic_its_free_collection_list(struct kvm *kvm, struct vgic_its *its) +{ + struct its_collection *cur, *temp; + + list_for_each_entry_safe(cur, temp, &its->collection_list, coll_list) + vgic_its_free_collection(its, cur->collection_id); +} + /* Must be called with its_lock mutex held */ static struct its_device *vgic_its_alloc_device(struct vgic_its *its, u32 device_id, gpa_t itt_addr, @@ -957,7 +1023,7 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its 
*its, * by removing the mapping and re-establishing it. */ if (device) - vgic_its_unmap_device(kvm, device); + vgic_its_free_device(kvm, device); /* * The spec does not say whether unmapping a not-mapped device @@ -968,10 +1034,8 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, device = vgic_its_alloc_device(its, device_id, itt_addr, num_eventid_bits); - if (IS_ERR(device)) - return PTR_ERR(device); - return 0; + return PTR_ERR_OR_ZERO(device); } /* @@ -1033,6 +1097,10 @@ static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its, ite->irq->pending_latch = false; + if (ite->irq->hw) + return irq_set_irqchip_state(ite->irq->host_irq, + IRQCHIP_STATE_PENDING, false); + return 0; } @@ -1052,7 +1120,7 @@ static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its, if (!ite) return E_ITS_INV_UNMAPPED_INTERRUPT; - return update_lpi_config(kvm, ite->irq, NULL); + return update_lpi_config(kvm, ite->irq, NULL, true); } /* @@ -1087,12 +1155,15 @@ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, irq = vgic_get_irq(kvm, NULL, intids[i]); if (!irq) continue; - update_lpi_config(kvm, irq, vcpu); + update_lpi_config(kvm, irq, vcpu, false); vgic_put_irq(kvm, irq); } kfree(intids); + if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.its_vm) + its_invall_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe); + return 0; } @@ -1107,11 +1178,12 @@ static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its, static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, u64 *its_cmd) { - struct vgic_dist *dist = &kvm->arch.vgic; u32 target1_addr = its_cmd_get_target_addr(its_cmd); u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32); struct kvm_vcpu *vcpu1, *vcpu2; struct vgic_irq *irq; + u32 *intids; + int irq_count, i; if (target1_addr >= atomic_read(&kvm->online_vcpus) || target2_addr >= atomic_read(&kvm->online_vcpus)) @@ -1123,19 +1195,19 @@ static int 
vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its, vcpu1 = kvm_get_vcpu(kvm, target1_addr); vcpu2 = kvm_get_vcpu(kvm, target2_addr); - spin_lock(&dist->lpi_list_lock); + irq_count = vgic_copy_lpi_list(vcpu1, &intids); + if (irq_count < 0) + return irq_count; - list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { - spin_lock(&irq->irq_lock); + for (i = 0; i < irq_count; i++) { + irq = vgic_get_irq(kvm, NULL, intids[i]); - if (irq->target_vcpu == vcpu1) - irq->target_vcpu = vcpu2; + update_affinity(irq, vcpu2); - spin_unlock(&irq->irq_lock); + vgic_put_irq(kvm, irq); } - spin_unlock(&dist->lpi_list_lock); - + kfree(intids); return 0; } @@ -1410,7 +1482,7 @@ static void vgic_mmio_write_its_baser(struct kvm *kvm, unsigned long val) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); - u64 entry_size, device_type; + u64 entry_size, table_type; u64 reg, *regptr, clearbits = 0; /* When GITS_CTLR.Enable is 1, we ignore write accesses. */ @@ -1421,12 +1493,12 @@ static void vgic_mmio_write_its_baser(struct kvm *kvm, case 0: regptr = &its->baser_device_table; entry_size = abi->dte_esz; - device_type = GITS_BASER_TYPE_DEVICE; + table_type = GITS_BASER_TYPE_DEVICE; break; case 1: regptr = &its->baser_coll_table; entry_size = abi->cte_esz; - device_type = GITS_BASER_TYPE_COLLECTION; + table_type = GITS_BASER_TYPE_COLLECTION; clearbits = GITS_BASER_INDIRECT; break; default: @@ -1438,10 +1510,24 @@ static void vgic_mmio_write_its_baser(struct kvm *kvm, reg &= ~clearbits; reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT; - reg |= device_type << GITS_BASER_TYPE_SHIFT; + reg |= table_type << GITS_BASER_TYPE_SHIFT; reg = vgic_sanitise_its_baser(reg); *regptr = reg; + + if (!(reg & GITS_BASER_VALID)) { + /* Take the its_lock to prevent a race with a save/restore */ + mutex_lock(&its->its_lock); + switch (table_type) { + case GITS_BASER_TYPE_DEVICE: + vgic_its_free_device_list(kvm, its); + break; + case GITS_BASER_TYPE_COLLECTION: + 
vgic_its_free_collection_list(kvm, its); + break; + } + mutex_unlock(&its->its_lock); + } } static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu, @@ -1599,6 +1685,14 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) if (!its) return -ENOMEM; + if (vgic_initialized(dev->kvm)) { + int ret = vgic_v4_init(dev->kvm); + if (ret < 0) { + kfree(its); + return ret; + } + } + mutex_init(&its->its_lock); mutex_init(&its->cmd_lock); @@ -1623,46 +1717,17 @@ static int vgic_its_create(struct kvm_device *dev, u32 type) return vgic_its_set_abi(its, NR_ITS_ABIS - 1); } -static void vgic_its_free_device(struct kvm *kvm, struct its_device *dev) -{ - struct its_ite *ite, *tmp; - - list_for_each_entry_safe(ite, tmp, &dev->itt_head, ite_list) - its_free_ite(kvm, ite); - list_del(&dev->dev_list); - kfree(dev); -} - static void vgic_its_destroy(struct kvm_device *kvm_dev) { struct kvm *kvm = kvm_dev->kvm; struct vgic_its *its = kvm_dev->private; - struct list_head *cur, *temp; - - /* - * We may end up here without the lists ever having been initialized. - * Check this and bail out early to avoid dereferencing a NULL pointer. 
- */ - if (!its->device_list.next) - return; mutex_lock(&its->its_lock); - list_for_each_safe(cur, temp, &its->device_list) { - struct its_device *dev; - - dev = list_entry(cur, struct its_device, dev_list); - vgic_its_free_device(kvm, dev); - } - list_for_each_safe(cur, temp, &its->collection_list) { - struct its_collection *coll; + vgic_its_free_device_list(kvm, its); + vgic_its_free_collection_list(kvm, its); - coll = list_entry(cur, struct its_collection, coll_list); - list_del(cur); - kfree(coll); - } mutex_unlock(&its->its_lock); - kfree(its); } @@ -1940,6 +2005,15 @@ static int vgic_its_save_itt(struct vgic_its *its, struct its_device *device) list_for_each_entry(ite, &device->itt_head, ite_list) { gpa_t gpa = base + ite->event_id * ite_esz; + /* + * If an LPI carries the HW bit, this means that this + * interrupt is controlled by GICv4, and we do not + * have direct access to that state. Let's simply fail + * the save operation... + */ + if (ite->irq->hw) + return -EACCES; + ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz); if (ret) return ret; @@ -2290,29 +2364,13 @@ static int vgic_its_restore_collection_table(struct vgic_its *its) */ static int vgic_its_save_tables_v0(struct vgic_its *its) { - struct kvm *kvm = its->dev->kvm; int ret; - mutex_lock(&kvm->lock); - mutex_lock(&its->its_lock); - - if (!lock_all_vcpus(kvm)) { - mutex_unlock(&its->its_lock); - mutex_unlock(&kvm->lock); - return -EBUSY; - } - ret = vgic_its_save_device_tables(its); if (ret) - goto out; - - ret = vgic_its_save_collection_table(its); + return ret; -out: - unlock_all_vcpus(kvm); - mutex_unlock(&its->its_lock); - mutex_unlock(&kvm->lock); - return ret; + return vgic_its_save_collection_table(its); } /** @@ -2322,29 +2380,13 @@ out: */ static int vgic_its_restore_tables_v0(struct vgic_its *its) { - struct kvm *kvm = its->dev->kvm; int ret; - mutex_lock(&kvm->lock); - mutex_lock(&its->its_lock); - - if (!lock_all_vcpus(kvm)) { - mutex_unlock(&its->its_lock); - 
mutex_unlock(&kvm->lock); - return -EBUSY; - } - ret = vgic_its_restore_collection_table(its); if (ret) - goto out; - - ret = vgic_its_restore_device_tables(its); -out: - unlock_all_vcpus(kvm); - mutex_unlock(&its->its_lock); - mutex_unlock(&kvm->lock); + return ret; - return ret; + return vgic_its_restore_device_tables(its); } static int vgic_its_commit_v0(struct vgic_its *its) @@ -2363,6 +2405,19 @@ static int vgic_its_commit_v0(struct vgic_its *its) return 0; } +static void vgic_its_reset(struct kvm *kvm, struct vgic_its *its) +{ + /* We need to keep the ABI specific field values */ + its->baser_coll_table &= ~GITS_BASER_VALID; + its->baser_device_table &= ~GITS_BASER_VALID; + its->cbaser = 0; + its->creadr = 0; + its->cwriter = 0; + its->enabled = 0; + vgic_its_free_device_list(kvm, its); + vgic_its_free_collection_list(kvm, its); +} + static int vgic_its_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { @@ -2377,6 +2432,8 @@ static int vgic_its_has_attr(struct kvm_device *dev, switch (attr->attr) { case KVM_DEV_ARM_VGIC_CTRL_INIT: return 0; + case KVM_DEV_ARM_ITS_CTRL_RESET: + return 0; case KVM_DEV_ARM_ITS_SAVE_TABLES: return 0; case KVM_DEV_ARM_ITS_RESTORE_TABLES: @@ -2389,6 +2446,41 @@ static int vgic_its_has_attr(struct kvm_device *dev, return -ENXIO; } +static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) +{ + const struct vgic_its_abi *abi = vgic_its_get_abi(its); + int ret = 0; + + if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */ + return 0; + + mutex_lock(&kvm->lock); + mutex_lock(&its->its_lock); + + if (!lock_all_vcpus(kvm)) { + mutex_unlock(&its->its_lock); + mutex_unlock(&kvm->lock); + return -EBUSY; + } + + switch (attr) { + case KVM_DEV_ARM_ITS_CTRL_RESET: + vgic_its_reset(kvm, its); + break; + case KVM_DEV_ARM_ITS_SAVE_TABLES: + ret = abi->save_tables(its); + break; + case KVM_DEV_ARM_ITS_RESTORE_TABLES: + ret = abi->restore_tables(its); + break; + } + + unlock_all_vcpus(kvm); + 
mutex_unlock(&its->its_lock); + mutex_unlock(&kvm->lock); + return ret; +} + static int vgic_its_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { @@ -2414,19 +2506,8 @@ static int vgic_its_set_attr(struct kvm_device *dev, return vgic_register_its_iodev(dev->kvm, its, addr); } - case KVM_DEV_ARM_VGIC_GRP_CTRL: { - const struct vgic_its_abi *abi = vgic_its_get_abi(its); - - switch (attr->attr) { - case KVM_DEV_ARM_VGIC_CTRL_INIT: - /* Nothing to do */ - return 0; - case KVM_DEV_ARM_ITS_SAVE_TABLES: - return abi->save_tables(its); - case KVM_DEV_ARM_ITS_RESTORE_TABLES: - return abi->restore_tables(its); - } - } + case KVM_DEV_ARM_VGIC_GRP_CTRL: + return vgic_its_ctrl(dev->kvm, its, attr->attr); case KVM_DEV_ARM_VGIC_GRP_ITS_REGS: { u64 __user *uaddr = (u64 __user *)(long)attr->addr; u64 reg; diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c index b3d4a10f09a1..e21e2f49b005 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v2.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c @@ -74,6 +74,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, int mode = (val >> 24) & 0x03; int c; struct kvm_vcpu *vcpu; + unsigned long flags; switch (mode) { case 0x0: /* as specified by targets */ @@ -97,11 +98,11 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu, irq = vgic_get_irq(source_vcpu->kvm, vcpu, intid); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); irq->pending_latch = true; irq->source |= 1U << source_vcpu->vcpu_id; - vgic_queue_irq_unlock(source_vcpu->kvm, irq); + vgic_queue_irq_unlock(source_vcpu->kvm, irq, flags); vgic_put_irq(source_vcpu->kvm, irq); } } @@ -131,6 +132,7 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu, u32 intid = VGIC_ADDR_TO_INTID(addr, 8); u8 cpu_mask = GENMASK(atomic_read(&vcpu->kvm->online_vcpus) - 1, 0); int i; + unsigned long flags; /* GICD_ITARGETSR[0-7] are read-only */ if (intid < VGIC_NR_PRIVATE_IRQS) @@ -140,13 +142,13 @@ static void 
vgic_mmio_write_target(struct kvm_vcpu *vcpu, struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid + i); int target; - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); irq->targets = (val >> (i * 8)) & cpu_mask; target = irq->targets ? __ffs(irq->targets) : 0; irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target); - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } @@ -174,17 +176,18 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu, { u32 intid = addr & 0x0f; int i; + unsigned long flags; for (i = 0; i < len; i++) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); irq->source &= ~((val >> (i * 8)) & 0xff); if (!irq->source) irq->pending_latch = false; - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } @@ -195,19 +198,20 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu, { u32 intid = addr & 0x0f; int i; + unsigned long flags; for (i = 0; i < len; i++) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); irq->source |= (val >> (i * 8)) & 0xff; if (irq->source) { irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq); + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); } else { - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); } vgic_put_irq(vcpu->kvm, irq); } diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c index 408ef06638fc..671fe81f8e1d 100644 --- a/virt/kvm/arm/vgic/vgic-mmio-v3.c +++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c @@ -54,6 +54,11 @@ bool vgic_has_its(struct kvm *kvm) return dist->has_its; } +bool vgic_supports_direct_msis(struct kvm *kvm) +{ + return kvm_vgic_global_state.has_gicv4 && vgic_has_its(kvm); +} + static unsigned long 
vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len) { @@ -129,6 +134,7 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, { int intid = VGIC_ADDR_TO_INTID(addr, 64); struct vgic_irq *irq; + unsigned long flags; /* The upper word is WI for us since we don't implement Aff3. */ if (addr & 4) @@ -139,13 +145,13 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu, if (!irq) return; - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); /* We only care about and preserve Aff0, Aff1 and Aff2. */ irq->mpidr = val & GENMASK(23, 0); irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr); - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } @@ -241,11 +247,12 @@ static void vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; + unsigned long flags; for (i = 0; i < len * 8; i++) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); if (test_bit(i, &val)) { /* * pending_latch is set irrespective of irq type @@ -253,10 +260,10 @@ static void vgic_v3_uaccess_write_pending(struct kvm_vcpu *vcpu, * restore irq config before pending info. 
*/ irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq); + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); } else { irq->pending_latch = false; - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); } vgic_put_irq(vcpu->kvm, irq); @@ -799,6 +806,7 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) int sgi, c; int vcpu_id = vcpu->vcpu_id; bool broadcast; + unsigned long flags; sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT; broadcast = reg & BIT_ULL(ICC_SGI1R_IRQ_ROUTING_MODE_BIT); @@ -837,10 +845,10 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg) irq = vgic_get_irq(vcpu->kvm, c_vcpu, sgi); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); irq->pending_latch = true; - vgic_queue_irq_unlock(vcpu->kvm, irq); + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } } diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c index c1e4bdd66131..83d82bd7dc4e 100644 --- a/virt/kvm/arm/vgic/vgic-mmio.c +++ b/virt/kvm/arm/vgic/vgic-mmio.c @@ -16,6 +16,7 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> #include <kvm/iodev.h> +#include <kvm/arm_arch_timer.h> #include <kvm/arm_vgic.h> #include "vgic.h" @@ -69,13 +70,14 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu, { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; + unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); irq->enabled = true; - vgic_queue_irq_unlock(vcpu->kvm, irq); + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } @@ -87,15 +89,16 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu, { u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; + unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock(&irq->irq_lock); + 
spin_lock_irqsave(&irq->irq_lock, flags); irq->enabled = false; - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } @@ -120,39 +123,105 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, return value; } +/* + * This function will return the VCPU that performed the MMIO access and + * trapped from within the VM, and will return NULL if this is a userspace + * access. + * + * We can disable preemption locally around accessing the per-CPU variable, + * and use the resolved vcpu pointer after enabling preemption again, because + * even if the current thread is migrated to another CPU, reading the per-CPU + * value later will give us the same value as we update the per-CPU variable + * in the preempt notifier handlers. + */ +static struct kvm_vcpu *vgic_get_mmio_requester_vcpu(void) +{ + struct kvm_vcpu *vcpu; + + preempt_disable(); + vcpu = kvm_arm_get_running_vcpu(); + preempt_enable(); + return vcpu; +} + +/* Must be called with irq->irq_lock held */ +static void vgic_hw_irq_spending(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + bool is_uaccess) +{ + if (is_uaccess) + return; + + irq->pending_latch = true; + vgic_irq_set_phys_active(irq, true); +} + void vgic_mmio_write_spending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { + bool is_uaccess = !vgic_get_mmio_requester_vcpu(); u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; + unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock(&irq->irq_lock); - irq->pending_latch = true; - - vgic_queue_irq_unlock(vcpu->kvm, irq); + spin_lock_irqsave(&irq->irq_lock, flags); + if (irq->hw) + vgic_hw_irq_spending(vcpu, irq, is_uaccess); + else + irq->pending_latch = true; + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); vgic_put_irq(vcpu->kvm, irq); } } +/* Must be called with irq->irq_lock held */ +static void vgic_hw_irq_cpending(struct kvm_vcpu *vcpu, 
struct vgic_irq *irq, + bool is_uaccess) +{ + if (is_uaccess) + return; + + irq->pending_latch = false; + + /* + * We don't want the guest to effectively mask the physical + * interrupt by doing a write to SPENDR followed by a write to + * CPENDR for HW interrupts, so we clear the active state on + * the physical side if the virtual interrupt is not active. + * This may lead to taking an additional interrupt on the + * host, but that should not be a problem as the worst that + * can happen is an additional vgic injection. We also clear + * the pending state to maintain proper semantics for edge HW + * interrupts. + */ + vgic_irq_set_phys_pending(irq, false); + if (!irq->active) + vgic_irq_set_phys_active(irq, false); +} + void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len, unsigned long val) { + bool is_uaccess = !vgic_get_mmio_requester_vcpu(); u32 intid = VGIC_ADDR_TO_INTID(addr, 1); int i; + unsigned long flags; for_each_set_bit(i, &val, len * 8) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); - irq->pending_latch = false; + if (irq->hw) + vgic_hw_irq_cpending(vcpu, irq, is_uaccess); + else + irq->pending_latch = false; - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } @@ -177,26 +246,24 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu, return value; } +/* Must be called with irq->irq_lock held */ +static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + bool active, bool is_uaccess) +{ + if (is_uaccess) + return; + + irq->active = active; + vgic_irq_set_phys_active(irq, active); +} + static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, - bool new_active_state) + bool active) { - struct kvm_vcpu *requester_vcpu; - spin_lock(&irq->irq_lock); + unsigned long flags; + struct kvm_vcpu *requester_vcpu = 
vgic_get_mmio_requester_vcpu(); - /* - * The vcpu parameter here can mean multiple things depending on how - * this function is called; when handling a trap from the kernel it - * depends on the GIC version, and these functions are also called as - * part of save/restore from userspace. - * - * Therefore, we have to figure out the requester in a reliable way. - * - * When accessing VGIC state from user space, the requester_vcpu is - * NULL, which is fine, because we guarantee that no VCPUs are running - * when accessing VGIC state from user space so irq->vcpu->cpu is - * always -1. - */ - requester_vcpu = kvm_arm_get_running_vcpu(); + spin_lock_irqsave(&irq->irq_lock, flags); /* * If this virtual IRQ was written into a list register, we @@ -208,17 +275,26 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq, * vgic_change_active_prepare) and still has to sync back this IRQ, * so we release and re-acquire the spin_lock to let the other thread * sync back the IRQ. + * + * When accessing VGIC state from user space, requester_vcpu is + * NULL, which is fine, because we guarantee that no VCPUs are running + * when accessing VGIC state from user space so irq->vcpu->cpu is + * always -1. 
*/ while (irq->vcpu && /* IRQ may have state in an LR somewhere */ irq->vcpu != requester_vcpu && /* Current thread is not the VCPU thread */ irq->vcpu->cpu != -1) /* VCPU thread is running */ cond_resched_lock(&irq->irq_lock); - irq->active = new_active_state; - if (new_active_state) - vgic_queue_irq_unlock(vcpu->kvm, irq); + if (irq->hw) + vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu); + else + irq->active = active; + + if (irq->active) + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); else - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); } /* @@ -352,14 +428,15 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu, { u32 intid = VGIC_ADDR_TO_INTID(addr, 8); int i; + unsigned long flags; for (i = 0; i < len; i++) { struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); /* Narrow the priority range to what we actually support */ irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS); - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } @@ -390,6 +467,7 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, { u32 intid = VGIC_ADDR_TO_INTID(addr, 2); int i; + unsigned long flags; for (i = 0; i < len * 4; i++) { struct vgic_irq *irq; @@ -404,14 +482,14 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu, continue; irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); if (test_bit(i * 2 + 1, &val)) irq->config = VGIC_CONFIG_EDGE; else irq->config = VGIC_CONFIG_LEVEL; - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } } @@ -443,6 +521,7 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, { int i; int nr_irqs = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS; + unsigned long flags; for (i = 0; i < 32; i++) { struct vgic_irq 
*irq; @@ -459,12 +538,12 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid, * restore irq config before line level. */ new_level = !!(val & (1U << i)); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); irq->line_level = new_level; if (new_level) - vgic_queue_irq_unlock(vcpu->kvm, irq); + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); else - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index e4187e52bb26..c32d7b93ffd1 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -62,6 +62,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; int lr; + unsigned long flags; cpuif->vgic_hcr &= ~GICH_HCR_UIE; @@ -77,7 +78,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) irq = vgic_get_irq(vcpu->kvm, vcpu, intid); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); /* Always preserve the active bit */ irq->active = !!(val & GICH_LR_ACTIVE_BIT); @@ -104,7 +105,27 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) irq->pending_latch = false; } - spin_unlock(&irq->irq_lock); + /* + * Level-triggered mapped IRQs are special because we only + * observe rising edges as input to the VGIC. + * + * If the guest never acked the interrupt we have to sample + * the physical line and set the line level, because the + * device state could have changed or we simply need to + * process the still pending interrupt later. + * + * If this causes us to lower the level, we have to also clear + * the physical active state, since we will otherwise never be + * told when the interrupt becomes asserted again. 
+ */ + if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) { + irq->line_level = vgic_get_phys_line_level(irq); + + if (!irq->line_level) + vgic_irq_set_phys_active(irq, false); + } + + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } @@ -161,6 +182,15 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) val |= GICH_LR_EOI; } + /* + * Level-triggered mapped IRQs are special because we only observe + * rising edges as input to the VGIC. We therefore lower the line + * level here, so that we can take new virtual IRQs. See + * vgic_v2_fold_lr_state for more info. + */ + if (vgic_irq_is_mapped_level(irq) && (val & GICH_LR_PENDING_BIT)) + irq->line_level = false; + /* The GICv2 LR only holds five bits of priority. */ val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT; diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 96ea597db0e7..6b329414e57a 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -24,6 +24,7 @@ static bool group0_trap; static bool group1_trap; static bool common_trap; +static bool gicv4_enable; void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) { @@ -44,6 +45,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; u32 model = vcpu->kvm->arch.vgic.vgic_model; int lr; + unsigned long flags; cpuif->vgic_hcr &= ~ICH_HCR_UIE; @@ -66,7 +68,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) if (!irq) /* An LPI could have been unmapped. */ continue; - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); /* Always preserve the active bit */ irq->active = !!(val & ICH_LR_ACTIVE_BIT); @@ -94,7 +96,27 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) irq->pending_latch = false; } - spin_unlock(&irq->irq_lock); + /* + * Level-triggered mapped IRQs are special because we only + * observe rising edges as input to the VGIC. 
+ * + * If the guest never acked the interrupt we have to sample + * the physical line and set the line level, because the + * device state could have changed or we simply need to + * process the still pending interrupt later. + * + * If this causes us to lower the level, we have to also clear + * the physical active state, since we will otherwise never be + * told when the interrupt becomes asserted again. + */ + if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) { + irq->line_level = vgic_get_phys_line_level(irq); + + if (!irq->line_level) + vgic_irq_set_phys_active(irq, false); + } + + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); } @@ -144,6 +166,15 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) } /* + * Level-triggered mapped IRQs are special because we only observe + * rising edges as input to the VGIC. We therefore lower the line + * level here, so that we can take new virtual IRQs. See + * vgic_v3_fold_lr_state for more info. + */ + if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT)) + irq->line_level = false; + + /* * We currently only support Group1 interrupts, which is a * known defect. This needs to be addressed at some point. 
*/ @@ -278,6 +309,7 @@ int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) bool status; u8 val; int ret; + unsigned long flags; retry: vcpu = irq->target_vcpu; @@ -296,13 +328,13 @@ retry: status = val & (1 << bit_nr); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); if (irq->target_vcpu != vcpu) { - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); goto retry; } irq->pending_latch = status; - vgic_queue_irq_unlock(vcpu->kvm, irq); + vgic_queue_irq_unlock(vcpu->kvm, irq, flags); if (status) { /* clear consumed data */ @@ -324,13 +356,13 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) int last_byte_offset = -1; struct vgic_irq *irq; int ret; + u8 val; list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { int byte_offset, bit_nr; struct kvm_vcpu *vcpu; gpa_t pendbase, ptr; bool stored; - u8 val; vcpu = irq->target_vcpu; if (!vcpu) @@ -459,6 +491,12 @@ static int __init early_common_trap_cfg(char *buf) } early_param("kvm-arm.vgic_v3_common_trap", early_common_trap_cfg); +static int __init early_gicv4_enable(char *buf) +{ + return strtobool(buf, &gicv4_enable); +} +early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable); + /** * vgic_v3_probe - probe for a GICv3 compatible interrupt controller in DT * @node: pointer to the DT node @@ -478,6 +516,13 @@ int vgic_v3_probe(const struct gic_kvm_info *info) kvm_vgic_global_state.can_emulate_gicv2 = false; kvm_vgic_global_state.ich_vtr_el2 = ich_vtr_el2; + /* GICv4 support? */ + if (info->has_v4) { + kvm_vgic_global_state.has_gicv4 = gicv4_enable; + kvm_info("GICv4 support %sabled\n", + gicv4_enable ? 
"en" : "dis"); + } + if (!info->vcpu.start) { kvm_info("GICv3: no GICV resource entry\n"); kvm_vgic_global_state.vcpu_base = 0; diff --git a/virt/kvm/arm/vgic/vgic-v4.c b/virt/kvm/arm/vgic/vgic-v4.c new file mode 100644 index 000000000000..bc4265154bac --- /dev/null +++ b/virt/kvm/arm/vgic/vgic-v4.c @@ -0,0 +1,366 @@ +/* + * Copyright (C) 2017 ARM Ltd. + * Author: Marc Zyngier <marc.zyngier@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/irqdomain.h> +#include <linux/kvm_host.h> +#include <linux/irqchip/arm-gic-v3.h> + +#include "vgic.h" + +/* + * How KVM uses GICv4 (insert rude comments here): + * + * The vgic-v4 layer acts as a bridge between several entities: + * - The GICv4 ITS representation offered by the ITS driver + * - VFIO, which is in charge of the PCI endpoint + * - The virtual ITS, which is the only thing the guest sees + * + * The configuration of VLPIs is triggered by a callback from VFIO, + * instructing KVM that a PCI device has been configured to deliver + * MSIs to a vITS. + * + * kvm_vgic_v4_set_forwarding() is thus called with the routing entry, + * and this is used to find the corresponding vITS data structures + * (ITS instance, device, event and irq) using a process that is + * extremely similar to the injection of an MSI. 
+ * + * At this stage, we can link the guest's view of an LPI (uniquely + * identified by the routing entry) and the host irq, using the GICv4 + * driver mapping operation. Should the mapping succeed, we've then + * successfully upgraded the guest's LPI to a VLPI. We can then start + * with updating GICv4's view of the property table and generating an + * INValidation in order to kickstart the delivery of this VLPI to the + * guest directly, without software intervention. Well, almost. + * + * When the PCI endpoint is deconfigured, this operation is reversed + * with VFIO calling kvm_vgic_v4_unset_forwarding(). + * + * Once the VLPI has been mapped, it needs to follow any change the + * guest performs on its LPI through the vITS. For that, a number of + * command handlers have hooks to communicate these changes to the HW: + * - Any invalidation triggers a call to its_prop_update_vlpi() + * - The INT command results in an irq_set_irqchip_state(), which + * generates an INT on the corresponding VLPI. + * - The CLEAR command results in an irq_set_irqchip_state(), which + * generates a CLEAR on the corresponding VLPI. + * - DISCARD translates into an unmap, similar to a call to + * kvm_vgic_v4_unset_forwarding(). + * - MOVI is translated by an update of the existing mapping, changing + * the target vcpu, resulting in a VMOVI being generated. + * - MOVALL is translated by a string of mapping updates (similar to + * the handling of MOVI). MOVALL is horrible. + * + * Note that a DISCARD/MAPTI sequence emitted from the guest without + * reprogramming the PCI endpoint after MAPTI does not result in a + * VLPI being mapped, as there is no callback from VFIO (the guest + * will get the interrupt via the normal SW injection). Fixing this is + * not trivial, and requires some horrible messing with the VFIO + * internals. Not fun. Don't do that. + * + * Then there is the scheduling. 
Each time a vcpu is about to run on a + * physical CPU, KVM must tell the corresponding redistributor about + * it. And if we've migrated our vcpu from one CPU to another, we must + * tell the ITS (so that the messages reach the right redistributor). + * This is done in two steps: first issue a irq_set_affinity() on the + * irq corresponding to the vcpu, then call its_schedule_vpe(). You + * must be in a non-preemptible context. On exit, another call to + * its_schedule_vpe() tells the redistributor that we're done with the + * vcpu. + * + * Finally, the doorbell handling: Each vcpu is allocated an interrupt + * which will fire each time a VLPI is made pending whilst the vcpu is + * not running. Each time the vcpu gets blocked, the doorbell + * interrupt gets enabled. When the vcpu is unblocked (for whatever + * reason), the doorbell interrupt is disabled. + */ + +#define DB_IRQ_FLAGS (IRQ_NOAUTOEN | IRQ_DISABLE_UNLAZY | IRQ_NO_BALANCING) + +static irqreturn_t vgic_v4_doorbell_handler(int irq, void *info) +{ + struct kvm_vcpu *vcpu = info; + + vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last = true; + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); + kvm_vcpu_kick(vcpu); + + return IRQ_HANDLED; +} + +/** + * vgic_v4_init - Initialize the GICv4 data structures + * @kvm: Pointer to the VM being initialized + * + * We may be called each time a vITS is created, or when the + * vgic is initialized. This relies on kvm->lock to be + * held. In both cases, the number of vcpus should now be + * fixed. + */ +int vgic_v4_init(struct kvm *kvm) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + struct kvm_vcpu *vcpu; + int i, nr_vcpus, ret; + + if (!kvm_vgic_global_state.has_gicv4) + return 0; /* Nothing to see here... move along. 
*/ + + if (dist->its_vm.vpes) + return 0; + + nr_vcpus = atomic_read(&kvm->online_vcpus); + + dist->its_vm.vpes = kzalloc(sizeof(*dist->its_vm.vpes) * nr_vcpus, + GFP_KERNEL); + if (!dist->its_vm.vpes) + return -ENOMEM; + + dist->its_vm.nr_vpes = nr_vcpus; + + kvm_for_each_vcpu(i, vcpu, kvm) + dist->its_vm.vpes[i] = &vcpu->arch.vgic_cpu.vgic_v3.its_vpe; + + ret = its_alloc_vcpu_irqs(&dist->its_vm); + if (ret < 0) { + kvm_err("VPE IRQ allocation failure\n"); + kfree(dist->its_vm.vpes); + dist->its_vm.nr_vpes = 0; + dist->its_vm.vpes = NULL; + return ret; + } + + kvm_for_each_vcpu(i, vcpu, kvm) { + int irq = dist->its_vm.vpes[i]->irq; + + /* + * Don't automatically enable the doorbell, as we're + * flipping it back and forth when the vcpu gets + * blocked. Also disable the lazy disabling, as the + * doorbell could kick us out of the guest too + * early... + */ + irq_set_status_flags(irq, DB_IRQ_FLAGS); + ret = request_irq(irq, vgic_v4_doorbell_handler, + 0, "vcpu", vcpu); + if (ret) { + kvm_err("failed to allocate vcpu IRQ%d\n", irq); + /* + * Trick: adjust the number of vpes so we know + * how many to nuke on teardown... + */ + dist->its_vm.nr_vpes = i; + break; + } + } + + if (ret) + vgic_v4_teardown(kvm); + + return ret; +} + +/** + * vgic_v4_teardown - Free the GICv4 data structures + * @kvm: Pointer to the VM being destroyed + * + * Relies on kvm->lock to be held. 
+ */ +void vgic_v4_teardown(struct kvm *kvm) +{ + struct its_vm *its_vm = &kvm->arch.vgic.its_vm; + int i; + + if (!its_vm->vpes) + return; + + for (i = 0; i < its_vm->nr_vpes; i++) { + struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, i); + int irq = its_vm->vpes[i]->irq; + + irq_clear_status_flags(irq, DB_IRQ_FLAGS); + free_irq(irq, vcpu); + } + + its_free_vcpu_irqs(its_vm); + kfree(its_vm->vpes); + its_vm->nr_vpes = 0; + its_vm->vpes = NULL; +} + +int vgic_v4_sync_hwstate(struct kvm_vcpu *vcpu) +{ + if (!vgic_supports_direct_msis(vcpu->kvm)) + return 0; + + return its_schedule_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe, false); +} + +int vgic_v4_flush_hwstate(struct kvm_vcpu *vcpu) +{ + int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq; + int err; + + if (!vgic_supports_direct_msis(vcpu->kvm)) + return 0; + + /* + * Before making the VPE resident, make sure the redistributor + * corresponding to our current CPU expects us here. See the + * doc in drivers/irqchip/irq-gic-v4.c to understand how this + * turns into a VMOVP command at the ITS level. + */ + err = irq_set_affinity(irq, cpumask_of(smp_processor_id())); + if (err) + return err; + + err = its_schedule_vpe(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe, true); + if (err) + return err; + + /* + * Now that the VPE is resident, let's get rid of a potential + * doorbell interrupt that would still be pending. 
+ */ + err = irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, false); + + return err; +} + +static struct vgic_its *vgic_get_its(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *irq_entry) +{ + struct kvm_msi msi = (struct kvm_msi) { + .address_lo = irq_entry->msi.address_lo, + .address_hi = irq_entry->msi.address_hi, + .data = irq_entry->msi.data, + .flags = irq_entry->msi.flags, + .devid = irq_entry->msi.devid, + }; + + return vgic_msi_to_its(kvm, &msi); +} + +int kvm_vgic_v4_set_forwarding(struct kvm *kvm, int virq, + struct kvm_kernel_irq_routing_entry *irq_entry) +{ + struct vgic_its *its; + struct vgic_irq *irq; + struct its_vlpi_map map; + int ret; + + if (!vgic_supports_direct_msis(kvm)) + return 0; + + /* + * Get the ITS, and escape early on error (not a valid + * doorbell for any of our vITSs). + */ + its = vgic_get_its(kvm, irq_entry); + if (IS_ERR(its)) + return 0; + + mutex_lock(&its->its_lock); + + /* Perform then actual DevID/EventID -> LPI translation. */ + ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, + irq_entry->msi.data, &irq); + if (ret) + goto out; + + /* + * Emit the mapping request. If it fails, the ITS probably + * isn't v4 compatible, so let's silently bail out. Holding + * the ITS lock should ensure that nothing can modify the + * target vcpu. + */ + map = (struct its_vlpi_map) { + .vm = &kvm->arch.vgic.its_vm, + .vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe, + .vintid = irq->intid, + .properties = ((irq->priority & 0xfc) | + (irq->enabled ? 
LPI_PROP_ENABLED : 0) | + LPI_PROP_GROUP1), + .db_enabled = true, + }; + + ret = its_map_vlpi(virq, &map); + if (ret) + goto out; + + irq->hw = true; + irq->host_irq = virq; + +out: + mutex_unlock(&its->its_lock); + return ret; +} + +int kvm_vgic_v4_unset_forwarding(struct kvm *kvm, int virq, + struct kvm_kernel_irq_routing_entry *irq_entry) +{ + struct vgic_its *its; + struct vgic_irq *irq; + int ret; + + if (!vgic_supports_direct_msis(kvm)) + return 0; + + /* + * Get the ITS, and escape early on error (not a valid + * doorbell for any of our vITSs). + */ + its = vgic_get_its(kvm, irq_entry); + if (IS_ERR(its)) + return 0; + + mutex_lock(&its->its_lock); + + ret = vgic_its_resolve_lpi(kvm, its, irq_entry->msi.devid, + irq_entry->msi.data, &irq); + if (ret) + goto out; + + WARN_ON(!(irq->hw && irq->host_irq == virq)); + if (irq->hw) { + irq->hw = false; + ret = its_unmap_vlpi(virq); + } + +out: + mutex_unlock(&its->its_lock); + return ret; +} + +void kvm_vgic_v4_enable_doorbell(struct kvm_vcpu *vcpu) +{ + if (vgic_supports_direct_msis(vcpu->kvm)) { + int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq; + if (irq) + enable_irq(irq); + } +} + +void kvm_vgic_v4_disable_doorbell(struct kvm_vcpu *vcpu) +{ + if (vgic_supports_direct_msis(vcpu->kvm)) { + int irq = vcpu->arch.vgic_cpu.vgic_v3.its_vpe.irq; + if (irq) + disable_irq(irq); + } +} diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index fed717e07938..c7c5ef190afa 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -17,6 +17,8 @@ #include <linux/kvm.h> #include <linux/kvm_host.h> #include <linux/list_sort.h> +#include <linux/interrupt.h> +#include <linux/irq.h> #include "vgic.h" @@ -53,6 +55,10 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = { * vcpuX->vcpu_id < vcpuY->vcpu_id: * spin_lock(vcpuX->arch.vgic_cpu.ap_list_lock); * spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock); + * + * Since the VGIC must support injecting virtual interrupts from ISRs, we have + * to use 
the spin_lock_irqsave/spin_unlock_irqrestore versions of outer + * spinlocks for any lock that may be taken while injecting an interrupt. */ /* @@ -138,6 +144,38 @@ void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq) kfree(irq); } +void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending) +{ + WARN_ON(irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + pending)); +} + +bool vgic_get_phys_line_level(struct vgic_irq *irq) +{ + bool line_level; + + BUG_ON(!irq->hw); + + if (irq->get_input_level) + return irq->get_input_level(irq->intid); + + WARN_ON(irq_get_irqchip_state(irq->host_irq, + IRQCHIP_STATE_PENDING, + &line_level)); + return line_level; +} + +/* Set/Clear the physical active state */ +void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active) +{ + + BUG_ON(!irq->hw); + WARN_ON(irq_set_irqchip_state(irq->host_irq, + IRQCHIP_STATE_ACTIVE, + active)); +} + /** * kvm_vgic_target_oracle - compute the target vcpu for an irq * @@ -261,7 +299,8 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne * Needs to be entered with the IRQ lock already held, but will return * with all locks dropped. */ -bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq) +bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, + unsigned long flags) { struct kvm_vcpu *vcpu; @@ -279,7 +318,7 @@ retry: * not need to be inserted into an ap_list and there is also * no more work for us to do. */ - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); /* * We have to kick the VCPU here, because we could be @@ -301,11 +340,11 @@ retry: * We must unlock the irq lock to take the ap_list_lock where * we are going to insert this new pending interrupt. 
*/ - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); /* someone can do stuff here, which we re-check below */ - spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); + spin_lock_irqsave(&vcpu->arch.vgic_cpu.ap_list_lock, flags); spin_lock(&irq->irq_lock); /* @@ -322,9 +361,9 @@ retry: if (unlikely(irq->vcpu || vcpu != vgic_target_oracle(irq))) { spin_unlock(&irq->irq_lock); - spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); + spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); goto retry; } @@ -337,7 +376,7 @@ retry: irq->vcpu = vcpu; spin_unlock(&irq->irq_lock); - spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); + spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags); kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu); kvm_vcpu_kick(vcpu); @@ -367,6 +406,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, { struct kvm_vcpu *vcpu; struct vgic_irq *irq; + unsigned long flags; int ret; trace_vgic_update_irq_pending(cpuid, intid, level); @@ -383,11 +423,11 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, if (!irq) return -EINVAL; - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); if (!vgic_validate_injection(irq, level, owner)) { /* Nothing to see here, move along... 
*/ - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(kvm, irq); return 0; } @@ -397,45 +437,78 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid, else irq->pending_latch = true; - vgic_queue_irq_unlock(kvm, irq); + vgic_queue_irq_unlock(kvm, irq, flags); vgic_put_irq(kvm, irq); return 0; } -int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq) +/* @irq->irq_lock must be held */ +static int kvm_vgic_map_irq(struct kvm_vcpu *vcpu, struct vgic_irq *irq, + unsigned int host_irq, + bool (*get_input_level)(int vindid)) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); - - BUG_ON(!irq); + struct irq_desc *desc; + struct irq_data *data; - spin_lock(&irq->irq_lock); + /* + * Find the physical IRQ number corresponding to @host_irq + */ + desc = irq_to_desc(host_irq); + if (!desc) { + kvm_err("%s: no interrupt descriptor\n", __func__); + return -EINVAL; + } + data = irq_desc_get_irq_data(desc); + while (data->parent_data) + data = data->parent_data; irq->hw = true; - irq->hwintid = phys_irq; + irq->host_irq = host_irq; + irq->hwintid = data->hwirq; + irq->get_input_level = get_input_level; + return 0; +} - spin_unlock(&irq->irq_lock); +/* @irq->irq_lock must be held */ +static inline void kvm_vgic_unmap_irq(struct vgic_irq *irq) +{ + irq->hw = false; + irq->hwintid = 0; + irq->get_input_level = NULL; +} + +int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, unsigned int host_irq, + u32 vintid, bool (*get_input_level)(int vindid)) +{ + struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + unsigned long flags; + int ret; + + BUG_ON(!irq); + + spin_lock_irqsave(&irq->irq_lock, flags); + ret = kvm_vgic_map_irq(vcpu, irq, host_irq, get_input_level); + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); - return 0; + return ret; } -int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq) +int kvm_vgic_unmap_phys_irq(struct 
kvm_vcpu *vcpu, unsigned int vintid) { struct vgic_irq *irq; + unsigned long flags; if (!vgic_initialized(vcpu->kvm)) return -EAGAIN; - irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); + irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); BUG_ON(!irq); - spin_lock(&irq->irq_lock); - - irq->hw = false; - irq->hwintid = 0; - - spin_unlock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); + kvm_vgic_unmap_irq(irq); + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); return 0; @@ -454,6 +527,7 @@ int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq) int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner) { struct vgic_irq *irq; + unsigned long flags; int ret = 0; if (!vgic_initialized(vcpu->kvm)) @@ -464,12 +538,12 @@ int kvm_vgic_set_owner(struct kvm_vcpu *vcpu, unsigned int intid, void *owner) return -EINVAL; irq = vgic_get_irq(vcpu->kvm, vcpu, intid); - spin_lock(&irq->irq_lock); + spin_lock_irqsave(&irq->irq_lock, flags); if (irq->owner && irq->owner != owner) ret = -EEXIST; else irq->owner = owner; - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); return ret; } @@ -486,9 +560,10 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq, *tmp; + unsigned long flags; retry: - spin_lock(&vgic_cpu->ap_list_lock); + spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); list_for_each_entry_safe(irq, tmp, &vgic_cpu->ap_list_head, ap_list) { struct kvm_vcpu *target_vcpu, *vcpuA, *vcpuB; @@ -528,7 +603,7 @@ retry: /* This interrupt looks like it has to be migrated. 
*/ spin_unlock(&irq->irq_lock); - spin_unlock(&vgic_cpu->ap_list_lock); + spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); /* * Ensure locking order by always locking the smallest @@ -542,7 +617,7 @@ retry: vcpuB = vcpu; } - spin_lock(&vcpuA->arch.vgic_cpu.ap_list_lock); + spin_lock_irqsave(&vcpuA->arch.vgic_cpu.ap_list_lock, flags); spin_lock_nested(&vcpuB->arch.vgic_cpu.ap_list_lock, SINGLE_DEPTH_NESTING); spin_lock(&irq->irq_lock); @@ -566,11 +641,11 @@ retry: spin_unlock(&irq->irq_lock); spin_unlock(&vcpuB->arch.vgic_cpu.ap_list_lock); - spin_unlock(&vcpuA->arch.vgic_cpu.ap_list_lock); + spin_unlock_irqrestore(&vcpuA->arch.vgic_cpu.ap_list_lock, flags); goto retry; } - spin_unlock(&vgic_cpu->ap_list_lock); + spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); } static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) @@ -679,6 +754,8 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + WARN_ON(vgic_v4_sync_hwstate(vcpu)); + /* An empty ap_list_head implies used_lrs == 0 */ if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) return; @@ -691,6 +768,8 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) /* Flush our emulation state into the GIC hardware before entering the guest. 
*/ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) { + WARN_ON(vgic_v4_flush_hwstate(vcpu)); + /* * If there are no virtual interrupts active or pending for this * VCPU, then there is no work to do and we can bail out without @@ -703,6 +782,8 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) return; + DEBUG_SPINLOCK_BUG_ON(!irqs_disabled()); + spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); vgic_flush_lr_state(vcpu); spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); @@ -735,11 +816,15 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; struct vgic_irq *irq; bool pending = false; + unsigned long flags; if (!vcpu->kvm->arch.vgic.enabled) return false; - spin_lock(&vgic_cpu->ap_list_lock); + if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last) + return true; + + spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags); list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { spin_lock(&irq->irq_lock); @@ -750,7 +835,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) break; } - spin_unlock(&vgic_cpu->ap_list_lock); + spin_unlock_irqrestore(&vgic_cpu->ap_list_lock, flags); return pending; } @@ -772,14 +857,19 @@ void vgic_kick_vcpus(struct kvm *kvm) } } -bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq) +bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid) { - struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq); + struct vgic_irq *irq; bool map_is_active; + unsigned long flags; - spin_lock(&irq->irq_lock); + if (!vgic_initialized(vcpu->kvm)) + return false; + + irq = vgic_get_irq(vcpu->kvm, vcpu, vintid); + spin_lock_irqsave(&irq->irq_lock, flags); map_is_active = irq->hw && irq->active; - spin_unlock(&irq->irq_lock); + spin_unlock_irqrestore(&irq->irq_lock, flags); vgic_put_irq(vcpu->kvm, irq); return map_is_active; diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 
bf9ceab67c77..12c37b89f7a3 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -104,6 +104,11 @@ static inline bool irq_is_pending(struct vgic_irq *irq) return irq->pending_latch || irq->line_level; } +static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq) +{ + return irq->config == VGIC_CONFIG_LEVEL && irq->hw; +} + /* * This struct provides an intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC @@ -140,7 +145,11 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, u32 intid); void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq); -bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq); +bool vgic_get_phys_line_level(struct vgic_irq *irq); +void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending); +void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); +bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, + unsigned long flags); void vgic_kick_vcpus(struct kvm *kvm); int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, @@ -236,4 +245,14 @@ static inline int vgic_v3_max_apr_idx(struct kvm_vcpu *vcpu) } } +int vgic_its_resolve_lpi(struct kvm *kvm, struct vgic_its *its, + u32 devid, u32 eventid, struct vgic_irq **irq); +struct vgic_its *vgic_msi_to_its(struct kvm *kvm, struct kvm_msi *msi); + +bool vgic_supports_direct_msis(struct kvm *kvm); +int vgic_v4_init(struct kvm *kvm); +void vgic_v4_teardown(struct kvm *kvm); +int vgic_v4_sync_hwstate(struct kvm_vcpu *vcpu); +int vgic_v4_flush_hwstate(struct kvm_vcpu *vcpu); + #endif diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index f2ac53ab8243..6e865e8b5b10 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -188,13 +188,13 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { struct kvm_kernel_irqfd *irqfd = container_of(wait, 
struct kvm_kernel_irqfd, wait); - unsigned long flags = (unsigned long)key; + __poll_t flags = key_to_poll(key); struct kvm_kernel_irq_routing_entry irq; struct kvm *kvm = irqfd->kvm; unsigned seq; int idx; - if (flags & POLLIN) { + if (flags & EPOLLIN) { idx = srcu_read_lock(&kvm->irq_srcu); do { seq = read_seqcount_begin(&irqfd->irq_entry_sc); @@ -208,7 +208,7 @@ irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) srcu_read_unlock(&kvm->irq_srcu, idx); } - if (flags & POLLHUP) { + if (flags & EPOLLHUP) { /* The eventfd is closing, detach from KVM */ unsigned long flags; @@ -287,7 +287,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) struct fd f; struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL; int ret; - unsigned int events; + __poll_t events; int idx; if (!kvm_arch_intc_initialized(kvm)) @@ -399,12 +399,12 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) */ events = f.file->f_op->poll(f.file, &irqfd->pt); - if (events & POLLIN) + if (events & EPOLLIN) schedule_work(&irqfd->inject); /* * do not drop the file until the irqfd is fully initialized, otherwise - * we might race against the POLLHUP + * we might race against the EPOLLHUP */ fdput(f); #ifdef CONFIG_HAVE_KVM_IRQ_BYPASS diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9deb5a245b83..65dea3ffef68 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -122,7 +122,6 @@ static void hardware_disable_all(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); -static void kvm_release_pfn_dirty(kvm_pfn_t pfn); static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); __visible bool kvm_rebooting; @@ -136,6 +135,11 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); static unsigned long long kvm_createvm_count; static unsigned long long kvm_active_vms; +__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end) +{ +} + bool 
kvm_is_reserved_pfn(kvm_pfn_t pfn) { if (pfn_valid(pfn)) @@ -147,17 +151,12 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn) /* * Switches to specified vcpu, until a matching vcpu_put() */ -int vcpu_load(struct kvm_vcpu *vcpu) +void vcpu_load(struct kvm_vcpu *vcpu) { - int cpu; - - if (mutex_lock_killable(&vcpu->mutex)) - return -EINTR; - cpu = get_cpu(); + int cpu = get_cpu(); preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); - return 0; } EXPORT_SYMBOL_GPL(vcpu_load); @@ -167,7 +166,6 @@ void vcpu_put(struct kvm_vcpu *vcpu) kvm_arch_vcpu_put(vcpu); preempt_notifier_unregister(&vcpu->preempt_notifier); preempt_enable(); - mutex_unlock(&vcpu->mutex); } EXPORT_SYMBOL_GPL(vcpu_put); @@ -361,6 +359,9 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, kvm_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); + + kvm_arch_mmu_notifier_invalidate_range(kvm, start, end); + srcu_read_unlock(&kvm->srcu, idx); } @@ -469,6 +470,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, @@ -967,8 +969,7 @@ int __kvm_set_memory_region(struct kvm *kvm, /* Check for overlaps */ r = -EEXIST; kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) { - if ((slot->id >= KVM_USER_MEM_SLOTS) || - (slot->id == id)) + if (slot->id == id) continue; if (!((base_gfn + npages <= slot->base_gfn) || (base_gfn >= slot->base_gfn + slot->npages))) @@ -1315,17 +1316,6 @@ unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *w return gfn_to_hva_memslot_prot(slot, gfn, writable); } -static int get_user_page_nowait(unsigned long start, int write, - struct page **page) -{ - int flags = FOLL_NOWAIT | FOLL_HWPOISON; - - 
if (write) - flags |= FOLL_WRITE; - - return get_user_pages(start, 1, flags, page, NULL); -} - static inline int check_user_page_hwpoison(unsigned long addr) { int rc, flags = FOLL_HWPOISON | FOLL_WRITE; @@ -1374,7 +1364,8 @@ static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, bool *writable, kvm_pfn_t *pfn) { - struct page *page[1]; + unsigned int flags = FOLL_HWPOISON; + struct page *page; int npages = 0; might_sleep(); @@ -1382,35 +1373,26 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, if (writable) *writable = write_fault; - if (async) { - down_read(&current->mm->mmap_sem); - npages = get_user_page_nowait(addr, write_fault, page); - up_read(&current->mm->mmap_sem); - } else { - unsigned int flags = FOLL_HWPOISON; - - if (write_fault) - flags |= FOLL_WRITE; + if (write_fault) + flags |= FOLL_WRITE; + if (async) + flags |= FOLL_NOWAIT; - npages = get_user_pages_unlocked(addr, 1, page, flags); - } + npages = get_user_pages_unlocked(addr, 1, &page, flags); if (npages != 1) return npages; /* map read fault as writable if possible */ if (unlikely(!write_fault) && writable) { - struct page *wpage[1]; + struct page *wpage; - npages = __get_user_pages_fast(addr, 1, 1, wpage); - if (npages == 1) { + if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) { *writable = true; - put_page(page[0]); - page[0] = wpage[0]; + put_page(page); + page = wpage; } - - npages = 1; } - *pfn = page_to_pfn(page[0]); + *pfn = page_to_pfn(page); return npages; } @@ -1427,7 +1409,8 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) static int hva_to_pfn_remapped(struct vm_area_struct *vma, unsigned long addr, bool *async, - bool write_fault, kvm_pfn_t *p_pfn) + bool write_fault, bool *writable, + kvm_pfn_t *p_pfn) { unsigned long pfn; int r; @@ -1453,6 +1436,8 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, } + if (writable) + *writable = 
true; /* * Get a reference here because callers of *hva_to_pfn* and @@ -1518,7 +1503,7 @@ retry: if (vma == NULL) pfn = KVM_PFN_ERR_FAULT; else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { - r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn); + r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn); if (r == -EAGAIN) goto retry; if (r < 0) @@ -1679,11 +1664,12 @@ void kvm_release_page_dirty(struct page *page) } EXPORT_SYMBOL_GPL(kvm_release_page_dirty); -static void kvm_release_pfn_dirty(kvm_pfn_t pfn) +void kvm_release_pfn_dirty(kvm_pfn_t pfn) { kvm_set_pfn_dirty(pfn); kvm_release_pfn_clean(pfn); } +EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); void kvm_set_pfn_dirty(kvm_pfn_t pfn) { @@ -2065,6 +2051,29 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); +void kvm_sigset_activate(struct kvm_vcpu *vcpu) +{ + if (!vcpu->sigset_active) + return; + + /* + * This does a lockless modification of ->real_blocked, which is fine + * because, only current can change ->real_blocked and all readers of + * ->real_blocked don't care as long ->real_blocked is always a subset + * of ->blocked. 
+ */ + sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked); +} + +void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) +{ + if (!vcpu->sigset_active) + return; + + sigprocmask(SIG_SETMASK, &current->real_blocked, NULL); + sigemptyset(&current->real_blocked); +} + static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) { unsigned int old, val, grow; @@ -2302,7 +2311,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) continue; } else if (pass && i > last_boosted_vcpu) break; - if (!ACCESS_ONCE(vcpu->preempted)) + if (!READ_ONCE(vcpu->preempted)) continue; if (vcpu == me) continue; @@ -2387,7 +2396,10 @@ static struct file_operations kvm_vcpu_fops = { */ static int create_vcpu_fd(struct kvm_vcpu *vcpu) { - return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); + char name[8 + 1 + ITOA_MAX_LEN + 1]; + + snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); + return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); } static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) @@ -2519,19 +2531,16 @@ static long kvm_vcpu_ioctl(struct file *filp, if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) return -EINVAL; -#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) /* - * Special cases: vcpu ioctls that are asynchronous to vcpu execution, - * so vcpu_load() would break it. + * Some architectures have vcpu ioctls that are asynchronous to vcpu + * execution; mutex_lock() would break them. 
*/ - if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT) - return kvm_arch_vcpu_ioctl(filp, ioctl, arg); -#endif - - - r = vcpu_load(vcpu); - if (r) + r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); + if (r != -ENOIOCTLCMD) return r; + + if (mutex_lock_killable(&vcpu->mutex)) + return -EINTR; switch (ioctl) { case KVM_RUN: { struct pid *oldpid; @@ -2703,7 +2712,7 @@ out_free1: r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); } out: - vcpu_put(vcpu); + mutex_unlock(&vcpu->mutex); kfree(fpu); kfree(kvm_sregs); return r; @@ -2724,7 +2733,6 @@ static long kvm_vcpu_compat_ioctl(struct file *filp, case KVM_SET_SIGNAL_MASK: { struct kvm_signal_mask __user *sigmask_arg = argp; struct kvm_signal_mask kvm_sigmask; - compat_sigset_t csigset; sigset_t sigset; if (argp) { @@ -2733,13 +2741,11 @@ static long kvm_vcpu_compat_ioctl(struct file *filp, sizeof(kvm_sigmask))) goto out; r = -EINVAL; - if (kvm_sigmask.len != sizeof(csigset)) + if (kvm_sigmask.len != sizeof(compat_sigset_t)) goto out; r = -EFAULT; - if (copy_from_user(&csigset, sigmask_arg->sigset, - sizeof(csigset))) + if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset)) goto out; - sigset_from_compat(&sigset, &csigset); r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); } else r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); @@ -3158,21 +3164,18 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) return PTR_ERR(kvm); #ifdef CONFIG_KVM_MMIO r = kvm_coalesced_mmio_init(kvm); - if (r < 0) { - kvm_put_kvm(kvm); - return r; - } + if (r < 0) + goto put_kvm; #endif r = get_unused_fd_flags(O_CLOEXEC); - if (r < 0) { - kvm_put_kvm(kvm); - return r; - } + if (r < 0) + goto put_kvm; + file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); if (IS_ERR(file)) { put_unused_fd(r); - kvm_put_kvm(kvm); - return PTR_ERR(file); + r = PTR_ERR(file); + goto put_kvm; } /* @@ -3190,6 +3193,10 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) fd_install(r, file); return r; + +put_kvm: + 
kvm_put_kvm(kvm); + return r; } static long kvm_dev_ioctl(struct file *filp, @@ -4009,8 +4016,12 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, /* A kmem cache lets us meet the alignment requirements of fx_save. */ if (!vcpu_align) vcpu_align = __alignof__(struct kvm_vcpu); - kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align, - 0, NULL); + kvm_vcpu_cache = + kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align, + SLAB_ACCOUNT, + offsetof(struct kvm_vcpu, arch), + sizeof_field(struct kvm_vcpu, arch), + NULL); if (!kvm_vcpu_cache) { r = -ENOMEM; goto out_free_3; |