From 450744051d201c4d72436ebf5b04b9a06ba2cf30 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 24 Mar 2016 13:05:18 -0600 Subject: vfio/pci: Hide broken INTx support from user INTx masking has two components, the first is that we need the ability to prevent the device from continuing to assert INTx. This is provided via the DisINTx bit in the command register and is the only thing we can really probe for when testing if INTx masking is supported. The second component is that the device needs to indicate if INTx is asserted via the interrupt status bit in the device status register. With these two features we can generically determine if one of the devices we own is asserting INTx, signal the user, and mask the interrupt while the user services the device. Generally if one or both of these components is broken we resort to APIC level interrupt masking, which requires an exclusive interrupt since we have no way to determine the source of the interrupt in a shared configuration. This often makes it difficult or impossible to configure the system for userspace use of the device, for an interrupt mode that the user may not need. One possible configuration of broken INTx masking is that the DisINTx support is fully functional, but the interrupt status bit never signals interrupt assertion. In this case we do have the ability to prevent the device from asserting INTx, but lack the ability to identify the interrupt source. For this case we can simply pretend that the device lacks INTx support entirely, keeping DisINTx set on the physical device, virtualizing this bit for the user, and virtualizing the interrupt pin register to indicate no INTx support. We already support virtualization of the DisINTx bit and already virtualize the interrupt pin for platforms without INTx support. By tying these components together, setting DisINTx on open and reset, and identifying devices broken in this particular way, we can provide support for them w/o the handicap of APIC level INTx masking. Intel i40e (XL710/X710) 10/20/40GbE NICs have been identified as being broken in this specific way. We leave the vfio-pci.nointxmask option as a mechanism to bypass this support, enabling INTx on the device with all the requirements of APIC level masking. Signed-off-by: Alex Williamson Cc: John Ronciak Cc: Jesse Brandeburg --- drivers/vfio/pci/vfio_pci.c | 55 ++++++++++++++++++++++++++++++------- drivers/vfio/pci/vfio_pci_config.c | 9 +++++- drivers/vfio/pci/vfio_pci_private.h | 1 + 3 files changed, 54 insertions(+), 11 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 712a84978e97..188b1ff03f5f 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -113,6 +113,35 @@ static inline bool vfio_pci_is_vga(struct pci_dev *pdev) static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev); static void vfio_pci_disable(struct vfio_pci_device *vdev); +/* + * INTx masking requires the ability to disable INTx signaling via PCI_COMMAND + * _and_ the ability detect when the device is asserting INTx via PCI_STATUS. + * If a device implements the former but not the latter we would typically + * expect broken_intx_masking be set and require an exclusive interrupt. + * However since we do have control of the device's ability to assert INTx, + * we can instead pretend that the device does not implement INTx, virtualizing + * the pin register to report zero and maintaining DisINTx set on the host. + */ +static bool vfio_pci_nointx(struct pci_dev *pdev) +{ + switch (pdev->vendor) { + case PCI_VENDOR_ID_INTEL: + switch (pdev->device) { + /* All i40e (XL710/X710) 10/20/40GbE NICs */ + case 0x1572: + case 0x1574: + case 0x1580 ... 0x1581: + case 0x1583 ... 0x1589: + case 0x37d0 ... 0x37d2: + return true; + default: + return false; + } + } + + return false; +} + static int vfio_pci_enable(struct vfio_pci_device *vdev) { struct pci_dev *pdev = vdev->pdev; @@ -136,23 +165,29 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) pr_debug("%s: Couldn't store %s saved state\n", __func__, dev_name(&pdev->dev)); - ret = vfio_config_init(vdev); - if (ret) { - kfree(vdev->pci_saved_state); - vdev->pci_saved_state = NULL; - pci_disable_device(pdev); - return ret; + if (likely(!nointxmask)) { + if (vfio_pci_nointx(pdev)) { + dev_info(&pdev->dev, "Masking broken INTx support\n"); + vdev->nointx = true; + pci_intx(pdev, 0); + } else + vdev->pci_2_3 = pci_intx_mask_supported(pdev); } - if (likely(!nointxmask)) - vdev->pci_2_3 = pci_intx_mask_supported(pdev); - pci_read_config_word(pdev, PCI_COMMAND, &cmd); if (vdev->pci_2_3 && (cmd & PCI_COMMAND_INTX_DISABLE)) { cmd &= ~PCI_COMMAND_INTX_DISABLE; pci_write_config_word(pdev, PCI_COMMAND, cmd); } + ret = vfio_config_init(vdev); + if (ret) { + kfree(vdev->pci_saved_state); + vdev->pci_saved_state = NULL; + pci_disable_device(pdev); + return ret; + } + msix_pos = pdev->msix_cap; if (msix_pos) { u16 flags; @@ -304,7 +339,7 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { u8 pin; pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); - if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && pin) + if (IS_ENABLED(CONFIG_VFIO_PCI_INTX) && !vdev->nointx && pin) return 1; } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 142c533efec7..c9bb2290d39d 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -408,6 +408,7 @@ static void vfio_bar_restore(struct vfio_pci_device *vdev) { struct pci_dev *pdev = vdev->pdev; u32 *rbar = vdev->rbar; + u16 cmd; int i; if (pdev->is_virtfn) @@ -420,6 +421,12 @@ static void vfio_bar_restore(struct vfio_pci_device *vdev) pci_user_write_config_dword(pdev, i, *rbar); pci_user_write_config_dword(pdev, PCI_ROM_ADDRESS, *rbar); + + if (vdev->nointx) { + pci_user_read_config_word(pdev, PCI_COMMAND, &cmd); + cmd |= PCI_COMMAND_INTX_DISABLE; + pci_user_write_config_word(pdev, PCI_COMMAND, cmd); + } } static __le32 vfio_generate_bar_flags(struct pci_dev *pdev, int bar) @@ -1545,7 +1552,7 @@ int vfio_config_init(struct vfio_pci_device *vdev) *(__le16 *)&vconfig[PCI_DEVICE_ID] = cpu_to_le16(pdev->device); } - if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX)) + if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx) vconfig[PCI_INTERRUPT_PIN] = 0; ret = vfio_cap_init(vdev); diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 8a7d546d18a0..016c14a1b454 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -83,6 +83,7 @@ struct vfio_pci_device { bool bardirty; bool has_vga; bool needs_reset; + bool nointx; struct pci_saved_state *pci_saved_state; int refcnt; struct eventfd_ctx *err_trigger; -- cgit v1.2.3-59-g8ed1b From dc928109973336fd4d572d460cf3964d56aa087f Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 24 Mar 2016 13:06:16 -0600 Subject: vfio/pci: Add test for BAR restore If a device is reset without the memory or i/o bits enabled in the command register we may not detect it, potentially leaving the device without valid BAR programming. Add an additional test to check the BARs on each write to the command register. Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_config.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index c9bb2290d39d..b31107803362 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -522,6 +522,23 @@ static int vfio_basic_config_read(struct vfio_pci_device *vdev, int pos, return count; } +/* Test whether BARs match the value we think they should contain */ +static bool vfio_need_bar_restore(struct vfio_pci_device *vdev) +{ + int i = 0, pos = PCI_BASE_ADDRESS_0, ret; + u32 bar; + + for (; pos <= PCI_BASE_ADDRESS_5; i++, pos += 4) { + if (vdev->rbar[i]) { + ret = pci_user_read_config_dword(vdev->pdev, pos, &bar); + if (ret || vdev->rbar[i] != bar) + return true; + } + } + + return false; +} + static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos, int count, struct perm_bits *perm, int offset, __le32 val) @@ -560,7 +577,8 @@ static int vfio_basic_config_write(struct vfio_pci_device *vdev, int pos, * SR-IOV devices will trigger this, but we catch them later */ if ((new_mem && virt_mem && !phys_mem) || - (new_io && virt_io && !phys_io)) + (new_io && virt_io && !phys_io) || + vfio_need_bar_restore(vdev)) vfio_bar_restore(vdev); } -- cgit v1.2.3-59-g8ed1b From 5ed4aba1265f3c7532bf4ae1f25a277568b86871 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 8 Apr 2016 14:54:41 +1000 Subject: vfio_iommu_spapr_tce: Remove unneeded iommu_group_get_iommudata This removes iommu_group_get_iommudata() as the result is never used. As this is a minor cleanup, no change in behavior is expected. Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_spapr_tce.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 0582b72ef377..6419566a56d4 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -331,14 +331,12 @@ static void tce_iommu_free_table(struct iommu_table *tbl); static void tce_iommu_release(void *iommu_data) { struct tce_container *container = iommu_data; - struct iommu_table_group *table_group; struct tce_iommu_group *tcegrp; long i; while (tce_groups_attached(container)) { tcegrp = list_first_entry(&container->group_list, struct tce_iommu_group, next); - table_group = iommu_group_get_iommudata(tcegrp->grp); tce_iommu_detach_group(iommu_data, tcegrp->grp); } -- cgit v1.2.3-59-g8ed1b From f70552809419cd2abc0cc6469a07c9792a3aaa6c Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 29 Apr 2016 14:11:52 +1000 Subject: vfio_pci: Test for extended capabilities if config space > 256 bytes PCI-Express spec says that reading 4 bytes at offset 100h should return zero if there is no extended capability so VFIO reads this dword to know if there are extended capabilities. However it is not always possible to access the extended space so generic PCI code in pci_cfg_space_size_ext() checks if pci_read_config_dword() can read beyond 100h and if the check fails, it sets the config space size to 100h. VFIO does its own extended capabilities check by reading at offset 100h which may produce 0xffffffff which VFIO treats as the extended config space presense and calls vfio_ecap_init() which fails to parse capabilities (which is expected) but right before the exit, it writes zero at offset 100h which is beyond the buffer allocated for vdev->vconfig (which is 256 bytes) which leads to random memory corruption. This makes VFIO only check for the extended capabilities if the discovered config size is more than 256 bytes. Signed-off-by: Alexey Kardashevskiy Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_config.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index b31107803362..93601407dab8 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -1149,9 +1149,12 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) return pcibios_err_to_errno(ret); if (PCI_X_CMD_VERSION(word)) { - /* Test for extended capabilities */ - pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword); - vdev->extended_caps = (dword != 0); + if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) { + /* Test for extended capabilities */ + pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, + &dword); + vdev->extended_caps = (dword != 0); + } return PCI_CAP_PCIX_SIZEOF_V2; } else return PCI_CAP_PCIX_SIZEOF_V0; @@ -1163,9 +1166,11 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos) return byte; case PCI_CAP_ID_EXP: - /* Test for extended capabilities */ - pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword); - vdev->extended_caps = (dword != 0); + if (pdev->cfg_size > PCI_CFG_SPACE_SIZE) { + /* Test for extended capabilities */ + pci_read_config_dword(pdev, PCI_CFG_SPACE_SIZE, &dword); + vdev->extended_caps = (dword != 0); + } /* length based on version */ if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1) -- cgit v1.2.3-59-g8ed1b