From 00a5c58d9499bd0c290b57205f43a70f2e69d3f6 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 4 Jul 2018 16:13:46 +1000 Subject: KVM: PPC: Make iommu_table::it_userspace big endian We are going to reuse multilevel TCE code for the userspace copy of the TCE table and since it is big endian, let's make the copy big endian too. Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy Acked-by: Paul Mackerras Signed-off-by: Michael Ellerman --- drivers/vfio/vfio_iommu_spapr_tce.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 759a5bdd40e1..8ab124a67311 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -230,7 +230,7 @@ static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl, decrement_locked_vm(mm, cb >> PAGE_SHIFT); return -ENOMEM; } - tbl->it_userspace = uas; + tbl->it_userspace = (__be64 *) uas; return 0; } @@ -482,20 +482,20 @@ static void tce_iommu_unuse_page_v2(struct tce_container *container, struct mm_iommu_table_group_mem_t *mem = NULL; int ret; unsigned long hpa = 0; - unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); if (!pua) return; - ret = tce_iommu_prereg_ua_to_hpa(container, *pua, IOMMU_PAGE_SIZE(tbl), - &hpa, &mem); + ret = tce_iommu_prereg_ua_to_hpa(container, be64_to_cpu(*pua), + IOMMU_PAGE_SIZE(tbl), &hpa, &mem); if (ret) - pr_debug("%s: tce %lx at #%lx was not cached, ret=%d\n", - __func__, *pua, entry, ret); + pr_debug("%s: tce %llx at #%lx was not cached, ret=%d\n", + __func__, be64_to_cpu(*pua), entry, ret); if (mem) mm_iommu_mapped_dec(mem); - *pua = 0; + *pua = cpu_to_be64(0); } static int tce_iommu_clear(struct tce_container *container, @@ -607,8 +607,7 @@ static long tce_iommu_build_v2(struct tce_container *container, for (i = 0; i < pages; ++i) { struct mm_iommu_table_group_mem_t *mem = NULL; - unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, - entry + i); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i); ret = tce_iommu_prereg_ua_to_hpa(container, tce, IOMMU_PAGE_SIZE(tbl), &hpa, &mem); @@ -642,7 +641,7 @@ static long tce_iommu_build_v2(struct tce_container *container, if (dirtmp != DMA_NONE) tce_iommu_unuse_page_v2(container, tbl, entry + i); - *pua = tce; + *pua = cpu_to_be64(tce); tce += IOMMU_PAGE_SIZE(tbl); } -- cgit v1.2.3-59-g8ed1b From 090bad39b237aad92d8e01baa033699cf0c81cbe Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 4 Jul 2018 16:13:47 +1000 Subject: powerpc/powernv: Add indirect levels to it_userspace We want to support sparse memory and therefore huge chunks of DMA windows do not need to be mapped. If a DMA window big enough to require 2 or more indirect levels, and a DMA window is used to map all RAM (which is a default case for 64bit window), we can actually save some memory by not allocation TCE for regions which we are not going to map anyway. The hardware tables alreary support indirect levels but we also keep host-physical-to-userspace translation array which is allocated by vmalloc() and is a flat array which might use quite some memory. This converts it_userspace from vmalloc'ed array to a multi level table. As the format becomes platform dependend, this replaces the direct access to it_usespace with a iommu_table_ops::useraddrptr hook which returns a pointer to the userspace copy of a TCE; future extension will return NULL if the level was not allocated. This should not change non-KVM handling of TCE tables and it_userspace will not be allocated for non-KVM tables. Reviewed-by: David Gibson Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 6 +-- arch/powerpc/kvm/book3s_64_vio_hv.c | 8 ---- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 65 +++++++++++++++++++++------ arch/powerpc/platforms/powernv/pci-ioda.c | 23 +++++++--- arch/powerpc/platforms/powernv/pci.h | 3 +- drivers/vfio/vfio_iommu_spapr_tce.c | 46 ------------------- 6 files changed, 73 insertions(+), 78 deletions(-) (limited to 'drivers/vfio') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 803ac70ecedf..4bdcf22509e6 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -69,6 +69,8 @@ struct iommu_table_ops { long index, unsigned long *hpa, enum dma_data_direction *direction); + + __be64 *(*useraddrptr)(struct iommu_table *tbl, long index); #endif void (*clear)(struct iommu_table *tbl, long index, long npages); @@ -123,9 +125,7 @@ struct iommu_table { }; #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ - ((tbl)->it_userspace ? \ - &((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \ - NULL) + ((tbl)->it_ops->useraddrptr((tbl), (entry))) /* Pure 2^n version of get_order */ static inline __attribute_const__ diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 236f74b210a7..ee98cf6180d7 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -206,10 +206,6 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, /* it_userspace allocation might be delayed */ return H_TOO_HARD; - pua = (void *) vmalloc_to_phys(pua); - if (WARN_ON_ONCE_RM(!pua)) - return H_HARDWARE; - mem = mm_iommu_lookup_rm(kvm->mm, be64_to_cpu(*pua), pgsize); if (!mem) return H_TOO_HARD; @@ -282,10 +278,6 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa))) return H_HARDWARE; - pua = (void *) vmalloc_to_phys(pua); - if (WARN_ON_ONCE_RM(!pua)) - return H_HARDWARE; - if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) return H_CLOSED; diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 726b8693f5ae..88cecc1815d9 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -31,9 +31,9 @@ void pnv_pci_setup_iommu_table(struct iommu_table *tbl, tbl->it_type = TCE_PCI; } -static __be64 *pnv_tce(struct iommu_table *tbl, long idx) +static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx) { - __be64 *tmp = ((__be64 *)tbl->it_base); + __be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base; int level = tbl->it_indirect_levels; const long shift = ilog2(tbl->it_level_size); unsigned long mask = (tbl->it_level_size - 1) << (level * shift); @@ -67,7 +67,7 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages, ((rpn + i) << tbl->it_page_shift); unsigned long idx = index - tbl->it_offset + i; - *(pnv_tce(tbl, idx)) = cpu_to_be64(newtce); + *(pnv_tce(tbl, false, idx)) = cpu_to_be64(newtce); } return 0; @@ -86,12 +86,21 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index, if (newtce & TCE_PCI_WRITE) newtce |= TCE_PCI_READ; - oldtce = be64_to_cpu(xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce))); + oldtce = be64_to_cpu(xchg(pnv_tce(tbl, false, idx), + cpu_to_be64(newtce))); *hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); *direction = iommu_tce_direction(oldtce); return 0; } + +__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index) +{ + if (WARN_ON_ONCE(!tbl->it_userspace)) + return NULL; + + return pnv_tce(tbl, true, index - tbl->it_offset); +} #endif void pnv_tce_free(struct iommu_table *tbl, long index, long npages) @@ -101,13 +110,15 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages) for (i = 0; i < npages; i++) { unsigned long idx = index - tbl->it_offset + i; - *(pnv_tce(tbl, idx)) = cpu_to_be64(0); + *(pnv_tce(tbl, false, idx)) = cpu_to_be64(0); } } unsigned long pnv_tce_get(struct iommu_table *tbl, long index) { - return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset))); + __be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset); + + return be64_to_cpu(*ptce); } static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr, @@ -144,6 +155,10 @@ void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl) pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size, tbl->it_indirect_levels); + if (tbl->it_userspace) { + pnv_pci_ioda2_table_do_free_pages(tbl->it_userspace, size, + tbl->it_indirect_levels); + } } static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned int shift, @@ -191,10 +206,11 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned int shift, long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table *tbl) + bool alloc_userspace_copy, struct iommu_table *tbl) { - void *addr; + void *addr, *uas = NULL; unsigned long offset = 0, level_shift, total_allocated = 0; + unsigned long total_allocated_uas = 0; const unsigned int window_shift = ilog2(window_size); unsigned int entries_shift = window_shift - page_shift; unsigned int table_shift = max_t(unsigned int, entries_shift + 3, @@ -228,10 +244,20 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, * we did not allocate as much as we wanted, * release partially allocated table. */ - if (offset < tce_table_size) { - pnv_pci_ioda2_table_do_free_pages(addr, - 1ULL << (level_shift - 3), levels - 1); - return -ENOMEM; + if (offset < tce_table_size) + goto free_tces_exit; + + /* Allocate userspace view of the TCE table */ + if (alloc_userspace_copy) { + offset = 0; + uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, + levels, tce_table_size, &offset, + &total_allocated_uas); + if (!uas) + goto free_tces_exit; + if (offset < tce_table_size || + total_allocated_uas != total_allocated) + goto free_uas_exit; } /* Setup linux iommu table */ @@ -240,11 +266,22 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, tbl->it_level_size = 1ULL << (level_shift - 3); tbl->it_indirect_levels = levels - 1; tbl->it_allocated_size = total_allocated; + tbl->it_userspace = uas; - pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n", - window_size, tce_table_size, bus_offset); + pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d\n", + window_size, tce_table_size, bus_offset, tbl->it_base, + tbl->it_userspace, levels); return 0; + +free_uas_exit: + pnv_pci_ioda2_table_do_free_pages(uas, + 1ULL << (level_shift - 3), levels - 1); +free_tces_exit: + pnv_pci_ioda2_table_do_free_pages(addr, + 1ULL << (level_shift - 3), levels - 1); + + return -ENOMEM; } static void pnv_iommu_table_group_link_free(struct rcu_head *head) diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 4abf1175626e..fc38f06ee41d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2036,6 +2036,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = { #ifdef CONFIG_IOMMU_API .exchange = pnv_ioda1_tce_xchg, .exchange_rm = pnv_ioda1_tce_xchg_rm, + .useraddrptr = pnv_tce_useraddrptr, #endif .clear = pnv_ioda1_tce_free, .get = pnv_tce_get, @@ -2200,6 +2201,7 @@ static struct iommu_table_ops pnv_ioda2_iommu_ops = { #ifdef CONFIG_IOMMU_API .exchange = pnv_ioda2_tce_xchg, .exchange_rm = pnv_ioda2_tce_xchg_rm, + .useraddrptr = pnv_tce_useraddrptr, #endif .clear = pnv_ioda2_tce_free, .get = pnv_tce_get, @@ -2455,7 +2457,7 @@ void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable) static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, int num, __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table **ptbl) + bool alloc_userspace_copy, struct iommu_table **ptbl) { struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe, table_group); @@ -2472,7 +2474,7 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group, ret = pnv_pci_ioda2_table_alloc_pages(nid, bus_offset, page_shift, window_size, - levels, tbl); + levels, alloc_userspace_copy, tbl); if (ret) { iommu_tce_table_put(tbl); return ret; @@ -2505,7 +2507,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe) rc = pnv_pci_ioda2_create_table(&pe->table_group, 0, IOMMU_PAGE_SHIFT_4K, window_size, - POWERNV_IOMMU_DEFAULT_LEVELS, &tbl); + POWERNV_IOMMU_DEFAULT_LEVELS, false, &tbl); if (rc) { pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc); @@ -2592,7 +2594,16 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift, tce_table_size, direct_table_size); } - return bytes; + return bytes + bytes; /* one for HW table, one for userspace copy */ +} + +static long pnv_pci_ioda2_create_table_userspace( + struct iommu_table_group *table_group, + int num, __u32 page_shift, __u64 window_size, __u32 levels, + struct iommu_table **ptbl) +{ + return pnv_pci_ioda2_create_table(table_group, + num, page_shift, window_size, levels, true, ptbl); } static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group) @@ -2621,7 +2632,7 @@ static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) static struct iommu_table_group_ops pnv_pci_ioda2_ops = { .get_table_size = pnv_pci_ioda2_get_table_size, - .create_table = pnv_pci_ioda2_create_table, + .create_table = pnv_pci_ioda2_create_table_userspace, .set_window = pnv_pci_ioda2_set_window, .unset_window = pnv_pci_ioda2_unset_window, .take_ownership = pnv_ioda2_take_ownership, @@ -2726,7 +2737,7 @@ static void pnv_ioda2_npu_take_ownership(struct iommu_table_group *table_group) static struct iommu_table_group_ops pnv_pci_ioda2_npu_ops = { .get_table_size = pnv_pci_ioda2_get_table_size, - .create_table = pnv_pci_ioda2_create_table, + .create_table = pnv_pci_ioda2_create_table_userspace, .set_window = pnv_pci_ioda2_npu_set_window, .unset_window = pnv_pci_ioda2_npu_unset_window, .take_ownership = pnv_ioda2_npu_take_ownership, diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index fa90f60e89ce..2962f6ddb2a8 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -267,11 +267,12 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); extern int pnv_tce_xchg(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction); +extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index); extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, __u32 page_shift, __u64 window_size, __u32 levels, - struct iommu_table *tbl); + bool alloc_userspace_copy, struct iommu_table *tbl); extern void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl); extern long pnv_pci_link_table_and_group(int node, int num, diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 8ab124a67311..54ae6c2be1b7 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -211,44 +211,6 @@ static long tce_iommu_register_pages(struct tce_container *container, return 0; } -static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl, - struct mm_struct *mm) -{ - unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) * - tbl->it_size, PAGE_SIZE); - unsigned long *uas; - long ret; - - BUG_ON(tbl->it_userspace); - - ret = try_increment_locked_vm(mm, cb >> PAGE_SHIFT); - if (ret) - return ret; - - uas = vzalloc(cb); - if (!uas) { - decrement_locked_vm(mm, cb >> PAGE_SHIFT); - return -ENOMEM; - } - tbl->it_userspace = (__be64 *) uas; - - return 0; -} - -static void tce_iommu_userspace_view_free(struct iommu_table *tbl, - struct mm_struct *mm) -{ - unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) * - tbl->it_size, PAGE_SIZE); - - if (!tbl->it_userspace) - return; - - vfree(tbl->it_userspace); - tbl->it_userspace = NULL; - decrement_locked_vm(mm, cb >> PAGE_SHIFT); -} - static bool tce_page_is_contained(struct page *page, unsigned page_shift) { /* @@ -599,12 +561,6 @@ static long tce_iommu_build_v2(struct tce_container *container, unsigned long hpa; enum dma_data_direction dirtmp; - if (!tbl->it_userspace) { - ret = tce_iommu_userspace_view_alloc(tbl, container->mm); - if (ret) - return ret; - } - for (i = 0; i < pages; ++i) { struct mm_iommu_table_group_mem_t *mem = NULL; __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry + i); @@ -685,7 +641,6 @@ static void tce_iommu_free_table(struct tce_container *container, { unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; - tce_iommu_userspace_view_free(tbl, container->mm); iommu_tce_table_put(tbl); decrement_locked_vm(container->mm, pages); } @@ -1200,7 +1155,6 @@ static void tce_iommu_release_ownership(struct tce_container *container, continue; tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); - tce_iommu_userspace_view_free(tbl, container->mm); if (tbl->it_map) iommu_release_ownership(tbl); -- cgit v1.2.3-59-g8ed1b From a68bd1267b7286b1687905651b404e765046de25 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 4 Jul 2018 16:13:49 +1000 Subject: powerpc/powernv/ioda: Allocate indirect TCE levels on demand At the moment we allocate the entire TCE table, twice (hardware part and userspace translation cache). This normally works as we normally have contigous memory and the guest will map entire RAM for 64bit DMA. However if we have sparse RAM (one example is a memory device), then we will allocate TCEs which will never be used as the guest only maps actual memory for DMA. If it is a single level TCE table, there is nothing we can really do but if it a multilevel table, we can skip allocating TCEs we know we won't need. This adds ability to allocate only first level, saving memory. This changes iommu_table::free() to avoid allocating of an extra level; iommu_table::set() will do this when needed. This adds @alloc parameter to iommu_table::exchange() to tell the callback if it can allocate an extra level; the flag is set to "false" for the realmode KVM handlers of H_PUT_TCE hcalls and the callback returns H_TOO_HARD. This still requires the entire table to be counted in mm::locked_vm. To be conservative, this only does on-demand allocation when the usespace cache table is requested which is the case of VFIO. The example math for a system replicating a powernv setup with NVLink2 in a guest: 16GB RAM mapped at 0x0 128GB GPU RAM window (16GB of actual RAM) mapped at 0x244000000000 the table to cover that all with 64K pages takes: (((0x244000000000 + 0x2000000000) >> 16)*8)>>20 = 4556MB If we allocate only necessary TCE levels, we will only need: (((0x400000000 + 0x400000000) >> 16)*8)>>20 = 4MB (plus some for indirect levels). Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/iommu.h | 7 ++- arch/powerpc/kvm/book3s_64_vio_hv.c | 4 +- arch/powerpc/platforms/powernv/pci-ioda-tce.c | 73 +++++++++++++++++++++------ arch/powerpc/platforms/powernv/pci-ioda.c | 8 +-- arch/powerpc/platforms/powernv/pci.h | 6 ++- drivers/vfio/vfio_iommu_spapr_tce.c | 2 +- 6 files changed, 73 insertions(+), 27 deletions(-) (limited to 'drivers/vfio') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index 4bdcf22509e6..daa3ee5d7ad2 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -70,7 +70,7 @@ struct iommu_table_ops { unsigned long *hpa, enum dma_data_direction *direction); - __be64 *(*useraddrptr)(struct iommu_table *tbl, long index); + __be64 *(*useraddrptr)(struct iommu_table *tbl, long index, bool alloc); #endif void (*clear)(struct iommu_table *tbl, long index, long npages); @@ -122,10 +122,13 @@ struct iommu_table { __be64 *it_userspace; /* userspace view of the table */ struct iommu_table_ops *it_ops; struct kref it_kref; + int it_nid; }; +#define IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry) \ + ((tbl)->it_ops->useraddrptr((tbl), (entry), false)) #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ - ((tbl)->it_ops->useraddrptr((tbl), (entry))) + ((tbl)->it_ops->useraddrptr((tbl), (entry), true)) /* Pure 2^n version of get_order */ static inline __attribute_const__ diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index ee98cf6180d7..d4bcd1b17b09 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -200,7 +200,7 @@ static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, { struct mm_iommu_table_group_mem_t *mem = NULL; const unsigned long pgsize = 1ULL << tbl->it_page_shift; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); if (!pua) /* it_userspace allocation might be delayed */ @@ -264,7 +264,7 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, { long ret; unsigned long hpa = 0; - __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry); struct mm_iommu_table_group_mem_t *mem; if (!pua) diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c index 123c49925b46..6c5db1acbe8d 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c @@ -48,7 +48,7 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift) return addr; } -static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx) +static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx, bool alloc) { __be64 *tmp = user ? tbl->it_userspace : (__be64 *) tbl->it_base; int level = tbl->it_indirect_levels; @@ -57,7 +57,23 @@ static __be64 *pnv_tce(struct iommu_table *tbl, bool user, long idx) while (level) { int n = (idx & mask) >> (level * shift); - unsigned long tce = be64_to_cpu(tmp[n]); + unsigned long tce; + + if (tmp[n] == 0) { + __be64 *tmp2; + + if (!alloc) + return NULL; + + tmp2 = pnv_alloc_tce_level(tbl->it_nid, + ilog2(tbl->it_level_size) + 3); + if (!tmp2) + return NULL; + + tmp[n] = cpu_to_be64(__pa(tmp2) | + TCE_PCI_READ | TCE_PCI_WRITE); + } + tce = be64_to_cpu(tmp[n]); tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE)); idx &= ~mask; @@ -84,7 +100,7 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages, ((rpn + i) << tbl->it_page_shift); unsigned long idx = index - tbl->it_offset + i; - *(pnv_tce(tbl, false, idx)) = cpu_to_be64(newtce); + *(pnv_tce(tbl, false, idx, true)) = cpu_to_be64(newtce); } return 0; @@ -92,31 +108,46 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages, #ifdef CONFIG_IOMMU_API int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction) + unsigned long *hpa, enum dma_data_direction *direction, + bool alloc) { u64 proto_tce = iommu_direction_to_tce_perm(*direction); unsigned long newtce = *hpa | proto_tce, oldtce; unsigned long idx = index - tbl->it_offset; + __be64 *ptce = NULL; BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl)); + if (*direction == DMA_NONE) { + ptce = pnv_tce(tbl, false, idx, false); + if (!ptce) { + *hpa = 0; + return 0; + } + } + + if (!ptce) { + ptce = pnv_tce(tbl, false, idx, alloc); + if (!ptce) + return alloc ? H_HARDWARE : H_TOO_HARD; + } + if (newtce & TCE_PCI_WRITE) newtce |= TCE_PCI_READ; - oldtce = be64_to_cpu(xchg(pnv_tce(tbl, false, idx), - cpu_to_be64(newtce))); + oldtce = be64_to_cpu(xchg(ptce, cpu_to_be64(newtce))); *hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE); *direction = iommu_tce_direction(oldtce); return 0; } -__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index) +__be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, bool alloc) { if (WARN_ON_ONCE(!tbl->it_userspace)) return NULL; - return pnv_tce(tbl, true, index - tbl->it_offset); + return pnv_tce(tbl, true, index - tbl->it_offset, alloc); } #endif @@ -126,14 +157,19 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages) for (i = 0; i < npages; i++) { unsigned long idx = index - tbl->it_offset + i; + __be64 *ptce = pnv_tce(tbl, false, idx, false); - *(pnv_tce(tbl, false, idx)) = cpu_to_be64(0); + if (ptce) + *ptce = cpu_to_be64(0); } } unsigned long pnv_tce_get(struct iommu_table *tbl, long index) { - __be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset); + __be64 *ptce = pnv_tce(tbl, false, index - tbl->it_offset, false); + + if (!ptce) + return 0; return be64_to_cpu(*ptce); } @@ -224,6 +260,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, unsigned int table_shift = max_t(unsigned int, entries_shift + 3, PAGE_SHIFT); const unsigned long tce_table_size = 1UL << table_shift; + unsigned int tmplevels = levels; if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS)) return -EINVAL; @@ -231,6 +268,9 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, if (!is_power_of_2(window_size)) return -EINVAL; + if (alloc_userspace_copy && (window_size > (1ULL << 32))) + tmplevels = 1; + /* Adjust direct table size from window_size and levels */ entries_shift = (entries_shift + levels - 1) / levels; level_shift = entries_shift + 3; @@ -241,7 +281,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, /* Allocate TCE table */ addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift, - levels, tce_table_size, &offset, &total_allocated); + tmplevels, tce_table_size, &offset, &total_allocated); /* addr==NULL means that the first level allocation failed */ if (!addr) @@ -252,7 +292,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, * we did not allocate as much as we wanted, * release partially allocated table. */ - if (offset < tce_table_size) + if (tmplevels == levels && offset < tce_table_size) goto free_tces_exit; /* Allocate userspace view of the TCE table */ @@ -263,8 +303,8 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, &total_allocated_uas); if (!uas) goto free_tces_exit; - if (offset < tce_table_size || - total_allocated_uas != total_allocated) + if (tmplevels == levels && (offset < tce_table_size || + total_allocated_uas != total_allocated)) goto free_uas_exit; } @@ -275,10 +315,11 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, tbl->it_indirect_levels = levels - 1; tbl->it_allocated_size = total_allocated; tbl->it_userspace = uas; + tbl->it_nid = nid; - pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d\n", + pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n", window_size, tce_table_size, bus_offset, tbl->it_base, - tbl->it_userspace, levels); + tbl->it_userspace, tmplevels, levels); return 0; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index fc38f06ee41d..b4475f71a0b4 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2003,7 +2003,7 @@ static int pnv_ioda1_tce_build(struct iommu_table *tbl, long index, static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction); + long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); if (!ret) pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, false); @@ -2014,7 +2014,7 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index, static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction); + long ret = pnv_tce_xchg(tbl, index, hpa, direction, false); if (!ret) pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true); @@ -2168,7 +2168,7 @@ static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction); + long ret = pnv_tce_xchg(tbl, index, hpa, direction, true); if (!ret) pnv_pci_ioda2_tce_invalidate(tbl, index, 1, false); @@ -2179,7 +2179,7 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index, static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index, unsigned long *hpa, enum dma_data_direction *direction) { - long ret = pnv_tce_xchg(tbl, index, hpa, direction); + long ret = pnv_tce_xchg(tbl, index, hpa, direction, false); if (!ret) pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true); diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index 2962f6ddb2a8..0020937fc694 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -266,8 +266,10 @@ extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long attrs); extern void pnv_tce_free(struct iommu_table *tbl, long index, long npages); extern int pnv_tce_xchg(struct iommu_table *tbl, long index, - unsigned long *hpa, enum dma_data_direction *direction); -extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index); + unsigned long *hpa, enum dma_data_direction *direction, + bool alloc); +extern __be64 *pnv_tce_useraddrptr(struct iommu_table *tbl, long index, + bool alloc); extern unsigned long pnv_tce_get(struct iommu_table *tbl, long index); extern long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset, diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 54ae6c2be1b7..11a4c194d6e3 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -631,7 +631,7 @@ static long tce_iommu_create_table(struct tce_container *container, page_shift, window_size, levels, ptbl); WARN_ON(!ret && !(*ptbl)->it_ops->free); - WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size)); + WARN_ON(!ret && ((*ptbl)->it_allocated_size > table_size)); return ret; } -- cgit v1.2.3-59-g8ed1b