diff options
Diffstat (limited to 'arch/s390/mm/gmap.c')
-rw-r--r-- | arch/s390/mm/gmap.c | 226 |
1 files changed, 188 insertions, 38 deletions
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index dfee0ebb2fac..02d15c8dc92e 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -974,18 +974,18 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr, return -EAGAIN; if (prot == PROT_NONE && !pmd_i) { - pmd_val(new) |= _SEGMENT_ENTRY_INVALID; + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); gmap_pmdp_xchg(gmap, pmdp, new, gaddr); } if (prot == PROT_READ && !pmd_p) { - pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID; - pmd_val(new) |= _SEGMENT_ENTRY_PROTECT; + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID)); + new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT)); gmap_pmdp_xchg(gmap, pmdp, new, gaddr); } if (bits & GMAP_NOTIFY_MPROT) - pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN; + set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); /* Shadow GMAP protection needs split PMDs */ if (bits & GMAP_NOTIFY_SHADOW) @@ -1151,7 +1151,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val) address = pte_val(pte) & PAGE_MASK; address += gaddr & ~PAGE_MASK; *val = *(unsigned long *) address; - pte_val(*ptep) |= _PAGE_YOUNG; + set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG))); /* Do *NOT* clear the _PAGE_INVALID bit! */ rc = 0; } @@ -1183,6 +1183,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table); static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, struct gmap_rmap *rmap) { + struct gmap_rmap *temp; void __rcu **slot; BUG_ON(!gmap_is_shadow(sg)); @@ -1190,6 +1191,12 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->raddr == rmap->raddr) { + kfree(rmap); + return; + } + } radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; @@ -1278,7 +1285,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr, static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr) { asm volatile( - " .insn rrf,0xb98e0000,%0,%1,0,0" + " idte %0,0,%1" : : "a" (asce), "a" (vaddr) : "cc", "memory"); } @@ -2275,7 +2282,7 @@ EXPORT_SYMBOL_GPL(ptep_notify); static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp, unsigned long gaddr) { - pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN; + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN))); gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1); } @@ -2294,7 +2301,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, { gaddr &= HPAGE_MASK; pmdp_notify_gmap(gmap, pmdp, gaddr); - pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN; + new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); @@ -2302,7 +2309,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new, __pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL); else __pmdp_csp(pmdp); - *pmdp = new; + set_pmd(pmdp, new); } static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, @@ -2324,7 +2331,7 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, _SEGMENT_ENTRY_GMAP_UC)); if (purge) __pmdp_csp(pmdp); - pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY; + set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); } spin_unlock(&gmap->guest_table_lock); } @@ -2447,7 +2454,7 @@ static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp, return false; /* Clear UC indication and reset protection */ - pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC; + set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC))); gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0); return true; } @@ -2508,8 +2515,9 @@ static const struct mm_walk_ops thp_split_walk_ops = { static inline void thp_split_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { + for_each_vma(vmi, vma) { vma->vm_flags &= ~VM_HUGEPAGE; vma->vm_flags |= VM_NOHUGEPAGE; walk_page_vma(vma, &thp_split_walk_ops, NULL); @@ -2577,8 +2585,9 @@ int gmap_mark_unmergeable(void) struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int ret; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, MADV_UNMERGEABLE, &vma->vm_flags); if (ret) @@ -2601,6 +2610,18 @@ static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr, return 0; } +/* + * Give a chance to schedule after setting a key to 256 pages. + * We only hold the mm lock, which is a rwsem and the kvm srcu. + * Both can sleep. + */ +static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + cond_resched(); + return 0; +} + static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, unsigned long hmask, unsigned long next, struct mm_walk *walk) @@ -2623,12 +2644,14 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr, end = start + HPAGE_SIZE - 1; __storage_key_init_range(start, end); set_bit(PG_arch_1, &page->flags); + cond_resched(); return 0; } static const struct mm_walk_ops enable_skey_walk_ops = { .hugetlb_entry = __s390_enable_skey_hugetlb, .pte_entry = __s390_enable_skey_pte, + .pmd_entry = __s390_enable_skey_pmd, }; int s390_enable_skey(void) @@ -2676,41 +2699,168 @@ void s390_reset_cmma(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(s390_reset_cmma); +#define GATHER_GET_PAGES 32 + +struct reset_walk_state { + unsigned long next; + unsigned long count; + unsigned long pfns[GATHER_GET_PAGES]; +}; + +static int s390_gather_pages(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct reset_walk_state *p = walk->private; + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + /* we have a reference from the mapping, take an extra one */ + get_page(phys_to_page(pte_val(pte))); + p->pfns[p->count] = phys_to_pfn(pte_val(pte)); + p->next = next; + p->count++; + } + return p->count >= GATHER_GET_PAGES; +} + +static const struct mm_walk_ops gather_pages_ops = { + .pte_entry = s390_gather_pages, +}; + /* - * make inaccessible pages accessible again + * Call the Destroy secure page UVC on each page in the given array of PFNs. + * Each page needs to have an extra reference, which will be released here. */ -static int __s390_reset_acc(pte_t *ptep, unsigned long addr, - unsigned long next, struct mm_walk *walk) +void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns) { - pte_t pte = READ_ONCE(*ptep); + unsigned long i; - /* There is a reference through the mapping */ - if (pte_present(pte)) - WARN_ON_ONCE(uv_destroy_owned_page(pte_val(pte) & PAGE_MASK)); + for (i = 0; i < count; i++) { + /* we always have an extra reference */ + uv_destroy_owned_page(pfn_to_phys(pfns[i])); + /* get rid of the extra reference */ + put_page(pfn_to_page(pfns[i])); + cond_resched(); + } +} +EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns); +/** + * __s390_uv_destroy_range - Call the destroy secure page UVC on each page + * in the given range of the given address space. + * @mm: the mm to operate on + * @start: the start of the range + * @end: the end of the range + * @interruptible: if not 0, stop when a fatal signal is received + * + * Walk the given range of the given address space and call the destroy + * secure page UVC on each page. Optionally exit early if a fatal signal is + * pending. + * + * Return: 0 on success, -EINTR if the function stopped before completing + */ +int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start, + unsigned long end, bool interruptible) +{ + struct reset_walk_state state = { .next = start }; + int r = 1; + + while (r > 0) { + state.count = 0; + mmap_read_lock(mm); + r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state); + mmap_read_unlock(mm); + cond_resched(); + s390_uv_destroy_pfns(state.count, state.pfns); + if (interruptible && fatal_signal_pending(current)) + return -EINTR; + } return 0; } +EXPORT_SYMBOL_GPL(__s390_uv_destroy_range); -static const struct mm_walk_ops reset_acc_walk_ops = { - .pte_entry = __s390_reset_acc, -}; +/** + * s390_unlist_old_asce - Remove the topmost level of page tables from the + * list of page tables of the gmap. + * @gmap: the gmap whose table is to be removed + * + * On s390x, KVM keeps a list of all pages containing the page tables of the + * gmap (the CRST list). This list is used at tear down time to free all + * pages that are now not needed anymore. + * + * This function removes the topmost page of the tree (the one pointed to by + * the ASCE) from the CRST list. + * + * This means that it will not be freed when the VM is torn down, and needs + * to be handled separately by the caller, unless a leak is actually + * intended. Notice that this function will only remove the page from the + * list, the page will still be used as a top level page table (and ASCE). + */ +void s390_unlist_old_asce(struct gmap *gmap) +{ + struct page *old; -#include <linux/sched/mm.h> -void s390_reset_acc(struct mm_struct *mm) + old = virt_to_page(gmap->table); + spin_lock(&gmap->guest_table_lock); + list_del(&old->lru); + /* + * Sometimes the topmost page might need to be "removed" multiple + * times, for example if the VM is rebooted into secure mode several + * times concurrently, or if s390_replace_asce fails after calling + * s390_remove_old_asce and is attempted again later. In that case + * the old asce has been removed from the list, and therefore it + * will not be freed when the VM terminates, but the ASCE is still + * in use and still pointed to. + * A subsequent call to replace_asce will follow the pointer and try + * to remove the same page from the list again. + * Therefore it's necessary that the page of the ASCE has valid + * pointers, so list_del can work (and do nothing) without + * dereferencing stale or invalid pointers. + */ + INIT_LIST_HEAD(&old->lru); + spin_unlock(&gmap->guest_table_lock); +} +EXPORT_SYMBOL_GPL(s390_unlist_old_asce); + +/** + * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy + * @gmap: the gmap whose ASCE needs to be replaced + * + * If the allocation of the new top level page table fails, the ASCE is not + * replaced. + * In any case, the old ASCE is always removed from the gmap CRST list. + * Therefore the caller has to make sure to save a pointer to it + * beforehand, unless a leak is actually intended. + */ +int s390_replace_asce(struct gmap *gmap) { - if (!mm_is_protected(mm)) - return; + unsigned long asce; + struct page *page; + void *table; + + s390_unlist_old_asce(gmap); + + page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER); + if (!page) + return -ENOMEM; + table = page_to_virt(page); + memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT)); + /* - * we might be called during - * reset: we walk the pages and clear - * close of all kvm file descriptors: we walk the pages and clear - * exit of process on fd closure: vma already gone, do nothing + * The caller has to deal with the old ASCE, but here we make sure + * the new one is properly added to the CRST list, so that + * it will be freed when the VM is torn down. */ - if (!mmget_not_zero(mm)) - return; - mmap_read_lock(mm); - walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL); - mmap_read_unlock(mm); - mmput(mm); + spin_lock(&gmap->guest_table_lock); + list_add(&page->lru, &gmap->crst_list); + spin_unlock(&gmap->guest_table_lock); + + /* Set new table origin while preserving existing ASCE control bits */ + asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table); + WRITE_ONCE(gmap->asce, asce); + WRITE_ONCE(gmap->mm->context.gmap_asce, asce); + WRITE_ONCE(gmap->table, table); + + return 0; } -EXPORT_SYMBOL_GPL(s390_reset_acc); +EXPORT_SYMBOL_GPL(s390_replace_asce); |