diff options
Diffstat (limited to 'arch/x86/mm/pat/set_memory.c')
-rw-r--r-- | arch/x86/mm/pat/set_memory.c | 389 |
1 files changed, 342 insertions, 47 deletions
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 80c9037ffadf..8834c76f91c9 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -32,8 +32,6 @@ #include <asm/pgalloc.h> #include <asm/proto.h> #include <asm/memtype.h> -#include <asm/hyperv-tlfs.h> -#include <asm/mshyperv.h> #include "../mm_internal.h" @@ -75,6 +73,7 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ +#define CPA_COLLAPSE 16 /* try to collapse large pages */ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) { @@ -107,6 +106,18 @@ static void split_page_count(int level) direct_pages_count[level - 1] += PTRS_PER_PTE; } +static void collapse_page_count(int level) +{ + direct_pages_count[level]++; + if (system_state == SYSTEM_RUNNING) { + if (level == PG_LEVEL_2M) + count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE); + else if (level == PG_LEVEL_1G) + count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE); + } + direct_pages_count[level - 1] -= PTRS_PER_PTE; +} + void arch_report_meminfo(struct seq_file *m) { seq_printf(m, "DirectMap4k: %8lu kB\n", @@ -124,6 +135,7 @@ void arch_report_meminfo(struct seq_file *m) } #else static inline void split_page_count(int level) { } +static inline void collapse_page_count(int level) { } #endif #ifdef CONFIG_X86_CPA_STATISTICS @@ -213,14 +225,14 @@ within(unsigned long addr, unsigned long start, unsigned long end) return addr >= start && addr < end; } +#ifdef CONFIG_X86_64 + static inline int within_inclusive(unsigned long addr, unsigned long start, unsigned long end) { return addr >= start && addr <= end; } -#ifdef CONFIG_X86_64 - /* * The kernel image is mapped into two places in the virtual address space * (addresses without KASLR, of course): @@ -354,7 +366,7 @@ bool cpu_cache_has_invalidate_memregion(void) { return !cpu_feature_enabled(X86_FEATURE_HYPERVISOR); } -EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, DEVMEM); +EXPORT_SYMBOL_NS_GPL(cpu_cache_has_invalidate_memregion, "DEVMEM"); int cpu_cache_invalidate_memregion(int res_desc) { @@ -363,7 +375,7 @@ int cpu_cache_invalidate_memregion(int res_desc) wbinvd_on_all_cpus(); return 0; } -EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, DEVMEM); +EXPORT_SYMBOL_NS_GPL(cpu_cache_invalidate_memregion, "DEVMEM"); #endif static void __cpa_flush_all(void *arg) @@ -396,16 +408,49 @@ static void __cpa_flush_tlb(void *data) flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); } -static void cpa_flush(struct cpa_data *data, int cache) +static int collapse_large_pages(unsigned long addr, struct list_head *pgtables); + +static void cpa_collapse_large_pages(struct cpa_data *cpa) +{ + unsigned long start, addr, end; + struct ptdesc *ptdesc, *tmp; + LIST_HEAD(pgtables); + int collapsed = 0; + int i; + + if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { + for (i = 0; i < cpa->numpages; i++) + collapsed += collapse_large_pages(__cpa_addr(cpa, i), + &pgtables); + } else { + addr = __cpa_addr(cpa, 0); + start = addr & PMD_MASK; + end = addr + PAGE_SIZE * cpa->numpages; + + for (addr = start; within(addr, start, end); addr += PMD_SIZE) + collapsed += collapse_large_pages(addr, &pgtables); + } + + if (!collapsed) + return; + + flush_tlb_all(); + + list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) { + list_del(&ptdesc->pt_list); + __free_page(ptdesc_page(ptdesc)); + } +} + +static void cpa_flush(struct cpa_data *cpa, int cache) { - struct cpa_data *cpa = data; unsigned int i; BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); - return; + goto collapse_large_pages; } if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) @@ -414,7 +459,7 @@ static void cpa_flush(struct cpa_data *data, int cache) on_each_cpu(__cpa_flush_tlb, cpa, 1); if (!cache) - return; + goto collapse_large_pages; mb(); for (i = 0; i < cpa->numpages; i++) { @@ -430,6 +475,10 @@ static void cpa_flush(struct cpa_data *data, int cache) clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE); } mb(); + +collapse_large_pages: + if (cpa->flags & CPA_COLLAPSE) + cpa_collapse_large_pages(cpa); } static bool overlaps(unsigned long r1_start, unsigned long r1_end, @@ -619,7 +668,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long start, * Validate strict W^X semantics. */ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long start, - unsigned long pfn, unsigned long npg) + unsigned long pfn, unsigned long npg, + bool nx, bool rw) { unsigned long end; @@ -641,6 +691,10 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star if ((pgprot_val(new) & (_PAGE_RW | _PAGE_NX)) != _PAGE_RW) return new; + /* Non-leaf translation entries can disable writing or execution. */ + if (!rw || nx) + return new; + end = start + npg * PAGE_SIZE - 1; WARN_ONCE(1, "CPA detected W^X violation: %016llx -> %016llx range: 0x%016lx - 0x%016lx PFN %lx\n", (unsigned long long)pgprot_val(old), @@ -657,56 +711,82 @@ static inline pgprot_t verify_rwx(pgprot_t old, pgprot_t new, unsigned long star /* * Lookup the page table entry for a virtual address in a specific pgd. - * Return a pointer to the entry and the level of the mapping. + * Return a pointer to the entry (or NULL if the entry does not exist), + * the level of the entry, and the effective NX and RW bits of all + * page table levels. */ -pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, - unsigned int *level) +pte_t *lookup_address_in_pgd_attr(pgd_t *pgd, unsigned long address, + unsigned int *level, bool *nx, bool *rw) { p4d_t *p4d; pud_t *pud; pmd_t *pmd; - *level = PG_LEVEL_NONE; + *level = PG_LEVEL_256T; + *nx = false; + *rw = true; if (pgd_none(*pgd)) return NULL; + *level = PG_LEVEL_512G; + *nx |= pgd_flags(*pgd) & _PAGE_NX; + *rw &= pgd_flags(*pgd) & _PAGE_RW; + p4d = p4d_offset(pgd, address); if (p4d_none(*p4d)) return NULL; - *level = PG_LEVEL_512G; if (p4d_leaf(*p4d) || !p4d_present(*p4d)) return (pte_t *)p4d; + *level = PG_LEVEL_1G; + *nx |= p4d_flags(*p4d) & _PAGE_NX; + *rw &= p4d_flags(*p4d) & _PAGE_RW; + pud = pud_offset(p4d, address); if (pud_none(*pud)) return NULL; - *level = PG_LEVEL_1G; if (pud_leaf(*pud) || !pud_present(*pud)) return (pte_t *)pud; + *level = PG_LEVEL_2M; + *nx |= pud_flags(*pud) & _PAGE_NX; + *rw &= pud_flags(*pud) & _PAGE_RW; + pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) return NULL; - *level = PG_LEVEL_2M; if (pmd_leaf(*pmd) || !pmd_present(*pmd)) return (pte_t *)pmd; *level = PG_LEVEL_4K; + *nx |= pmd_flags(*pmd) & _PAGE_NX; + *rw &= pmd_flags(*pmd) & _PAGE_RW; return pte_offset_kernel(pmd, address); } /* + * Lookup the page table entry for a virtual address in a specific pgd. + * Return a pointer to the entry and the level of the mapping. + */ +pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, + unsigned int *level) +{ + bool nx, rw; + + return lookup_address_in_pgd_attr(pgd, address, level, &nx, &rw); +} + +/* * Lookup the page table entry for a virtual address. Return a pointer * to the entry and the level of the mapping. * - * Note: We return pud and pmd either when the entry is marked large - * or when the present bit is not set. Otherwise we would return a - * pointer to a nonexisting mapping. + * Note: the function returns p4d, pud or pmd either when the entry is marked + * large or when the present bit is not set. Otherwise it returns NULL. */ pte_t *lookup_address(unsigned long address, unsigned int *level) { @@ -715,13 +795,16 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) EXPORT_SYMBOL_GPL(lookup_address); static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, - unsigned int *level) + unsigned int *level, bool *nx, bool *rw) { - if (cpa->pgd) - return lookup_address_in_pgd(cpa->pgd + pgd_index(address), - address, level); + pgd_t *pgd; - return lookup_address(address, level); + if (!cpa->pgd) + pgd = pgd_offset_k(address); + else + pgd = cpa->pgd + pgd_index(address); + + return lookup_address_in_pgd_attr(pgd, address, level, nx, rw); } /* @@ -806,7 +889,7 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) /* change init_mm */ set_pte_atomic(kpte, pte); #ifdef CONFIG_X86_32 - if (!SHARED_KERNEL_PMD) { + { struct page *page; list_for_each_entry(page, &pgd_list, lru) { @@ -849,12 +932,13 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, pgprot_t old_prot, new_prot, req_prot, chk_prot; pte_t new_pte, *tmp; enum pg_level level; + bool nx, rw; /* * Check for races, another CPU might have split this page * up already: */ - tmp = _lookup_address_cpa(cpa, address, &level); + tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (tmp != kpte) return 1; @@ -965,7 +1049,8 @@ static int __should_split_large_page(pte_t *kpte, unsigned long address, new_prot = static_protections(req_prot, lpaddr, old_pfn, numpages, psize, CPA_DETECT); - new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages); + new_prot = verify_rwx(old_prot, new_prot, lpaddr, old_pfn, numpages, + nx, rw); /* * If there is a conflict, split the large page. @@ -1046,6 +1131,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, pte_t *pbase = (pte_t *)page_address(base); unsigned int i, level; pgprot_t ref_prot; + bool nx, rw; pte_t *tmp; spin_lock(&pgd_lock); @@ -1053,7 +1139,7 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, * Check for races, another CPU might have split this page * up for us already: */ - tmp = _lookup_address_cpa(cpa, address, &level); + tmp = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (tmp != kpte) { spin_unlock(&pgd_lock); return 1; @@ -1082,8 +1168,8 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, lpinc = PMD_SIZE; /* * Clear the PSE flags if the PRESENT flag is not set - * otherwise pmd_present/pmd_huge will return true - * even on a non present pmd. + * otherwise pmd_present() will return true even on a non + * present pmd. */ if (!(pgprot_val(ref_prot) & _PAGE_PRESENT)) pgprot_val(ref_prot) &= ~_PAGE_PSE; @@ -1162,6 +1248,164 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, return 0; } +static int collapse_pmd_page(pmd_t *pmd, unsigned long addr, + struct list_head *pgtables) +{ + pmd_t _pmd, old_pmd; + pte_t *pte, first; + unsigned long pfn; + pgprot_t pgprot; + int i = 0; + + if (!cpu_feature_enabled(X86_FEATURE_PSE)) + return 0; + + addr &= PMD_MASK; + pte = pte_offset_kernel(pmd, addr); + first = *pte; + pfn = pte_pfn(first); + + /* Make sure alignment is suitable */ + if (PFN_PHYS(pfn) & ~PMD_MASK) + return 0; + + /* The page is 4k intentionally */ + if (pte_flags(first) & _PAGE_KERNEL_4K) + return 0; + + /* Check that the rest of PTEs are compatible with the first one */ + for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) { + pte_t entry = *pte; + + if (!pte_present(entry)) + return 0; + if (pte_flags(entry) != pte_flags(first)) + return 0; + if (pte_pfn(entry) != pte_pfn(first) + i) + return 0; + } + + old_pmd = *pmd; + + /* Success: set up a large page */ + pgprot = pgprot_4k_2_large(pte_pgprot(first)); + pgprot_val(pgprot) |= _PAGE_PSE; + _pmd = pfn_pmd(pfn, pgprot); + set_pmd(pmd, _pmd); + + /* Queue the page table to be freed after TLB flush */ + list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables); + + if (IS_ENABLED(CONFIG_X86_32)) { + struct page *page; + + /* Update all PGD tables to use the same large page */ + list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); + pmd_t *pmd = pmd_offset(pud, addr); + /* Something is wrong if entries doesn't match */ + if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd))) + continue; + set_pmd(pmd, _pmd); + } + } + + if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1)) + collapse_page_count(PG_LEVEL_2M); + + return 1; +} + +static int collapse_pud_page(pud_t *pud, unsigned long addr, + struct list_head *pgtables) +{ + unsigned long pfn; + pmd_t *pmd, first; + int i; + + if (!direct_gbpages) + return 0; + + addr &= PUD_MASK; + pmd = pmd_offset(pud, addr); + first = *pmd; + + /* + * To restore PUD page all PMD entries must be large and + * have suitable alignment + */ + pfn = pmd_pfn(first); + if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK)) + return 0; + + /* + * To restore PUD page, all following PMDs must be compatible with the + * first one. + */ + for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) { + pmd_t entry = *pmd; + + if (!pmd_present(entry) || !pmd_leaf(entry)) + return 0; + if (pmd_flags(entry) != pmd_flags(first)) + return 0; + if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE) + return 0; + } + + /* Restore PUD page and queue page table to be freed after TLB flush */ + list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables); + set_pud(pud, pfn_pud(pfn, pmd_pgprot(first))); + + if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1)) + collapse_page_count(PG_LEVEL_1G); + + return 1; +} + +/* + * Collapse PMD and PUD pages in the kernel mapping around the address where + * possible. + * + * Caller must flush TLB and free page tables queued on the list before + * touching the new entries. CPU must not see TLB entries of different size + * with different attributes. + */ +static int collapse_large_pages(unsigned long addr, struct list_head *pgtables) +{ + int collapsed = 0; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + addr &= PMD_MASK; + + spin_lock(&pgd_lock); + pgd = pgd_offset_k(addr); + if (pgd_none(*pgd)) + goto out; + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + goto out; + pud = pud_offset(p4d, addr); + if (!pud_present(*pud) || pud_leaf(*pud)) + goto out; + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd) || pmd_leaf(*pmd)) + goto out; + + collapsed = collapse_pmd_page(pmd, addr, pgtables); + if (collapsed) + collapsed += collapse_pud_page(pud, addr, pgtables); + +out: + spin_unlock(&pgd_lock); + return collapsed; +} + static bool try_to_free_pte_page(pte_t *pte) { int i; @@ -1594,10 +1838,11 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) int do_split, err; unsigned int level; pte_t *kpte, old_pte; + bool nx, rw; address = __cpa_addr(cpa, cpa->curpage); repeat: - kpte = _lookup_address_cpa(cpa, address, &level); + kpte = _lookup_address_cpa(cpa, address, &level, &nx, &rw); if (!kpte) return __cpa_process_fault(cpa, address, primary); @@ -1619,7 +1864,8 @@ repeat: new_prot = static_protections(new_prot, address, pfn, 1, 0, CPA_PROTECT); - new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1); + new_prot = verify_rwx(old_prot, new_prot, address, pfn, 1, + nx, rw); new_prot = pgprot_clear_protnone_bits(new_prot); @@ -2044,6 +2290,7 @@ int set_mce_nospec(unsigned long pfn) pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); return rc; } +EXPORT_SYMBOL_GPL(set_mce_nospec); /* Restore full speculative operation to the pfn. */ int clear_mce_nospec(unsigned long pfn) @@ -2083,7 +2330,8 @@ int set_memory_rox(unsigned long addr, int numpages) if (__supported_pte_mask & _PAGE_NX) clr.pgprot |= _PAGE_NX; - return change_page_attr_clear(&addr, numpages, clr, 0); + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0, + CPA_COLLAPSE, NULL); } int set_memory_rw(unsigned long addr, int numpages) @@ -2110,7 +2358,8 @@ int set_memory_p(unsigned long addr, int numpages) int set_memory_4k(unsigned long addr, int numpages) { - return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + return change_page_attr_set_clr(&addr, numpages, + __pgprot(_PAGE_KERNEL_4K), __pgprot(0), 1, 0, NULL); } @@ -2156,7 +2405,8 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) cpa_flush(&cpa, x86_platform.guest.enc_cache_flush_required()); /* Notify hypervisor that we are about to set/clr encryption attribute. */ - if (!x86_platform.guest.enc_status_change_prepare(addr, numpages, enc)) + ret = x86_platform.guest.enc_status_change_prepare(addr, numpages, enc); + if (ret) goto vmm_fail; ret = __change_page_attr_set_clr(&cpa, 1); @@ -2174,24 +2424,61 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) return ret; /* Notify hypervisor that we have successfully set/clr encryption attribute. */ - if (!x86_platform.guest.enc_status_change_finish(addr, numpages, enc)) + ret = x86_platform.guest.enc_status_change_finish(addr, numpages, enc); + if (ret) goto vmm_fail; return 0; vmm_fail: - WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s.\n", - (void *)addr, numpages, enc ? "private" : "shared"); + WARN_ONCE(1, "CPA VMM failure to convert memory (addr=%p, numpages=%d) to %s: %d\n", + (void *)addr, numpages, enc ? "private" : "shared", ret); - return -EIO; + return ret; +} + +/* + * The lock serializes conversions between private and shared memory. + * + * It is taken for read on conversion. A write lock guarantees that no + * concurrent conversions are in progress. + */ +static DECLARE_RWSEM(mem_enc_lock); + +/* + * Stop new private<->shared conversions. + * + * Taking the exclusive mem_enc_lock waits for in-flight conversions to complete. + * The lock is not released to prevent new conversions from being started. + */ +bool set_memory_enc_stop_conversion(void) +{ + /* + * In a crash scenario, sleep is not allowed. Try to take the lock. + * Failure indicates that there is a race with the conversion. + */ + if (oops_in_progress) + return down_write_trylock(&mem_enc_lock); + + down_write(&mem_enc_lock); + + return true; } static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { - if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) - return __set_memory_enc_pgtable(addr, numpages, enc); + int ret = 0; - return 0; + if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) { + if (!down_read_trylock(&mem_enc_lock)) + return -EBUSY; + + ret = __set_memory_enc_pgtable(addr, numpages, enc); + + up_read(&mem_enc_lock); + } + + return ret; } int set_memory_encrypted(unsigned long addr, int numpages) @@ -2345,7 +2632,7 @@ static int __set_pages_np(struct page *page, int numpages) .pgd = NULL, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY), .flags = CPA_NO_CHECK_ALIAS }; /* @@ -2367,6 +2654,14 @@ int set_direct_map_default_noflush(struct page *page) return __set_pages_p(page, 1); } +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + if (valid) + return __set_pages_p(page, nr); + + return __set_pages_np(page, nr); +} + #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { @@ -2424,7 +2719,7 @@ int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW)), + .mask_clr = __pgprot(~page_flags & (_PAGE_NX|_PAGE_RW|_PAGE_DIRTY)), .flags = CPA_NO_CHECK_ALIAS, }; @@ -2467,7 +2762,7 @@ int __init kernel_unmap_pages_in_pgd(pgd_t *pgd, unsigned long address, .pgd = pgd, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY), .flags = CPA_NO_CHECK_ALIAS, }; |