aboutsummaryrefslogtreecommitdiffstats
path: root/arch/s390/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/s390/mm')
-rw-r--r--arch/s390/mm/Makefile2
-rw-r--r--arch/s390/mm/cmm.c2
-rw-r--r--arch/s390/mm/dump_pagetables.c20
-rw-r--r--arch/s390/mm/extable.c81
-rw-r--r--arch/s390/mm/fault.c110
-rw-r--r--arch/s390/mm/gmap.c226
-rw-r--r--arch/s390/mm/hugetlbpage.c47
-rw-r--r--arch/s390/mm/init.c21
-rw-r--r--arch/s390/mm/kasan_init.c8
-rw-r--r--arch/s390/mm/maccess.c217
-rw-r--r--arch/s390/mm/mmap.c26
-rw-r--r--arch/s390/mm/page-states.c1
-rw-r--r--arch/s390/mm/pageattr.c33
-rw-r--r--arch/s390/mm/pgalloc.c252
-rw-r--r--arch/s390/mm/pgtable.c46
-rw-r--r--arch/s390/mm/vmem.c123
16 files changed, 819 insertions, 396 deletions
diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile
index cd67e94c16aa..57e4f3a24829 100644
--- a/arch/s390/mm/Makefile
+++ b/arch/s390/mm/Makefile
@@ -4,7 +4,7 @@
#
obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o
-obj-y += page-states.o pageattr.o pgtable.o pgalloc.o
+obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o
obj-$(CONFIG_CMM) += cmm.o
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index 2203164b39da..9141ed4c52e9 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -90,7 +90,7 @@ static long cmm_alloc_pages(long nr, long *counter,
} else
free_page((unsigned long) npa);
}
- diag10_range(addr >> PAGE_SHIFT, 1);
+ diag10_range(virt_to_pfn(addr), 1);
pa->pages[pa->index++] = addr;
(*counter)++;
spin_unlock(&cmm_lock);
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index 9f9af5298dd6..9953819d7959 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -8,8 +8,10 @@
#include <linux/kasan.h>
#include <asm/ptdump.h>
#include <asm/kasan.h>
+#include <asm/abs_lowcore.h>
#include <asm/nospec-branch.h>
#include <asm/sections.h>
+#include <asm/maccess.h>
static unsigned long max_addr;
@@ -21,6 +23,8 @@ struct addr_marker {
enum address_markers_idx {
IDENTITY_BEFORE_NR = 0,
IDENTITY_BEFORE_END_NR,
+ AMODE31_START_NR,
+ AMODE31_END_NR,
KERNEL_START_NR,
KERNEL_END_NR,
#ifdef CONFIG_KFENCE
@@ -39,11 +43,17 @@ enum address_markers_idx {
VMALLOC_END_NR,
MODULES_NR,
MODULES_END_NR,
+ ABS_LOWCORE_NR,
+ ABS_LOWCORE_END_NR,
+ MEMCPY_REAL_NR,
+ MEMCPY_REAL_END_NR,
};
static struct addr_marker address_markers[] = {
[IDENTITY_BEFORE_NR] = {0, "Identity Mapping Start"},
[IDENTITY_BEFORE_END_NR] = {(unsigned long)_stext, "Identity Mapping End"},
+ [AMODE31_START_NR] = {0, "Amode31 Area Start"},
+ [AMODE31_END_NR] = {0, "Amode31 Area End"},
[KERNEL_START_NR] = {(unsigned long)_stext, "Kernel Image Start"},
[KERNEL_END_NR] = {(unsigned long)_end, "Kernel Image End"},
#ifdef CONFIG_KFENCE
@@ -62,6 +72,10 @@ static struct addr_marker address_markers[] = {
[VMALLOC_END_NR] = {0, "vmalloc Area End"},
[MODULES_NR] = {0, "Modules Area Start"},
[MODULES_END_NR] = {0, "Modules Area End"},
+ [ABS_LOWCORE_NR] = {0, "Lowcore Area Start"},
+ [ABS_LOWCORE_END_NR] = {0, "Lowcore Area End"},
+ [MEMCPY_REAL_NR] = {0, "Real Memory Copy Area Start"},
+ [MEMCPY_REAL_END_NR] = {0, "Real Memory Copy Area End"},
{ -1, NULL }
};
@@ -276,8 +290,14 @@ static int pt_dump_init(void)
max_addr = (S390_lowcore.kernel_asce & _REGION_ENTRY_TYPE_MASK) >> 2;
max_addr = 1UL << (max_addr * 11 + 31);
address_markers[IDENTITY_AFTER_END_NR].start_address = ident_map_size;
+ address_markers[AMODE31_START_NR].start_address = __samode31;
+ address_markers[AMODE31_END_NR].start_address = __eamode31;
address_markers[MODULES_NR].start_address = MODULES_VADDR;
address_markers[MODULES_END_NR].start_address = MODULES_END;
+ address_markers[ABS_LOWCORE_NR].start_address = __abs_lowcore;
+ address_markers[ABS_LOWCORE_END_NR].start_address = __abs_lowcore + ABS_LOWCORE_MAP_SIZE;
+ address_markers[MEMCPY_REAL_NR].start_address = __memcpy_real_area;
+ address_markers[MEMCPY_REAL_END_NR].start_address = __memcpy_real_area + PAGE_SIZE;
address_markers[VMEMMAP_NR].start_address = (unsigned long) vmemmap;
address_markers[VMEMMAP_END_NR].start_address = (unsigned long)vmemmap + vmemmap_size;
address_markers[VMALLOC_NR].start_address = VMALLOC_START;
diff --git a/arch/s390/mm/extable.c b/arch/s390/mm/extable.c
new file mode 100644
index 000000000000..1e4d2187541a
--- /dev/null
+++ b/arch/s390/mm/extable.c
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bitfield.h>
+#include <linux/extable.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/panic.h>
+#include <asm/asm-extable.h>
+#include <asm/extable.h>
+
+const struct exception_table_entry *s390_search_extables(unsigned long addr)
+{
+ const struct exception_table_entry *fixup;
+ size_t num;
+
+ fixup = search_exception_tables(addr);
+ if (fixup)
+ return fixup;
+ num = __stop_amode31_ex_table - __start_amode31_ex_table;
+ return search_extable(__start_amode31_ex_table, num, addr);
+}
+
+static bool ex_handler_fixup(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+static bool ex_handler_ua_store(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
+
+ regs->gprs[reg_err] = -EFAULT;
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+static bool ex_handler_ua_load_mem(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ unsigned int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
+ unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
+ size_t len = FIELD_GET(EX_DATA_LEN, ex->data);
+
+ regs->gprs[reg_err] = -EFAULT;
+ memset((void *)regs->gprs[reg_addr], 0, len);
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+static bool ex_handler_ua_load_reg(const struct exception_table_entry *ex, struct pt_regs *regs)
+{
+ unsigned int reg_zero = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
+ unsigned int reg_err = FIELD_GET(EX_DATA_REG_ERR, ex->data);
+
+ regs->gprs[reg_err] = -EFAULT;
+ regs->gprs[reg_zero] = 0;
+ regs->psw.addr = extable_fixup(ex);
+ return true;
+}
+
+bool fixup_exception(struct pt_regs *regs)
+{
+ const struct exception_table_entry *ex;
+
+ ex = s390_search_extables(instruction_pointer(regs));
+ if (!ex)
+ return false;
+ switch (ex->type) {
+ case EX_TYPE_FIXUP:
+ return ex_handler_fixup(ex, regs);
+ case EX_TYPE_BPF:
+ return ex_handler_bpf(ex, regs);
+ case EX_TYPE_UA_STORE:
+ return ex_handler_ua_store(ex, regs);
+ case EX_TYPE_UA_LOAD_MEM:
+ return ex_handler_ua_load_mem(ex, regs);
+ case EX_TYPE_UA_LOAD_REG:
+ return ex_handler_ua_load_reg(ex, regs);
+ }
+ panic("invalid exception table entry");
+}
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index d30f5986fa85..9649d9382e0a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -32,6 +32,7 @@
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <linux/kfence.h>
+#include <asm/asm-extable.h>
#include <asm/asm-offsets.h>
#include <asm/diag.h>
#include <asm/gmap.h>
@@ -115,7 +116,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("R1:%016lx ", *table);
if (*table & _REGION_ENTRY_INVALID)
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
fallthrough;
case _ASCE_TYPE_REGION2:
table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
@@ -124,7 +125,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("R2:%016lx ", *table);
if (*table & _REGION_ENTRY_INVALID)
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
fallthrough;
case _ASCE_TYPE_REGION3:
table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
@@ -133,7 +134,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("R3:%016lx ", *table);
if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
goto out;
- table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ table = __va(*table & _REGION_ENTRY_ORIGIN);
fallthrough;
case _ASCE_TYPE_SEGMENT:
table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
@@ -142,7 +143,7 @@ static void dump_pagetable(unsigned long asce, unsigned long address)
pr_cont("S:%016lx ", *table);
if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
goto out;
- table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+ table = __va(*table & _SEGMENT_ENTRY_ORIGIN);
}
table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
if (bad_address(table))
@@ -227,27 +228,10 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}
-const struct exception_table_entry *s390_search_extables(unsigned long addr)
-{
- const struct exception_table_entry *fixup;
-
- fixup = search_extable(__start_amode31_ex_table,
- __stop_amode31_ex_table - __start_amode31_ex_table,
- addr);
- if (!fixup)
- fixup = search_exception_tables(addr);
- return fixup;
-}
-
static noinline void do_no_context(struct pt_regs *regs)
{
- const struct exception_table_entry *fixup;
-
- /* Are we prepared to handle this kernel fault? */
- fixup = s390_search_extables(regs->psw.addr);
- if (fixup && ex_handle(fixup, regs))
+ if (fixup_exception(regs))
return;
-
/*
* Oops. The kernel tried to access some bad page. We'll have to
* terminate things with extreme prejudice.
@@ -284,8 +268,7 @@ static noinline void do_sigbus(struct pt_regs *regs)
(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK));
}
-static noinline void do_fault_error(struct pt_regs *regs, int access,
- vm_fault_t fault)
+static noinline void do_fault_error(struct pt_regs *regs, vm_fault_t fault)
{
int si_code;
@@ -395,7 +378,9 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
flags = FAULT_FLAG_DEFAULT;
if (user_mode(regs))
flags |= FAULT_FLAG_USER;
- if (access == VM_WRITE || is_write)
+ if (is_write)
+ access = VM_WRITE;
+ if (access == VM_WRITE)
flags |= FAULT_FLAG_WRITE;
mmap_read_lock(mm);
@@ -435,8 +420,6 @@ retry:
if (unlikely(!(vma->vm_flags & access)))
goto out_up;
- if (is_vm_hugetlb_page(vma))
- address &= HPAGE_MASK;
/*
* If for any reason at all we couldn't handle the fault,
* make sure we exit gracefully rather than endlessly redo
@@ -449,25 +432,37 @@ retry:
goto out_up;
goto out;
}
+
+ /* The fault is fully completed (including releasing mmap lock) */
+ if (fault & VM_FAULT_COMPLETED) {
+ if (gmap) {
+ mmap_read_lock(mm);
+ goto out_gmap;
+ }
+ fault = 0;
+ goto out;
+ }
+
if (unlikely(fault & VM_FAULT_ERROR))
goto out_up;
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_RETRY) {
- if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
- (flags & FAULT_FLAG_RETRY_NOWAIT)) {
- /* FAULT_FLAG_RETRY_NOWAIT has been set,
- * mmap_lock has not been released */
- current->thread.gmap_pfault = 1;
- fault = VM_FAULT_PFAULT;
- goto out_up;
- }
- flags &= ~FAULT_FLAG_RETRY_NOWAIT;
- flags |= FAULT_FLAG_TRIED;
- mmap_read_lock(mm);
- goto retry;
+ if (fault & VM_FAULT_RETRY) {
+ if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
+ (flags & FAULT_FLAG_RETRY_NOWAIT)) {
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT has been set, mmap_lock has
+ * not been released
+ */
+ current->thread.gmap_pfault = 1;
+ fault = VM_FAULT_PFAULT;
+ goto out_up;
}
+ flags &= ~FAULT_FLAG_RETRY_NOWAIT;
+ flags |= FAULT_FLAG_TRIED;
+ mmap_read_lock(mm);
+ goto retry;
}
+out_gmap:
if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
address = __gmap_link(gmap, current->thread.gmap_addr,
address);
@@ -520,7 +515,7 @@ void do_protection_exception(struct pt_regs *regs)
fault = do_exception(regs, access);
}
if (unlikely(fault))
- do_fault_error(regs, access, fault);
+ do_fault_error(regs, fault);
}
NOKPROBE_SYMBOL(do_protection_exception);
@@ -532,7 +527,7 @@ void do_dat_exception(struct pt_regs *regs)
access = VM_ACCESS_FLAGS;
fault = do_exception(regs, access);
if (unlikely(fault))
- do_fault_error(regs, access, fault);
+ do_fault_error(regs, fault);
}
NOKPROBE_SYMBOL(do_dat_exception);
@@ -770,6 +765,7 @@ void do_secure_storage_access(struct pt_regs *regs)
struct vm_area_struct *vma;
struct mm_struct *mm;
struct page *page;
+ struct gmap *gmap;
int rc;
/*
@@ -799,13 +795,24 @@ void do_secure_storage_access(struct pt_regs *regs)
}
switch (get_fault_type(regs)) {
+ case GMAP_FAULT:
+ mm = current->mm;
+ gmap = (struct gmap *)S390_lowcore.gmap;
+ mmap_read_lock(mm);
+ addr = __gmap_translate(gmap, addr);
+ mmap_read_unlock(mm);
+ if (IS_ERR_VALUE(addr)) {
+ do_fault_error(regs, VM_FAULT_BADMAP);
+ break;
+ }
+ fallthrough;
case USER_FAULT:
mm = current->mm;
mmap_read_lock(mm);
vma = find_vma(mm, addr);
if (!vma) {
mmap_read_unlock(mm);
- do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+ do_fault_error(regs, VM_FAULT_BADMAP);
break;
}
page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET);
@@ -827,9 +834,8 @@ void do_secure_storage_access(struct pt_regs *regs)
if (rc)
BUG();
break;
- case GMAP_FAULT:
default:
- do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+ do_fault_error(regs, VM_FAULT_BADMAP);
WARN_ON_ONCE(1);
}
}
@@ -841,7 +847,7 @@ void do_non_secure_storage_access(struct pt_regs *regs)
struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
if (get_fault_type(regs) != GMAP_FAULT) {
- do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP);
+ do_fault_error(regs, VM_FAULT_BADMAP);
WARN_ON_ONCE(1);
return;
}
@@ -853,6 +859,16 @@ NOKPROBE_SYMBOL(do_non_secure_storage_access);
void do_secure_storage_violation(struct pt_regs *regs)
{
+ unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK;
+ struct gmap *gmap = (struct gmap *)S390_lowcore.gmap;
+
+ /*
+ * If the VM has been rebooted, its address space might still contain
+ * secure pages from the previous boot.
+ * Clear the page so it can be reused.
+ */
+ if (!gmap_destroy_page(gmap, gaddr))
+ return;
/*
* Either KVM messed up the secure guest mapping or the same
* page is mapped into multiple secure guests.
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index dfee0ebb2fac..02d15c8dc92e 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -974,18 +974,18 @@ static int gmap_protect_pmd(struct gmap *gmap, unsigned long gaddr,
return -EAGAIN;
if (prot == PROT_NONE && !pmd_i) {
- pmd_val(new) |= _SEGMENT_ENTRY_INVALID;
+ new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
}
if (prot == PROT_READ && !pmd_p) {
- pmd_val(new) &= ~_SEGMENT_ENTRY_INVALID;
- pmd_val(new) |= _SEGMENT_ENTRY_PROTECT;
+ new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_INVALID));
+ new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_PROTECT));
gmap_pmdp_xchg(gmap, pmdp, new, gaddr);
}
if (bits & GMAP_NOTIFY_MPROT)
- pmd_val(*pmdp) |= _SEGMENT_ENTRY_GMAP_IN;
+ set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
/* Shadow GMAP protection needs split PMDs */
if (bits & GMAP_NOTIFY_SHADOW)
@@ -1151,7 +1151,7 @@ int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
address = pte_val(pte) & PAGE_MASK;
address += gaddr & ~PAGE_MASK;
*val = *(unsigned long *) address;
- pte_val(*ptep) |= _PAGE_YOUNG;
+ set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_YOUNG)));
/* Do *NOT* clear the _PAGE_INVALID bit! */
rc = 0;
}
@@ -1183,6 +1183,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table);
static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
struct gmap_rmap *rmap)
{
+ struct gmap_rmap *temp;
void __rcu **slot;
BUG_ON(!gmap_is_shadow(sg));
@@ -1190,6 +1191,12 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
if (slot) {
rmap->next = radix_tree_deref_slot_protected(slot,
&sg->guest_table_lock);
+ for (temp = rmap->next; temp; temp = temp->next) {
+ if (temp->raddr == rmap->raddr) {
+ kfree(rmap);
+ return;
+ }
+ }
radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap);
} else {
rmap->next = NULL;
@@ -1278,7 +1285,7 @@ static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
{
asm volatile(
- " .insn rrf,0xb98e0000,%0,%1,0,0"
+ " idte %0,0,%1"
: : "a" (asce), "a" (vaddr) : "cc", "memory");
}
@@ -2275,7 +2282,7 @@ EXPORT_SYMBOL_GPL(ptep_notify);
static void pmdp_notify_gmap(struct gmap *gmap, pmd_t *pmdp,
unsigned long gaddr)
{
- pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_IN;
+ set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_IN)));
gmap_call_notifier(gmap, gaddr, gaddr + HPAGE_SIZE - 1);
}
@@ -2294,7 +2301,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
{
gaddr &= HPAGE_MASK;
pmdp_notify_gmap(gmap, pmdp, gaddr);
- pmd_val(new) &= ~_SEGMENT_ENTRY_GMAP_IN;
+ new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_GMAP_IN));
if (MACHINE_HAS_TLB_GUEST)
__pmdp_idte(gaddr, (pmd_t *)pmdp, IDTE_GUEST_ASCE, gmap->asce,
IDTE_GLOBAL);
@@ -2302,7 +2309,7 @@ static void gmap_pmdp_xchg(struct gmap *gmap, pmd_t *pmdp, pmd_t new,
__pmdp_idte(gaddr, (pmd_t *)pmdp, 0, 0, IDTE_GLOBAL);
else
__pmdp_csp(pmdp);
- *pmdp = new;
+ set_pmd(pmdp, new);
}
static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
@@ -2324,7 +2331,7 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr,
_SEGMENT_ENTRY_GMAP_UC));
if (purge)
__pmdp_csp(pmdp);
- pmd_val(*pmdp) = _SEGMENT_ENTRY_EMPTY;
+ set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
}
spin_unlock(&gmap->guest_table_lock);
}
@@ -2447,7 +2454,7 @@ static bool gmap_test_and_clear_dirty_pmd(struct gmap *gmap, pmd_t *pmdp,
return false;
/* Clear UC indication and reset protection */
- pmd_val(*pmdp) &= ~_SEGMENT_ENTRY_GMAP_UC;
+ set_pmd(pmdp, clear_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_GMAP_UC)));
gmap_protect_pmd(gmap, gaddr, pmdp, PROT_READ, 0);
return true;
}
@@ -2508,8 +2515,9 @@ static const struct mm_walk_ops thp_split_walk_ops = {
static inline void thp_split_mm(struct mm_struct *mm)
{
struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
- for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
vma->vm_flags &= ~VM_HUGEPAGE;
vma->vm_flags |= VM_NOHUGEPAGE;
walk_page_vma(vma, &thp_split_walk_ops, NULL);
@@ -2577,8 +2585,9 @@ int gmap_mark_unmergeable(void)
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int ret;
+ VMA_ITERATOR(vmi, mm, 0);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ for_each_vma(vmi, vma) {
ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
MADV_UNMERGEABLE, &vma->vm_flags);
if (ret)
@@ -2601,6 +2610,18 @@ static int __s390_enable_skey_pte(pte_t *pte, unsigned long addr,
return 0;
}
+/*
+ * Give a chance to schedule after setting a key to 256 pages.
+ * We only hold the mm lock, which is a rwsem and the kvm srcu.
+ * Both can sleep.
+ */
+static int __s390_enable_skey_pmd(pmd_t *pmd, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ cond_resched();
+ return 0;
+}
+
static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
unsigned long hmask, unsigned long next,
struct mm_walk *walk)
@@ -2623,12 +2644,14 @@ static int __s390_enable_skey_hugetlb(pte_t *pte, unsigned long addr,
end = start + HPAGE_SIZE - 1;
__storage_key_init_range(start, end);
set_bit(PG_arch_1, &page->flags);
+ cond_resched();
return 0;
}
static const struct mm_walk_ops enable_skey_walk_ops = {
.hugetlb_entry = __s390_enable_skey_hugetlb,
.pte_entry = __s390_enable_skey_pte,
+ .pmd_entry = __s390_enable_skey_pmd,
};
int s390_enable_skey(void)
@@ -2676,41 +2699,168 @@ void s390_reset_cmma(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(s390_reset_cmma);
+#define GATHER_GET_PAGES 32
+
+struct reset_walk_state {
+ unsigned long next;
+ unsigned long count;
+ unsigned long pfns[GATHER_GET_PAGES];
+};
+
+static int s390_gather_pages(pte_t *ptep, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ struct reset_walk_state *p = walk->private;
+ pte_t pte = READ_ONCE(*ptep);
+
+ if (pte_present(pte)) {
+ /* we have a reference from the mapping, take an extra one */
+ get_page(phys_to_page(pte_val(pte)));
+ p->pfns[p->count] = phys_to_pfn(pte_val(pte));
+ p->next = next;
+ p->count++;
+ }
+ return p->count >= GATHER_GET_PAGES;
+}
+
+static const struct mm_walk_ops gather_pages_ops = {
+ .pte_entry = s390_gather_pages,
+};
+
/*
- * make inaccessible pages accessible again
+ * Call the Destroy secure page UVC on each page in the given array of PFNs.
+ * Each page needs to have an extra reference, which will be released here.
*/
-static int __s390_reset_acc(pte_t *ptep, unsigned long addr,
- unsigned long next, struct mm_walk *walk)
+void s390_uv_destroy_pfns(unsigned long count, unsigned long *pfns)
{
- pte_t pte = READ_ONCE(*ptep);
+ unsigned long i;
- /* There is a reference through the mapping */
- if (pte_present(pte))
- WARN_ON_ONCE(uv_destroy_owned_page(pte_val(pte) & PAGE_MASK));
+ for (i = 0; i < count; i++) {
+ /* we always have an extra reference */
+ uv_destroy_owned_page(pfn_to_phys(pfns[i]));
+ /* get rid of the extra reference */
+ put_page(pfn_to_page(pfns[i]));
+ cond_resched();
+ }
+}
+EXPORT_SYMBOL_GPL(s390_uv_destroy_pfns);
+/**
+ * __s390_uv_destroy_range - Call the destroy secure page UVC on each page
+ * in the given range of the given address space.
+ * @mm: the mm to operate on
+ * @start: the start of the range
+ * @end: the end of the range
+ * @interruptible: if not 0, stop when a fatal signal is received
+ *
+ * Walk the given range of the given address space and call the destroy
+ * secure page UVC on each page. Optionally exit early if a fatal signal is
+ * pending.
+ *
+ * Return: 0 on success, -EINTR if the function stopped before completing
+ */
+int __s390_uv_destroy_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, bool interruptible)
+{
+ struct reset_walk_state state = { .next = start };
+ int r = 1;
+
+ while (r > 0) {
+ state.count = 0;
+ mmap_read_lock(mm);
+ r = walk_page_range(mm, state.next, end, &gather_pages_ops, &state);
+ mmap_read_unlock(mm);
+ cond_resched();
+ s390_uv_destroy_pfns(state.count, state.pfns);
+ if (interruptible && fatal_signal_pending(current))
+ return -EINTR;
+ }
return 0;
}
+EXPORT_SYMBOL_GPL(__s390_uv_destroy_range);
-static const struct mm_walk_ops reset_acc_walk_ops = {
- .pte_entry = __s390_reset_acc,
-};
+/**
+ * s390_unlist_old_asce - Remove the topmost level of page tables from the
+ * list of page tables of the gmap.
+ * @gmap: the gmap whose table is to be removed
+ *
+ * On s390x, KVM keeps a list of all pages containing the page tables of the
+ * gmap (the CRST list). This list is used at tear down time to free all
+ * pages that are now not needed anymore.
+ *
+ * This function removes the topmost page of the tree (the one pointed to by
+ * the ASCE) from the CRST list.
+ *
+ * This means that it will not be freed when the VM is torn down, and needs
+ * to be handled separately by the caller, unless a leak is actually
+ * intended. Notice that this function will only remove the page from the
+ * list, the page will still be used as a top level page table (and ASCE).
+ */
+void s390_unlist_old_asce(struct gmap *gmap)
+{
+ struct page *old;
-#include <linux/sched/mm.h>
-void s390_reset_acc(struct mm_struct *mm)
+ old = virt_to_page(gmap->table);
+ spin_lock(&gmap->guest_table_lock);
+ list_del(&old->lru);
+ /*
+ * Sometimes the topmost page might need to be "removed" multiple
+ * times, for example if the VM is rebooted into secure mode several
+ * times concurrently, or if s390_replace_asce fails after calling
+ * s390_remove_old_asce and is attempted again later. In that case
+ * the old asce has been removed from the list, and therefore it
+ * will not be freed when the VM terminates, but the ASCE is still
+ * in use and still pointed to.
+ * A subsequent call to replace_asce will follow the pointer and try
+ * to remove the same page from the list again.
+ * Therefore it's necessary that the page of the ASCE has valid
+ * pointers, so list_del can work (and do nothing) without
+ * dereferencing stale or invalid pointers.
+ */
+ INIT_LIST_HEAD(&old->lru);
+ spin_unlock(&gmap->guest_table_lock);
+}
+EXPORT_SYMBOL_GPL(s390_unlist_old_asce);
+
+/**
+ * s390_replace_asce - Try to replace the current ASCE of a gmap with a copy
+ * @gmap: the gmap whose ASCE needs to be replaced
+ *
+ * If the allocation of the new top level page table fails, the ASCE is not
+ * replaced.
+ * In any case, the old ASCE is always removed from the gmap CRST list.
+ * Therefore the caller has to make sure to save a pointer to it
+ * beforehand, unless a leak is actually intended.
+ */
+int s390_replace_asce(struct gmap *gmap)
{
- if (!mm_is_protected(mm))
- return;
+ unsigned long asce;
+ struct page *page;
+ void *table;
+
+ s390_unlist_old_asce(gmap);
+
+ page = alloc_pages(GFP_KERNEL_ACCOUNT, CRST_ALLOC_ORDER);
+ if (!page)
+ return -ENOMEM;
+ table = page_to_virt(page);
+ memcpy(table, gmap->table, 1UL << (CRST_ALLOC_ORDER + PAGE_SHIFT));
+
/*
- * we might be called during
- * reset: we walk the pages and clear
- * close of all kvm file descriptors: we walk the pages and clear
- * exit of process on fd closure: vma already gone, do nothing
+ * The caller has to deal with the old ASCE, but here we make sure
+ * the new one is properly added to the CRST list, so that
+ * it will be freed when the VM is torn down.
*/
- if (!mmget_not_zero(mm))
- return;
- mmap_read_lock(mm);
- walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL);
- mmap_read_unlock(mm);
- mmput(mm);
+ spin_lock(&gmap->guest_table_lock);
+ list_add(&page->lru, &gmap->crst_list);
+ spin_unlock(&gmap->guest_table_lock);
+
+ /* Set new table origin while preserving existing ASCE control bits */
+ asce = (gmap->asce & ~_ASCE_ORIGIN) | __pa(table);
+ WRITE_ONCE(gmap->asce, asce);
+ WRITE_ONCE(gmap->mm->context.gmap_asce, asce);
+ WRITE_ONCE(gmap->table, table);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(s390_reset_acc);
+EXPORT_SYMBOL_GPL(s390_replace_asce);
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index da36d13ffc16..c299a18273ff 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -9,6 +9,7 @@
#define KMSG_COMPONENT "hugetlb"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+#include <asm/pgalloc.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
@@ -72,8 +73,8 @@ static inline unsigned long __pte_to_rste(pte_t pte)
static inline pte_t __rste_to_pte(unsigned long rste)
{
+ unsigned long pteval;
int present;
- pte_t pte;
if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
present = pud_present(__pud(rste));
@@ -101,29 +102,21 @@ static inline pte_t __rste_to_pte(unsigned long rste)
* u unused, l large
*/
if (present) {
- pte_val(pte) = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
- pte_val(pte) |= _PAGE_LARGE | _PAGE_PRESENT;
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_READ,
- _PAGE_READ);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE,
- _PAGE_WRITE);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID,
- _PAGE_INVALID);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT,
- _PAGE_PROTECT);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY,
- _PAGE_DIRTY);
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG,
- _PAGE_YOUNG);
+ pteval = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
+ pteval |= _PAGE_LARGE | _PAGE_PRESENT;
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_READ, _PAGE_READ);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE, _PAGE_WRITE);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID, _PAGE_INVALID);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT, _PAGE_PROTECT);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY, _PAGE_DIRTY);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG, _PAGE_YOUNG);
#ifdef CONFIG_MEM_SOFT_DIRTY
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY,
- _PAGE_SOFT_DIRTY);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY);
#endif
- pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC,
- _PAGE_NOEXEC);
+ pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC);
} else
- pte_val(pte) = _PAGE_INVALID;
- return pte;
+ pteval = _PAGE_INVALID;
+ return __pte(pteval);
}
static void clear_huge_pte_skeys(struct mm_struct *mm, unsigned long rste)
@@ -167,7 +160,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
rste |= _SEGMENT_ENTRY_LARGE;
clear_huge_pte_skeys(mm, rste);
- pte_val(*ptep) = rste;
+ set_pte(ptep, __pte(rste));
}
pte_t huge_ptep_get(pte_t *ptep)
@@ -244,16 +237,6 @@ int pud_huge(pud_t pud)
return pud_large(pud);
}
-struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int flags)
-{
- if (flags & FOLL_GET)
- return NULL;
-
- return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
-}
-
bool __init arch_hugetlb_valid_size(unsigned long size)
{
if (MACHINE_HAS_EDAT1 && size == PMD_SIZE)
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 8c6f258a6183..97d66a3e60fb 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -37,7 +37,7 @@
#include <asm/kfence.h>
#include <asm/ptdump.h>
#include <asm/dma.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
@@ -47,6 +47,7 @@
#include <asm/kasan.h>
#include <asm/dma-mapping.h>
#include <asm/uv.h>
+#include <linux/virtio_anchor.h>
#include <linux/virtio_config.h>
pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(".bss..swapper_pg_dir");
@@ -168,25 +169,16 @@ bool force_dma_unencrypted(struct device *dev)
return is_prot_virt_guest();
}
-#ifdef CONFIG_ARCH_HAS_RESTRICTED_VIRTIO_MEMORY_ACCESS
-
-int arch_has_restricted_virtio_memory_access(void)
-{
- return is_prot_virt_guest();
-}
-EXPORT_SYMBOL(arch_has_restricted_virtio_memory_access);
-
-#endif
-
/* protected virtualization */
static void pv_init(void)
{
if (!is_prot_virt_guest())
return;
+ virtio_set_mem_acc_cb(virtio_require_restricted_mem_acc);
+
/* make sure bounce buffers are shared */
- swiotlb_force = SWIOTLB_FORCE;
- swiotlb_init(1);
+ swiotlb_init(true, SWIOTLB_FORCE | SWIOTLB_VERBOSE);
swiotlb_update_mem_attributes();
}
@@ -215,6 +207,9 @@ void free_initmem(void)
__set_memory((unsigned long)_sinittext,
(unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
SET_MEMORY_RW | SET_MEMORY_NX);
+ free_reserved_area(sclp_early_sccb,
+ sclp_early_sccb + EXT_SCCB_READ_SCP,
+ POISON_FREE_INITMEM, "unused early sccb");
free_initmem_default(POISON_FREE_INITMEM);
}
diff --git a/arch/s390/mm/kasan_init.c b/arch/s390/mm/kasan_init.c
index 483b9dbe0970..9f988d4582ed 100644
--- a/arch/s390/mm/kasan_init.c
+++ b/arch/s390/mm/kasan_init.c
@@ -175,7 +175,7 @@ static void __init kasan_early_pgtable_populate(unsigned long address,
page = kasan_early_alloc_segment();
memset(page, 0, _SEGMENT_SIZE);
}
- pmd_val(*pm_dir) = __pa(page) | sgt_prot;
+ set_pmd(pm_dir, __pmd(__pa(page) | sgt_prot));
address = (address + PMD_SIZE) & PMD_MASK;
continue;
}
@@ -194,16 +194,16 @@ static void __init kasan_early_pgtable_populate(unsigned long address,
switch (mode) {
case POPULATE_ONE2ONE:
page = (void *)address;
- pte_val(*pt_dir) = __pa(page) | pgt_prot;
+ set_pte(pt_dir, __pte(__pa(page) | pgt_prot));
break;
case POPULATE_MAP:
page = kasan_early_alloc_pages(0);
memset(page, 0, PAGE_SIZE);
- pte_val(*pt_dir) = __pa(page) | pgt_prot;
+ set_pte(pt_dir, __pte(__pa(page) | pgt_prot));
break;
case POPULATE_ZERO_SHADOW:
page = kasan_early_shadow_page;
- pte_val(*pt_dir) = __pa(page) | pgt_prot_zero;
+ set_pte(pt_dir, __pte(__pa(page) | pgt_prot_zero));
break;
case POPULATE_SHALLOW:
/* should never happen */
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index 9663ce3625bc..1571cdcb0c50 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -4,8 +4,6 @@
*
* Copyright IBM Corp. 2009, 2015
*
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>,
- *
*/
#include <linux/uaccess.h>
@@ -14,9 +12,17 @@
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/cpu.h>
+#include <linux/uio.h>
+#include <asm/asm-extable.h>
#include <asm/ctl_reg.h>
#include <asm/io.h>
+#include <asm/abs_lowcore.h>
#include <asm/stacktrace.h>
+#include <asm/maccess.h>
+
+unsigned long __bootdata_preserved(__memcpy_real_area);
+static __ro_after_init pte_t *memcpy_real_ptep;
+static DEFINE_MUTEX(memcpy_real_mutex);
static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size)
{
@@ -77,144 +83,72 @@ notrace void *s390_kernel_write(void *dst, const void *src, size_t size)
return dst;
}
-static int __no_sanitize_address __memcpy_real(void *dest, void *src, size_t count)
-{
- union register_pair _dst, _src;
- int rc = -EFAULT;
-
- _dst.even = (unsigned long) dest;
- _dst.odd = (unsigned long) count;
- _src.even = (unsigned long) src;
- _src.odd = (unsigned long) count;
- asm volatile (
- "0: mvcle %[dst],%[src],0\n"
- "1: jo 0b\n"
- " lhi %[rc],0\n"
- "2:\n"
- EX_TABLE(1b,2b)
- : [rc] "+&d" (rc), [dst] "+&d" (_dst.pair), [src] "+&d" (_src.pair)
- : : "cc", "memory");
- return rc;
-}
-
-static unsigned long __no_sanitize_address _memcpy_real(unsigned long dest,
- unsigned long src,
- unsigned long count)
+void __init memcpy_real_init(void)
{
- int irqs_disabled, rc;
- unsigned long flags;
-
- if (!count)
- return 0;
- flags = arch_local_irq_save();
- irqs_disabled = arch_irqs_disabled_flags(flags);
- if (!irqs_disabled)
- trace_hardirqs_off();
- __arch_local_irq_stnsm(0xf8); // disable DAT
- rc = __memcpy_real((void *) dest, (void *) src, (size_t) count);
- if (flags & PSW_MASK_DAT)
- __arch_local_irq_stosm(0x04); // enable DAT
- if (!irqs_disabled)
- trace_hardirqs_on();
- __arch_local_irq_ssm(flags);
- return rc;
+ memcpy_real_ptep = vmem_get_alloc_pte(__memcpy_real_area, true);
+ if (!memcpy_real_ptep)
+ panic("Couldn't setup memcpy real area");
}
-/*
- * Copy memory in real mode (kernel to kernel)
- */
-int memcpy_real(void *dest, void *src, size_t count)
-{
- unsigned long _dest = (unsigned long)dest;
- unsigned long _src = (unsigned long)src;
- unsigned long _count = (unsigned long)count;
- int rc;
-
- if (S390_lowcore.nodat_stack != 0) {
- preempt_disable();
- rc = call_on_stack(3, S390_lowcore.nodat_stack,
- unsigned long, _memcpy_real,
- unsigned long, _dest,
- unsigned long, _src,
- unsigned long, _count);
- preempt_enable();
- return rc;
- }
- /*
- * This is a really early memcpy_real call, the stacks are
- * not set up yet. Just call _memcpy_real on the early boot
- * stack
- */
- return _memcpy_real(_dest, _src, _count);
-}
-
-/*
- * Copy memory in absolute mode (kernel to kernel)
- */
-void memcpy_absolute(void *dest, void *src, size_t count)
+size_t memcpy_real_iter(struct iov_iter *iter, unsigned long src, size_t count)
{
- unsigned long cr0, flags, prefix;
-
- flags = arch_local_irq_save();
- __ctl_store(cr0, 0, 0);
- __ctl_clear_bit(0, 28); /* disable lowcore protection */
- prefix = store_prefix();
- if (prefix) {
- local_mcck_disable();
- set_prefix(0);
- memcpy(dest, src, count);
- set_prefix(prefix);
- local_mcck_enable();
- } else {
- memcpy(dest, src, count);
+ size_t len, copied, res = 0;
+ unsigned long phys, offset;
+ void *chunk;
+ pte_t pte;
+
+ while (count) {
+ phys = src & PAGE_MASK;
+ offset = src & ~PAGE_MASK;
+ chunk = (void *)(__memcpy_real_area + offset);
+ len = min(count, PAGE_SIZE - offset);
+ pte = mk_pte_phys(phys, PAGE_KERNEL_RO);
+
+ mutex_lock(&memcpy_real_mutex);
+ if (pte_val(pte) != pte_val(*memcpy_real_ptep)) {
+ __ptep_ipte(__memcpy_real_area, memcpy_real_ptep, 0, 0, IPTE_GLOBAL);
+ set_pte(memcpy_real_ptep, pte);
+ }
+ copied = copy_to_iter(chunk, len, iter);
+ mutex_unlock(&memcpy_real_mutex);
+
+ count -= copied;
+ src += copied;
+ res += copied;
+ if (copied < len)
+ break;
}
- __ctl_load(cr0, 0, 0);
- arch_local_irq_restore(flags);
+ return res;
}
-/*
- * Copy memory from kernel (real) to user (virtual)
- */
-int copy_to_user_real(void __user *dest, void *src, unsigned long count)
+int memcpy_real(void *dest, unsigned long src, size_t count)
{
- int offs = 0, size, rc;
- char *buf;
-
- buf = (char *) __get_free_page(GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
- rc = -EFAULT;
- while (offs < count) {
- size = min(PAGE_SIZE, count - offs);
- if (memcpy_real(buf, src + offs, size))
- goto out;
- if (copy_to_user(dest + offs, buf, size))
- goto out;
- offs += size;
- }
- rc = 0;
-out:
- free_page((unsigned long) buf);
- return rc;
+ struct iov_iter iter;
+ struct kvec kvec;
+
+ kvec.iov_base = dest;
+ kvec.iov_len = count;
+ iov_iter_kvec(&iter, WRITE, &kvec, 1, count);
+ if (memcpy_real_iter(&iter, src, count) < count)
+ return -EFAULT;
+ return 0;
}
/*
- * Check if physical address is within prefix or zero page
+ * Find CPU that owns swapped prefix page
*/
-static int is_swapped(unsigned long addr)
+static int get_swapped_owner(phys_addr_t addr)
{
- unsigned long lc;
+ phys_addr_t lc;
int cpu;
- if (addr < sizeof(struct lowcore))
- return 1;
for_each_online_cpu(cpu) {
- lc = (unsigned long) lowcore_ptr[cpu];
+ lc = virt_to_phys(lowcore_ptr[cpu]);
if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc)
continue;
- return 1;
+ return cpu;
}
- return 0;
+ return -1;
}
/*
@@ -225,18 +159,37 @@ static int is_swapped(unsigned long addr)
*/
void *xlate_dev_mem_ptr(phys_addr_t addr)
{
- void *bounce = (void *) addr;
+ void *ptr = phys_to_virt(addr);
+ void *bounce = ptr;
+ struct lowcore *abs_lc;
+ unsigned long flags;
unsigned long size;
+ int this_cpu, cpu;
cpus_read_lock();
- preempt_disable();
- if (is_swapped(addr)) {
- size = PAGE_SIZE - (addr & ~PAGE_MASK);
- bounce = (void *) __get_free_page(GFP_ATOMIC);
- if (bounce)
- memcpy_absolute(bounce, (void *) addr, size);
+ this_cpu = get_cpu();
+ if (addr >= sizeof(struct lowcore)) {
+ cpu = get_swapped_owner(addr);
+ if (cpu < 0)
+ goto out;
}
- preempt_enable();
+ bounce = (void *)__get_free_page(GFP_ATOMIC);
+ if (!bounce)
+ goto out;
+ size = PAGE_SIZE - (addr & ~PAGE_MASK);
+ if (addr < sizeof(struct lowcore)) {
+ abs_lc = get_abs_lowcore(&flags);
+ ptr = (void *)abs_lc + addr;
+ memcpy(bounce, ptr, size);
+ put_abs_lowcore(abs_lc, flags);
+ } else if (cpu == this_cpu) {
+ ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu]));
+ memcpy(bounce, ptr, size);
+ } else {
+ memcpy(bounce, ptr, size);
+ }
+out:
+ put_cpu();
cpus_read_unlock();
return bounce;
}
@@ -244,8 +197,8 @@ void *xlate_dev_mem_ptr(phys_addr_t addr)
/*
* Free converted buffer for /dev/mem access (if necessary)
*/
-void unxlate_dev_mem_ptr(phys_addr_t addr, void *buf)
+void unxlate_dev_mem_ptr(phys_addr_t addr, void *ptr)
{
- if ((void *) addr != buf)
- free_page((unsigned long) buf);
+ if (addr != virt_to_phys(ptr))
+ free_page((unsigned long)ptr);
}
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index e54f928503c5..3327c47bc181 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -37,7 +37,7 @@ static inline int mmap_is_legacy(struct rlimit *rlim_stack)
unsigned long arch_mmap_rnd(void)
{
- return (get_random_int() & MMAP_RND_MASK) << PAGE_SHIFT;
+ return (get_random_u32() & MMAP_RND_MASK) << PAGE_SHIFT;
}
static unsigned long mmap_base_legacy(unsigned long rnd)
@@ -58,9 +58,9 @@ static inline unsigned long mmap_base(unsigned long rnd,
/*
* Top of mmap area (just below the process stack).
- * Leave at least a ~32 MB hole.
+ * Leave at least a ~128 MB hole.
*/
- gap_min = 32 * 1024 * 1024UL;
+ gap_min = SZ_128M;
gap_max = (STACK_TOP / 6) * 5;
if (gap < gap_min)
@@ -188,3 +188,23 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
mm->get_unmapped_area = arch_get_unmapped_area_topdown;
}
}
+
+static const pgprot_t protection_map[16] = {
+ [VM_NONE] = PAGE_NONE,
+ [VM_READ] = PAGE_RO,
+ [VM_WRITE] = PAGE_RO,
+ [VM_WRITE | VM_READ] = PAGE_RO,
+ [VM_EXEC] = PAGE_RX,
+ [VM_EXEC | VM_READ] = PAGE_RX,
+ [VM_EXEC | VM_WRITE] = PAGE_RX,
+ [VM_EXEC | VM_WRITE | VM_READ] = PAGE_RX,
+ [VM_SHARED] = PAGE_NONE,
+ [VM_SHARED | VM_READ] = PAGE_RO,
+ [VM_SHARED | VM_WRITE] = PAGE_RW,
+ [VM_SHARED | VM_WRITE | VM_READ] = PAGE_RW,
+ [VM_SHARED | VM_EXEC] = PAGE_RX,
+ [VM_SHARED | VM_EXEC | VM_READ] = PAGE_RX,
+ [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_RWX,
+ [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_RWX
+};
+DECLARE_VM_GET_PAGE_PROT
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
index 18a6381097a9..d5ea09d78938 100644
--- a/arch/s390/mm/page-states.c
+++ b/arch/s390/mm/page-states.c
@@ -14,6 +14,7 @@
#include <linux/memblock.h>
#include <linux/gfp.h>
#include <linux/init.h>
+#include <asm/asm-extable.h>
#include <asm/facility.h>
#include <asm/page-states.h>
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index 654019181a37..85195c18b2e8 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -98,9 +98,9 @@ static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
else if (flags & SET_MEMORY_RW)
new = pte_mkwrite(pte_mkdirty(new));
if (flags & SET_MEMORY_NX)
- pte_val(new) |= _PAGE_NOEXEC;
+ new = set_pte_bit(new, __pgprot(_PAGE_NOEXEC));
else if (flags & SET_MEMORY_X)
- pte_val(new) &= ~_PAGE_NOEXEC;
+ new = clear_pte_bit(new, __pgprot(_PAGE_NOEXEC));
pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE);
ptep++;
addr += PAGE_SIZE;
@@ -127,11 +127,11 @@ static int split_pmd_page(pmd_t *pmdp, unsigned long addr)
prot &= ~_PAGE_NOEXEC;
ptep = pt_dir;
for (i = 0; i < PTRS_PER_PTE; i++) {
- pte_val(*ptep) = pte_addr | prot;
+ set_pte(ptep, __pte(pte_addr | prot));
pte_addr += PAGE_SIZE;
ptep++;
}
- pmd_val(new) = __pa(pt_dir) | _SEGMENT_ENTRY;
+ new = __pmd(__pa(pt_dir) | _SEGMENT_ENTRY);
pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE);
update_page_count(PG_DIRECT_MAP_1M, -1);
@@ -148,9 +148,9 @@ static void modify_pmd_page(pmd_t *pmdp, unsigned long addr,
else if (flags & SET_MEMORY_RW)
new = pmd_mkwrite(pmd_mkdirty(new));
if (flags & SET_MEMORY_NX)
- pmd_val(new) |= _SEGMENT_ENTRY_NOEXEC;
+ new = set_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC));
else if (flags & SET_MEMORY_X)
- pmd_val(new) &= ~_SEGMENT_ENTRY_NOEXEC;
+ new = clear_pmd_bit(new, __pgprot(_SEGMENT_ENTRY_NOEXEC));
pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
}
@@ -208,11 +208,11 @@ static int split_pud_page(pud_t *pudp, unsigned long addr)
prot &= ~_SEGMENT_ENTRY_NOEXEC;
pmdp = pm_dir;
for (i = 0; i < PTRS_PER_PMD; i++) {
- pmd_val(*pmdp) = pmd_addr | prot;
+ set_pmd(pmdp, __pmd(pmd_addr | prot));
pmd_addr += PMD_SIZE;
pmdp++;
}
- pud_val(new) = __pa(pm_dir) | _REGION3_ENTRY;
+ new = __pud(__pa(pm_dir) | _REGION3_ENTRY);
pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD);
update_page_count(PG_DIRECT_MAP_2G, -1);
@@ -229,9 +229,9 @@ static void modify_pud_page(pud_t *pudp, unsigned long addr,
else if (flags & SET_MEMORY_RW)
new = pud_mkwrite(pud_mkdirty(new));
if (flags & SET_MEMORY_NX)
- pud_val(new) |= _REGION_ENTRY_NOEXEC;
+ new = set_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC));
else if (flags & SET_MEMORY_X)
- pud_val(new) &= ~_REGION_ENTRY_NOEXEC;
+ new = clear_pud_bit(new, __pgprot(_REGION_ENTRY_NOEXEC));
pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
}
@@ -347,23 +347,24 @@ static void ipte_range(pte_t *pte, unsigned long address, int nr)
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
unsigned long address;
+ pte_t *ptep, pte;
int nr, i, j;
- pte_t *pte;
for (i = 0; i < numpages;) {
address = (unsigned long)page_to_virt(page + i);
- pte = virt_to_kpte(address);
- nr = (unsigned long)pte >> ilog2(sizeof(long));
+ ptep = virt_to_kpte(address);
+ nr = (unsigned long)ptep >> ilog2(sizeof(long));
nr = PTRS_PER_PTE - (nr & (PTRS_PER_PTE - 1));
nr = min(numpages - i, nr);
if (enable) {
for (j = 0; j < nr; j++) {
- pte_val(*pte) &= ~_PAGE_INVALID;
+ pte = clear_pte_bit(*ptep, __pgprot(_PAGE_INVALID));
+ set_pte(ptep, pte);
address += PAGE_SIZE;
- pte++;
+ ptep++;
}
} else {
- ipte_range(pte, address, nr);
+ ipte_range(ptep, address, nr);
}
i += nr;
}
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 781965f7210e..2de48b2c1b04 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -53,17 +53,17 @@ __initcall(page_table_register_sysctl);
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
- struct page *page = alloc_pages(GFP_KERNEL, 2);
+ struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
if (!page)
return NULL;
- arch_set_page_dat(page, 2);
+ arch_set_page_dat(page, CRST_ALLOC_ORDER);
return (unsigned long *) page_to_virt(page);
}
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
- free_pages((unsigned long) table, 2);
+ free_pages((unsigned long)table, CRST_ALLOC_ORDER);
}
static void __crst_table_upgrade(void *arg)
@@ -176,7 +176,75 @@ void page_table_free_pgste(struct page *page)
#endif /* CONFIG_PGSTE */
/*
- * page table entry allocation/free routines.
+ * A 2KB-pgtable is either upper or lower half of a normal page.
+ * The second half of the page may be unused or used as another
+ * 2KB-pgtable.
+ *
+ * Whenever possible the parent page for a new 2KB-pgtable is picked
+ * from the list of partially allocated pages mm_context_t::pgtable_list.
+ * In case the list is empty a new parent page is allocated and added to
+ * the list.
+ *
+ * When a parent page gets fully allocated it contains 2KB-pgtables in both
+ * upper and lower halves and is removed from mm_context_t::pgtable_list.
+ *
+ * When 2KB-pgtable is freed from to fully allocated parent page that
+ * page turns partially allocated and added to mm_context_t::pgtable_list.
+ *
+ * If 2KB-pgtable is freed from the partially allocated parent page that
+ * page turns unused and gets removed from mm_context_t::pgtable_list.
+ * Furthermore, the unused parent page is released.
+ *
+ * As follows from the above, no unallocated or fully allocated parent
+ * pages are contained in mm_context_t::pgtable_list.
+ *
+ * The upper byte (bits 24-31) of the parent page _refcount is used
+ * for tracking contained 2KB-pgtables and has the following format:
+ *
+ * PP AA
+ * 01234567 upper byte (bits 24-31) of struct page::_refcount
+ * || ||
+ * || |+--- upper 2KB-pgtable is allocated
+ * || +---- lower 2KB-pgtable is allocated
+ * |+------- upper 2KB-pgtable is pending for removal
+ * +-------- lower 2KB-pgtable is pending for removal
+ *
+ * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
+ * using _refcount is possible).
+ *
+ * When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
+ * The parent page is either:
+ * - added to mm_context_t::pgtable_list in case the second half of the
+ * parent page is still unallocated;
+ * - removed from mm_context_t::pgtable_list in case both hales of the
+ * parent page are allocated;
+ * These operations are protected with mm_context_t::lock.
+ *
+ * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0
+ * and the corresponding PP bit is set to 1 in a single atomic operation.
+ * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
+ * exclusive and may never be both set to 1!
+ * The parent page is either:
+ * - added to mm_context_t::pgtable_list in case the second half of the
+ * parent page is still allocated;
+ * - removed from mm_context_t::pgtable_list in case the second half of
+ * the parent page is unallocated;
+ * These operations are protected with mm_context_t::lock.
+ *
+ * It is important to understand that mm_context_t::lock only protects
+ * mm_context_t::pgtable_list and AA bits, but not the parent page itself
+ * and PP bits.
+ *
+ * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
+ * while both AA bits and the second PP bit are already unset. Then the
+ * parent page does not contain any 2KB-pgtable fragment anymore, and it has
+ * also been removed from mm_context_t::pgtable_list. It is safe to release
+ * the page therefore.
+ *
+ * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
+ * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
+ * while the PP bits are never used, nor such a page is added to or removed
+ * from mm_context_t::pgtable_list.
*/
unsigned long *page_table_alloc(struct mm_struct *mm)
{
@@ -192,14 +260,23 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
page = list_first_entry(&mm->context.pgtable_list,
struct page, lru);
mask = atomic_read(&page->_refcount) >> 24;
- mask = (mask | (mask >> 4)) & 3;
- if (mask != 3) {
+ /*
+ * The pending removal bits must also be checked.
+ * Failure to do so might lead to an impossible
+ * value of (i.e 0x13 or 0x23) written to _refcount.
+ * Such values violate the assumption that pending and
+ * allocation bits are mutually exclusive, and the rest
+ * of the code unrails as result. That could lead to
+ * a whole bunch of races and corruptions.
+ */
+ mask = (mask | (mask >> 4)) & 0x03U;
+ if (mask != 0x03U) {
table = (unsigned long *) page_to_virt(page);
bit = mask & 1; /* =1 -> second 2K */
if (bit)
table += PTRS_PER_PTE;
atomic_xor_bits(&page->_refcount,
- 1U << (bit + 24));
+ 0x01U << (bit + 24));
list_del(&page->lru);
}
}
@@ -220,12 +297,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
table = (unsigned long *) page_to_virt(page);
if (mm_alloc_pgste(mm)) {
/* Return 4K page table with PGSTEs */
- atomic_xor_bits(&page->_refcount, 3 << 24);
+ atomic_xor_bits(&page->_refcount, 0x03U << 24);
memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
} else {
/* Return the first 2K fragment of the page */
- atomic_xor_bits(&page->_refcount, 1 << 24);
+ atomic_xor_bits(&page->_refcount, 0x01U << 24);
memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
spin_lock_bh(&mm->context.lock);
list_add(&page->lru, &mm->context.pgtable_list);
@@ -234,29 +311,53 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
return table;
}
+static void page_table_release_check(struct page *page, void *table,
+ unsigned int half, unsigned int mask)
+{
+ char msg[128];
+
+ if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask)
+ return;
+ snprintf(msg, sizeof(msg),
+ "Invalid pgtable %p release half 0x%02x mask 0x%02x",
+ table, half, mask);
+ dump_page(page, msg);
+}
+
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
+ unsigned int mask, bit, half;
struct page *page;
- unsigned int bit, mask;
page = virt_to_page(table);
if (!mm_alloc_pgste(mm)) {
/* Free 2K page table fragment of a 4K page */
bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
spin_lock_bh(&mm->context.lock);
- mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
+ /*
+ * Mark the page for delayed release. The actual release
+ * will happen outside of the critical section from this
+ * function or from __tlb_remove_table()
+ */
+ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
mask >>= 24;
- if (mask & 3)
+ if (mask & 0x03U)
list_add(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
spin_unlock_bh(&mm->context.lock);
- if (mask != 0)
+ mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
+ mask >>= 24;
+ if (mask != 0x00U)
return;
+ half = 0x01U << bit;
} else {
- atomic_xor_bits(&page->_refcount, 3U << 24);
+ half = 0x03U;
+ mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+ mask >>= 24;
}
+ page_table_release_check(page, table, half, mask);
pgtable_pte_page_dtor(page);
__free_page(page);
}
@@ -272,47 +373,54 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
page = virt_to_page(table);
if (mm_alloc_pgste(mm)) {
gmap_unlink(mm, table, vmaddr);
- table = (unsigned long *) ((unsigned long)table | 3);
+ table = (unsigned long *) ((unsigned long)table | 0x03U);
tlb_remove_table(tlb, table);
return;
}
bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
spin_lock_bh(&mm->context.lock);
+ /*
+ * Mark the page for delayed release. The actual release will happen
+ * outside of the critical section from __tlb_remove_table() or from
+ * page_table_free()
+ */
mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
mask >>= 24;
- if (mask & 3)
+ if (mask & 0x03U)
list_add_tail(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
spin_unlock_bh(&mm->context.lock);
- table = (unsigned long *) ((unsigned long) table | (1U << bit));
+ table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
tlb_remove_table(tlb, table);
}
void __tlb_remove_table(void *_table)
{
- unsigned int mask = (unsigned long) _table & 3;
+ unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
void *table = (void *)((unsigned long) _table ^ mask);
struct page *page = virt_to_page(table);
- switch (mask) {
- case 0: /* pmd, pud, or p4d */
- free_pages((unsigned long) table, 2);
- break;
- case 1: /* lower 2K of a 4K page table */
- case 2: /* higher 2K of a 4K page table */
+ switch (half) {
+ case 0x00U: /* pmd, pud, or p4d */
+ free_pages((unsigned long)table, CRST_ALLOC_ORDER);
+ return;
+ case 0x01U: /* lower 2K of a 4K page table */
+ case 0x02U: /* higher 2K of a 4K page table */
mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
mask >>= 24;
- if (mask != 0)
- break;
- fallthrough;
- case 3: /* 4K page table with pgstes */
- if (mask & 3)
- atomic_xor_bits(&page->_refcount, 3 << 24);
- pgtable_pte_page_dtor(page);
- __free_page(page);
+ if (mask != 0x00U)
+ return;
+ break;
+ case 0x03U: /* 4K page table with pgstes */
+ mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+ mask >>= 24;
break;
}
+
+ page_table_release_check(page, table, half, mask);
+ pgtable_pte_page_dtor(page);
+ __free_page(page);
}
/*
@@ -322,34 +430,34 @@ void __tlb_remove_table(void *_table)
static struct kmem_cache *base_pgt_cache;
-static unsigned long base_pgt_alloc(void)
+static unsigned long *base_pgt_alloc(void)
{
- u64 *table;
+ unsigned long *table;
table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
if (table)
- memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
- return (unsigned long) table;
+ memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+ return table;
}
-static void base_pgt_free(unsigned long table)
+static void base_pgt_free(unsigned long *table)
{
- kmem_cache_free(base_pgt_cache, (void *) table);
+ kmem_cache_free(base_pgt_cache, table);
}
-static unsigned long base_crst_alloc(unsigned long val)
+static unsigned long *base_crst_alloc(unsigned long val)
{
- unsigned long table;
+ unsigned long *table;
- table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+ table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
if (table)
- crst_table_init((unsigned long *)table, val);
+ crst_table_init(table, val);
return table;
}
-static void base_crst_free(unsigned long table)
+static void base_crst_free(unsigned long *table)
{
- free_pages(table, CRST_ALLOC_ORDER);
+ free_pages((unsigned long)table, CRST_ALLOC_ORDER);
}
#define BASE_ADDR_END_FUNC(NAME, SIZE) \
@@ -377,14 +485,14 @@ static inline unsigned long base_lra(unsigned long address)
return real;
}
-static int base_page_walk(unsigned long origin, unsigned long addr,
+static int base_page_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
unsigned long *pte, next;
if (!alloc)
return 0;
- pte = (unsigned long *) origin;
+ pte = origin;
pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
do {
next = base_page_addr_end(addr, end);
@@ -393,13 +501,13 @@ static int base_page_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_segment_walk(unsigned long origin, unsigned long addr,
+static int base_segment_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *ste, next, table;
+ unsigned long *ste, next, *table;
int rc;
- ste = (unsigned long *) origin;
+ ste = origin;
ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
do {
next = base_segment_addr_end(addr, end);
@@ -409,9 +517,9 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
table = base_pgt_alloc();
if (!table)
return -ENOMEM;
- *ste = table | _SEGMENT_ENTRY;
+ *ste = __pa(table) | _SEGMENT_ENTRY;
}
- table = *ste & _SEGMENT_ENTRY_ORIGIN;
+ table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
rc = base_page_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -422,13 +530,13 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_region3_walk(unsigned long origin, unsigned long addr,
+static int base_region3_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rtte, next, table;
+ unsigned long *rtte, next, *table;
int rc;
- rtte = (unsigned long *) origin;
+ rtte = origin;
rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
do {
next = base_region3_addr_end(addr, end);
@@ -438,9 +546,9 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rtte = table | _REGION3_ENTRY;
+ *rtte = __pa(table) | _REGION3_ENTRY;
}
- table = *rtte & _REGION_ENTRY_ORIGIN;
+ table = __va(*rtte & _REGION_ENTRY_ORIGIN);
rc = base_segment_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -450,13 +558,13 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_region2_walk(unsigned long origin, unsigned long addr,
+static int base_region2_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rste, next, table;
+ unsigned long *rste, next, *table;
int rc;
- rste = (unsigned long *) origin;
+ rste = origin;
rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
do {
next = base_region2_addr_end(addr, end);
@@ -466,9 +574,9 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rste = table | _REGION2_ENTRY;
+ *rste = __pa(table) | _REGION2_ENTRY;
}
- table = *rste & _REGION_ENTRY_ORIGIN;
+ table = __va(*rste & _REGION_ENTRY_ORIGIN);
rc = base_region3_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -478,13 +586,13 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
return 0;
}
-static int base_region1_walk(unsigned long origin, unsigned long addr,
+static int base_region1_walk(unsigned long *origin, unsigned long addr,
unsigned long end, int alloc)
{
- unsigned long *rfte, next, table;
+ unsigned long *rfte, next, *table;
int rc;
- rfte = (unsigned long *) origin;
+ rfte = origin;
rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
do {
next = base_region1_addr_end(addr, end);
@@ -494,9 +602,9 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
if (!table)
return -ENOMEM;
- *rfte = table | _REGION1_ENTRY;
+ *rfte = __pa(table) | _REGION1_ENTRY;
}
- table = *rfte & _REGION_ENTRY_ORIGIN;
+ table = __va(*rfte & _REGION_ENTRY_ORIGIN);
rc = base_region2_walk(table, addr, next, alloc);
if (rc)
return rc;
@@ -515,7 +623,7 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
*/
void base_asce_free(unsigned long asce)
{
- unsigned long table = asce & _ASCE_ORIGIN;
+ unsigned long *table = __va(asce & _ASCE_ORIGIN);
if (!asce)
return;
@@ -567,7 +675,7 @@ static int base_pgt_cache_init(void)
*/
unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
{
- unsigned long asce, table, end;
+ unsigned long asce, *table, end;
int rc;
if (base_pgt_cache_init())
@@ -578,25 +686,25 @@ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
if (!table)
return 0;
rc = base_segment_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
} else if (end <= _REGION2_SIZE) {
table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region3_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
} else if (end <= _REGION1_SIZE) {
table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region2_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
} else {
table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
if (!table)
return 0;
rc = base_region1_walk(table, addr, end, 1);
- asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
+ asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
}
if (rc) {
base_asce_free(asce);
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index c16232cd0ec5..4909dcd762e8 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -115,7 +115,7 @@ static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
atomic_inc(&mm->context.flush_count);
if (cpumask_equal(&mm->context.cpu_attach_mask,
cpumask_of(smp_processor_id()))) {
- pte_val(*ptep) |= _PAGE_INVALID;
+ set_pte(ptep, set_pte_bit(*ptep, __pgprot(_PAGE_INVALID)));
mm->context.flush_mm = 1;
} else
ptep_ipte_global(mm, addr, ptep, nodat);
@@ -224,15 +224,15 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
* Without enhanced suppression-on-protection force
* the dirty bit on for all writable ptes.
*/
- pte_val(entry) |= _PAGE_DIRTY;
- pte_val(entry) &= ~_PAGE_PROTECT;
+ entry = set_pte_bit(entry, __pgprot(_PAGE_DIRTY));
+ entry = clear_pte_bit(entry, __pgprot(_PAGE_PROTECT));
}
if (!(pte_val(entry) & _PAGE_PROTECT))
/* This pte allows write access, set user-dirty */
pgste_val(pgste) |= PGSTE_UC_BIT;
}
#endif
- *ptep = entry;
+ set_pte(ptep, entry);
return pgste;
}
@@ -275,12 +275,12 @@ static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
pgste = pgste_update_all(old, pgste, mm);
if ((pgste_val(pgste) & _PGSTE_GPS_USAGE_MASK) ==
_PGSTE_GPS_USAGE_UNUSED)
- pte_val(old) |= _PAGE_UNUSED;
+ old = set_pte_bit(old, __pgprot(_PAGE_UNUSED));
}
pgste = pgste_set_pte(ptep, pgste, new);
pgste_set_unlock(ptep, pgste);
} else {
- *ptep = new;
+ set_pte(ptep, new);
}
return old;
}
@@ -345,14 +345,14 @@ void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr,
struct mm_struct *mm = vma->vm_mm;
if (!MACHINE_HAS_NX)
- pte_val(pte) &= ~_PAGE_NOEXEC;
+ pte = clear_pte_bit(pte, __pgprot(_PAGE_NOEXEC));
if (mm_has_pgste(mm)) {
pgste = pgste_get(ptep);
pgste_set_key(ptep, pgste, pte, mm);
pgste = pgste_set_pte(ptep, pgste, pte);
pgste_set_unlock(ptep, pgste);
} else {
- *ptep = pte;
+ set_pte(ptep, pte);
}
preempt_enable();
}
@@ -417,7 +417,7 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
atomic_inc(&mm->context.flush_count);
if (cpumask_equal(&mm->context.cpu_attach_mask,
cpumask_of(smp_processor_id()))) {
- pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
+ set_pmd(pmdp, set_pmd_bit(*pmdp, __pgprot(_SEGMENT_ENTRY_INVALID)));
mm->context.flush_mm = 1;
if (mm_has_pgste(mm))
gmap_pmdp_invalidate(mm, addr);
@@ -469,7 +469,7 @@ pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
preempt_disable();
old = pmdp_flush_direct(mm, addr, pmdp);
- *pmdp = new;
+ set_pmd(pmdp, new);
preempt_enable();
return old;
}
@@ -482,7 +482,7 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
preempt_disable();
old = pmdp_flush_lazy(mm, addr, pmdp);
- *pmdp = new;
+ set_pmd(pmdp, new);
preempt_enable();
return old;
}
@@ -539,7 +539,7 @@ pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
preempt_disable();
old = pudp_flush_direct(mm, addr, pudp);
- *pudp = new;
+ set_pud(pudp, new);
preempt_enable();
return old;
}
@@ -579,9 +579,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
list_del(lh);
}
ptep = (pte_t *) pgtable;
- pte_val(*ptep) = _PAGE_INVALID;
+ set_pte(ptep, __pte(_PAGE_INVALID));
ptep++;
- pte_val(*ptep) = _PAGE_INVALID;
+ set_pte(ptep, __pte(_PAGE_INVALID));
return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -646,12 +646,12 @@ int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
if (prot == PROT_NONE && !pte_i) {
ptep_flush_direct(mm, addr, ptep, nodat);
pgste = pgste_update_all(entry, pgste, mm);
- pte_val(entry) |= _PAGE_INVALID;
+ entry = set_pte_bit(entry, __pgprot(_PAGE_INVALID));
}
if (prot == PROT_READ && !pte_p) {
ptep_flush_direct(mm, addr, ptep, nodat);
- pte_val(entry) &= ~_PAGE_INVALID;
- pte_val(entry) |= _PAGE_PROTECT;
+ entry = clear_pte_bit(entry, __pgprot(_PAGE_INVALID));
+ entry = set_pte_bit(entry, __pgprot(_PAGE_PROTECT));
}
pgste_val(pgste) |= bit;
pgste = pgste_set_pte(ptep, pgste, entry);
@@ -675,8 +675,8 @@ int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
!(pte_val(pte) & _PAGE_PROTECT))) {
pgste_val(spgste) |= PGSTE_VSIE_BIT;
tpgste = pgste_get_lock(tptep);
- pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
- (pte_val(pte) & _PAGE_PROTECT);
+ tpte = __pte((pte_val(spte) & PAGE_MASK) |
+ (pte_val(pte) & _PAGE_PROTECT));
/* don't touch the storage key - it belongs to parent pgste */
tpgste = pgste_set_pte(tptep, tpgste, tpte);
pgste_set_unlock(tptep, tpgste);
@@ -748,7 +748,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
pgste_val(pgste) |= PGSTE_GR_BIT | PGSTE_GC_BIT;
ptev = pte_val(*ptep);
if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
- page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
+ page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 0);
pgste_set_unlock(ptep, pgste);
preempt_enable();
}
@@ -773,10 +773,10 @@ bool ptep_test_and_clear_uc(struct mm_struct *mm, unsigned long addr,
nodat = !!(pgste_val(pgste) & _PGSTE_GPS_NODAT);
ptep_ipte_global(mm, addr, ptep, nodat);
if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
- pte_val(pte) |= _PAGE_PROTECT;
+ pte = set_pte_bit(pte, __pgprot(_PAGE_PROTECT));
else
- pte_val(pte) |= _PAGE_INVALID;
- *ptep = pte;
+ pte = set_pte_bit(pte, __pgprot(_PAGE_INVALID));
+ set_pte(ptep, pte);
}
pgste_set_unlock(ptep, pgste);
return dirty;
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 7d9705eeb02f..ee1a97078527 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright IBM Corp. 2006
- * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
*/
#include <linux/memory_hotplug.h>
@@ -175,9 +174,9 @@ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
if (!new_page)
goto out;
- pte_val(*pte) = __pa(new_page) | prot;
+ set_pte(pte, __pte(__pa(new_page) | prot));
} else {
- pte_val(*pte) = __pa(addr) | prot;
+ set_pte(pte, __pte(__pa(addr) | prot));
}
} else {
continue;
@@ -241,9 +240,9 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
} else if (pmd_none(*pmd)) {
if (IS_ALIGNED(addr, PMD_SIZE) &&
IS_ALIGNED(next, PMD_SIZE) &&
- MACHINE_HAS_EDAT1 && addr && direct &&
+ MACHINE_HAS_EDAT1 && direct &&
!debug_pagealloc_enabled()) {
- pmd_val(*pmd) = __pa(addr) | prot;
+ set_pmd(pmd, __pmd(__pa(addr) | prot));
pages++;
continue;
} else if (!direct && MACHINE_HAS_EDAT1) {
@@ -258,7 +257,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
*/
new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
if (new_page) {
- pmd_val(*pmd) = __pa(new_page) | prot;
+ set_pmd(pmd, __pmd(__pa(new_page) | prot));
if (!IS_ALIGNED(addr, PMD_SIZE) ||
!IS_ALIGNED(next, PMD_SIZE)) {
vmemmap_use_new_sub_pmd(addr, next);
@@ -337,9 +336,9 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
} else if (pud_none(*pud)) {
if (IS_ALIGNED(addr, PUD_SIZE) &&
IS_ALIGNED(next, PUD_SIZE) &&
- MACHINE_HAS_EDAT2 && addr && direct &&
+ MACHINE_HAS_EDAT2 && direct &&
!debug_pagealloc_enabled()) {
- pud_val(*pud) = __pa(addr) | prot;
+ set_pud(pud, __pud(__pa(addr) | prot));
pages++;
continue;
}
@@ -562,6 +561,103 @@ int vmem_add_mapping(unsigned long start, unsigned long size)
}
/*
+ * Allocate new or return existing page-table entry, but do not map it
+ * to any physical address. If missing, allocate segment- and region-
+ * table entries along. Meeting a large segment- or region-table entry
+ * while traversing is an error, since the function is expected to be
+ * called against virtual regions reserverd for 4KB mappings only.
+ */
+pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc)
+{
+ pte_t *ptep = NULL;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(addr);
+ if (pgd_none(*pgd)) {
+ if (!alloc)
+ goto out;
+ p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
+ if (!p4d)
+ goto out;
+ pgd_populate(&init_mm, pgd, p4d);
+ }
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d)) {
+ if (!alloc)
+ goto out;
+ pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
+ if (!pud)
+ goto out;
+ p4d_populate(&init_mm, p4d, pud);
+ }
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud)) {
+ if (!alloc)
+ goto out;
+ pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+ if (!pmd)
+ goto out;
+ pud_populate(&init_mm, pud, pmd);
+ } else if (WARN_ON_ONCE(pud_large(*pud))) {
+ goto out;
+ }
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ if (!alloc)
+ goto out;
+ pte = vmem_pte_alloc();
+ if (!pte)
+ goto out;
+ pmd_populate(&init_mm, pmd, pte);
+ } else if (WARN_ON_ONCE(pmd_large(*pmd))) {
+ goto out;
+ }
+ ptep = pte_offset_kernel(pmd, addr);
+out:
+ return ptep;
+}
+
+int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc)
+{
+ pte_t *ptep, pte;
+
+ if (!IS_ALIGNED(addr, PAGE_SIZE))
+ return -EINVAL;
+ ptep = vmem_get_alloc_pte(addr, alloc);
+ if (!ptep)
+ return -ENOMEM;
+ __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
+ pte = mk_pte_phys(phys, prot);
+ set_pte(ptep, pte);
+ return 0;
+}
+
+int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot)
+{
+ int rc;
+
+ mutex_lock(&vmem_mutex);
+ rc = __vmem_map_4k_page(addr, phys, prot, true);
+ mutex_unlock(&vmem_mutex);
+ return rc;
+}
+
+void vmem_unmap_4k_page(unsigned long addr)
+{
+ pte_t *ptep;
+
+ mutex_lock(&vmem_mutex);
+ ptep = virt_to_kpte(addr);
+ __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
+ pte_clear(&init_mm, addr, ptep);
+ mutex_unlock(&vmem_mutex);
+}
+
+/*
* map whole physical memory to virtual memory (identity mapping)
* we reserve enough space in the vmalloc area for vmemmap to hotplug
* additional memory segments.
@@ -585,13 +681,12 @@ void __init vmem_map_init(void)
__set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT,
SET_MEMORY_RO | SET_MEMORY_X);
- if (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) {
- /*
- * Lowcore must be executable for LPSWE
- * and expoline trampoline branch instructions.
- */
+ /* lowcore requires 4k mapping for real addresses / prefixing */
+ set_memory_4k(0, LC_PAGES);
+
+ /* lowcore must be executable for LPSWE */
+ if (!static_key_enabled(&cpu_has_bear))
set_memory_x(0, 1);
- }
pr_info("Write protected kernel read-only data: %luk\n",
(unsigned long)(__end_rodata - _stext) >> 10);