aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/arch/riscv/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/riscv/mm')
-rw-r--r--arch/riscv/mm/Makefile7
-rw-r--r--arch/riscv/mm/cacheflush.c151
-rw-r--r--arch/riscv/mm/context.c46
-rw-r--r--arch/riscv/mm/dma-noncoherent.c3
-rw-r--r--arch/riscv/mm/fault.c74
-rw-r--r--arch/riscv/mm/hugetlbpage.c90
-rw-r--r--arch/riscv/mm/init.c637
-rw-r--r--arch/riscv/mm/kasan_init.c14
-rw-r--r--arch/riscv/mm/pageattr.c43
-rw-r--r--arch/riscv/mm/pgtable.c25
-rw-r--r--arch/riscv/mm/physaddr.c2
-rw-r--r--arch/riscv/mm/ptdump.c49
-rw-r--r--arch/riscv/mm/tlbflush.c133
13 files changed, 966 insertions, 308 deletions
diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile
index 2c869f8026a8..b916a68d324a 100644
--- a/arch/riscv/mm/Makefile
+++ b/arch/riscv/mm/Makefile
@@ -13,16 +13,13 @@ endif
KCOV_INSTRUMENT_init.o := n
obj-y += init.o
-obj-$(CONFIG_MMU) += extable.o fault.o pageattr.o pgtable.o
+obj-$(CONFIG_MMU) += extable.o fault.o pageattr.o pgtable.o tlbflush.o
obj-y += cacheflush.o
obj-y += context.o
obj-y += pmem.o
-ifeq ($(CONFIG_MMU),y)
-obj-$(CONFIG_SMP) += tlbflush.o
-endif
obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_PTDUMP_CORE) += ptdump.o
+obj-$(CONFIG_PTDUMP) += ptdump.o
obj-$(CONFIG_KASAN) += kasan_init.o
ifdef CONFIG_KASAN
diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c
index 55a34f2020a8..4ca5aafce22e 100644
--- a/arch/riscv/mm/cacheflush.c
+++ b/arch/riscv/mm/cacheflush.c
@@ -5,6 +5,7 @@
#include <linux/acpi.h>
#include <linux/of.h>
+#include <linux/prctl.h>
#include <asm/acpi.h>
#include <asm/cacheflush.h>
@@ -21,7 +22,22 @@ void flush_icache_all(void)
{
local_flush_icache_all();
- if (IS_ENABLED(CONFIG_RISCV_SBI) && !riscv_use_ipi_for_rfence())
+ if (num_online_cpus() < 2)
+ return;
+
+ /*
+ * Make sure all previous writes to the D$ are ordered before making
+ * the IPI. The RISC-V spec states that a hart must execute a data fence
+ * before triggering a remote fence.i in order to make the modification
+ * visable for remote harts.
+ *
+ * IPIs on RISC-V are triggered by MMIO writes to either CLINT or
+ * S-IMSIC, so the fence ensures previous data writes "happen before"
+ * the MMIO.
+ */
+ RISCV_FENCE(w, o);
+
+ if (riscv_use_sbi_for_rfence())
sbi_remote_fence_i(NULL);
else
on_each_cpu(ipi_remote_fence_i, NULL, 1);
@@ -69,8 +85,7 @@ void flush_icache_mm(struct mm_struct *mm, bool local)
* with flush_icache_deferred().
*/
smp_mb();
- } else if (IS_ENABLED(CONFIG_RISCV_SBI) &&
- !riscv_use_ipi_for_rfence()) {
+ } else if (riscv_use_sbi_for_rfence()) {
sbi_remote_fence_i(&others);
} else {
on_each_cpu_mask(&others, ipi_remote_fence_i, NULL, 1);
@@ -82,12 +97,12 @@ void flush_icache_mm(struct mm_struct *mm, bool local)
#endif /* CONFIG_SMP */
#ifdef CONFIG_MMU
-void flush_icache_pte(pte_t pte)
+void flush_icache_pte(struct mm_struct *mm, pte_t pte)
{
struct folio *folio = page_folio(pte_page(pte));
if (!test_bit(PG_dcache_clean, &folio->flags)) {
- flush_icache_all();
+ flush_icache_mm(mm, false);
set_bit(PG_dcache_clean, &folio->flags);
}
}
@@ -99,6 +114,9 @@ EXPORT_SYMBOL_GPL(riscv_cbom_block_size);
unsigned int riscv_cboz_block_size;
EXPORT_SYMBOL_GPL(riscv_cboz_block_size);
+unsigned int riscv_cbop_block_size;
+EXPORT_SYMBOL_GPL(riscv_cbop_block_size);
+
static void __init cbo_get_block_size(struct device_node *node,
const char *name, u32 *block_size,
unsigned long *first_hartid)
@@ -123,8 +141,8 @@ static void __init cbo_get_block_size(struct device_node *node,
void __init riscv_init_cbo_blocksizes(void)
{
- unsigned long cbom_hartid, cboz_hartid;
- u32 cbom_block_size = 0, cboz_block_size = 0;
+ unsigned long cbom_hartid, cboz_hartid, cbop_hartid;
+ u32 cbom_block_size = 0, cboz_block_size = 0, cbop_block_size = 0;
struct device_node *node;
struct acpi_table_header *rhct;
acpi_status status;
@@ -136,13 +154,15 @@ void __init riscv_init_cbo_blocksizes(void)
&cbom_block_size, &cbom_hartid);
cbo_get_block_size(node, "riscv,cboz-block-size",
&cboz_block_size, &cboz_hartid);
+ cbo_get_block_size(node, "riscv,cbop-block-size",
+ &cbop_block_size, &cbop_hartid);
}
} else {
status = acpi_get_table(ACPI_SIG_RHCT, 0, &rhct);
if (ACPI_FAILURE(status))
return;
- acpi_get_cbo_block_size(rhct, &cbom_block_size, &cboz_block_size, NULL);
+ acpi_get_cbo_block_size(rhct, &cbom_block_size, &cboz_block_size, &cbop_block_size);
acpi_put_table((struct acpi_table_header *)rhct);
}
@@ -151,4 +171,119 @@ void __init riscv_init_cbo_blocksizes(void)
if (cboz_block_size)
riscv_cboz_block_size = cboz_block_size;
+
+ if (cbop_block_size)
+ riscv_cbop_block_size = cbop_block_size;
+}
+
+#ifdef CONFIG_SMP
+static void set_icache_stale_mask(void)
+{
+ int cpu = get_cpu();
+ cpumask_t *mask;
+ bool stale_cpu;
+
+ /*
+ * Mark every other hart's icache as needing a flush for
+ * this MM. Maintain the previous value of the current
+ * cpu to handle the case when this function is called
+ * concurrently on different harts.
+ */
+ mask = &current->mm->context.icache_stale_mask;
+ stale_cpu = cpumask_test_cpu(cpu, mask);
+
+ cpumask_setall(mask);
+ __assign_cpu(cpu, mask, stale_cpu);
+ put_cpu();
+}
+#endif
+
+/**
+ * riscv_set_icache_flush_ctx() - Enable/disable icache flushing instructions in
+ * userspace.
+ * @ctx: Set the type of icache flushing instructions permitted/prohibited in
+ * userspace. Supported values described below.
+ *
+ * Supported values for ctx:
+ *
+ * * %PR_RISCV_CTX_SW_FENCEI_ON: Allow fence.i in user space.
+ *
+ * * %PR_RISCV_CTX_SW_FENCEI_OFF: Disallow fence.i in user space. All threads in
+ * a process will be affected when ``scope == PR_RISCV_SCOPE_PER_PROCESS``.
+ * Therefore, caution must be taken; use this flag only when you can guarantee
+ * that no thread in the process will emit fence.i from this point onward.
+ *
+ * @scope: Set scope of where icache flushing instructions are allowed to be
+ * emitted. Supported values described below.
+ *
+ * Supported values for scope:
+ *
+ * * %PR_RISCV_SCOPE_PER_PROCESS: Ensure the icache of any thread in this process
+ * is coherent with instruction storage upon
+ * migration.
+ *
+ * * %PR_RISCV_SCOPE_PER_THREAD: Ensure the icache of the current thread is
+ * coherent with instruction storage upon
+ * migration.
+ *
+ * When ``scope == PR_RISCV_SCOPE_PER_PROCESS``, all threads in the process are
+ * permitted to emit icache flushing instructions. Whenever any thread in the
+ * process is migrated, the corresponding hart's icache will be guaranteed to be
+ * consistent with instruction storage. This does not enforce any guarantees
+ * outside of migration. If a thread modifies an instruction that another thread
+ * may attempt to execute, the other thread must still emit an icache flushing
+ * instruction before attempting to execute the potentially modified
+ * instruction. This must be performed by the user-space program.
+ *
+ * In per-thread context (eg. ``scope == PR_RISCV_SCOPE_PER_THREAD``) only the
+ * thread calling this function is permitted to emit icache flushing
+ * instructions. When the thread is migrated, the corresponding hart's icache
+ * will be guaranteed to be consistent with instruction storage.
+ *
+ * On kernels configured without SMP, this function is a nop as migrations
+ * across harts will not occur.
+ */
+int riscv_set_icache_flush_ctx(unsigned long ctx, unsigned long scope)
+{
+#ifdef CONFIG_SMP
+ switch (ctx) {
+ case PR_RISCV_CTX_SW_FENCEI_ON:
+ switch (scope) {
+ case PR_RISCV_SCOPE_PER_PROCESS:
+ current->mm->context.force_icache_flush = true;
+ break;
+ case PR_RISCV_SCOPE_PER_THREAD:
+ current->thread.force_icache_flush = true;
+ break;
+ default:
+ return -EINVAL;
+ }
+ break;
+ case PR_RISCV_CTX_SW_FENCEI_OFF:
+ switch (scope) {
+ case PR_RISCV_SCOPE_PER_PROCESS:
+ set_icache_stale_mask();
+ current->mm->context.force_icache_flush = false;
+ break;
+ case PR_RISCV_SCOPE_PER_THREAD:
+ set_icache_stale_mask();
+ current->thread.force_icache_flush = false;
+ break;
+ default:
+ return -EINVAL;
+ }
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+#else
+ switch (ctx) {
+ case PR_RISCV_CTX_SW_FENCEI_ON:
+ case PR_RISCV_CTX_SW_FENCEI_OFF:
+ return 0;
+ default:
+ return -EINVAL;
+ }
+#endif
}
diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
index 217fd4de6134..55c20ad1f744 100644
--- a/arch/riscv/mm/context.c
+++ b/arch/riscv/mm/context.c
@@ -15,14 +15,13 @@
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
+#include <asm/switch_to.h>
#ifdef CONFIG_MMU
DEFINE_STATIC_KEY_FALSE(use_asid_allocator);
-static unsigned long asid_bits;
static unsigned long num_asids;
-unsigned long asid_mask;
static atomic_long_t current_version;
@@ -81,7 +80,7 @@ static void __flush_context(void)
if (cntx == 0)
cntx = per_cpu(reserved_context, i);
- __set_bit(cntx & asid_mask, context_asid_map);
+ __set_bit(cntx2asid(cntx), context_asid_map);
per_cpu(reserved_context, i) = cntx;
}
@@ -102,7 +101,7 @@ static unsigned long __new_context(struct mm_struct *mm)
lockdep_assert_held(&context_lock);
if (cntx != 0) {
- unsigned long newcntx = ver | (cntx & asid_mask);
+ unsigned long newcntx = ver | cntx2asid(cntx);
/*
* If our current CONTEXT was active during a rollover, we
@@ -115,7 +114,7 @@ static unsigned long __new_context(struct mm_struct *mm)
* We had a valid CONTEXT in a previous life, so try to
* re-use it if possible.
*/
- if (!__test_and_set_bit(cntx & asid_mask, context_asid_map))
+ if (!__test_and_set_bit(cntx2asid(cntx), context_asid_map))
return newcntx;
}
@@ -128,7 +127,7 @@ static unsigned long __new_context(struct mm_struct *mm)
goto set_asid;
/* We're out of ASIDs, so increment current_version */
- ver = atomic_long_add_return_relaxed(num_asids, &current_version);
+ ver = atomic_long_add_return_relaxed(BIT(SATP_ASID_BITS), &current_version);
/* Flush everything */
__flush_context();
@@ -159,7 +158,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
*
* - We get a zero back from the cmpxchg and end up waiting on the
* lock. Taking the lock synchronises with the rollover and so
- * we are forced to see the updated verion.
+ * we are forced to see the updated version.
*
* - We get a valid context back from the cmpxchg then we continue
* using old ASID because __flush_context() would have marked ASID
@@ -168,7 +167,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
*/
old_active_cntx = atomic_long_read(&per_cpu(active_context, cpu));
if (old_active_cntx &&
- ((cntx & ~asid_mask) == atomic_long_read(&current_version)) &&
+ (cntx2version(cntx) == atomic_long_read(&current_version)) &&
atomic_long_cmpxchg_relaxed(&per_cpu(active_context, cpu),
old_active_cntx, cntx))
goto switch_mm_fast;
@@ -177,7 +176,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
/* Check that our ASID belongs to the current_version. */
cntx = atomic_long_read(&mm->context.id);
- if ((cntx & ~asid_mask) != atomic_long_read(&current_version)) {
+ if (cntx2version(cntx) != atomic_long_read(&current_version)) {
cntx = __new_context(mm);
atomic_long_set(&mm->context.id, cntx);
}
@@ -191,7 +190,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
switch_mm_fast:
csr_write(CSR_SATP, virt_to_pfn(mm->pgd) |
- ((cntx & asid_mask) << SATP_ASID_SHIFT) |
+ (cntx2asid(cntx) << SATP_ASID_SHIFT) |
satp_mode);
if (need_flush_tlb)
@@ -202,7 +201,7 @@ static void set_mm_noasid(struct mm_struct *mm)
{
/* Switch the page table and blindly nuke entire local TLB */
csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode);
- local_flush_tlb_all();
+ local_flush_tlb_all_asid(0);
}
static inline void set_mm(struct mm_struct *prev,
@@ -227,7 +226,7 @@ static inline void set_mm(struct mm_struct *prev,
static int __init asids_init(void)
{
- unsigned long old;
+ unsigned long asid_bits, old;
/* Figure-out number of ASID bits in HW */
old = csr_read(CSR_SATP);
@@ -247,7 +246,6 @@ static int __init asids_init(void)
/* Pre-compute ASID details */
if (asid_bits) {
num_asids = 1 << asid_bits;
- asid_mask = num_asids - 1;
}
/*
@@ -255,7 +253,7 @@ static int __init asids_init(void)
* at-least twice more than CPUs
*/
if (num_asids > (2 * num_possible_cpus())) {
- atomic_long_set(&current_version, num_asids);
+ atomic_long_set(&current_version, BIT(SATP_ASID_BITS));
context_asid_map = bitmap_zalloc(num_asids, GFP_KERNEL);
if (!context_asid_map)
@@ -297,21 +295,23 @@ static inline void set_mm(struct mm_struct *prev,
*
* The "cpu" argument must be the current local CPU number.
*/
-static inline void flush_icache_deferred(struct mm_struct *mm, unsigned int cpu)
+static inline void flush_icache_deferred(struct mm_struct *mm, unsigned int cpu,
+ struct task_struct *task)
{
#ifdef CONFIG_SMP
- cpumask_t *mask = &mm->context.icache_stale_mask;
-
- if (cpumask_test_cpu(cpu, mask)) {
- cpumask_clear_cpu(cpu, mask);
+ if (cpumask_test_and_clear_cpu(cpu, &mm->context.icache_stale_mask)) {
/*
* Ensure the remote hart's writes are visible to this hart.
* This pairs with a barrier in flush_icache_mm.
*/
smp_mb();
- local_flush_icache_all();
- }
+ /*
+ * If cache will be flushed in switch_to, no need to flush here.
+ */
+ if (!(task && switch_to_should_flush_icache(task)))
+ local_flush_icache_all();
+ }
#endif
}
@@ -323,6 +323,8 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
if (unlikely(prev == next))
return;
+ membarrier_arch_switch_mm(prev, next, task);
+
/*
* Mark the current MM context as inactive, and the next as
* active. This is at least used by the icache flushing
@@ -332,5 +334,5 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
set_mm(prev, next, cpu);
- flush_icache_deferred(next, cpu);
+ flush_icache_deferred(next, cpu, task);
}
diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c
index 843107f834b2..cb89d7e0ba88 100644
--- a/arch/riscv/mm/dma-noncoherent.c
+++ b/arch/riscv/mm/dma-noncoherent.c
@@ -128,8 +128,7 @@ void arch_dma_prep_coherent(struct page *page, size_t size)
ALT_CMO_OP(FLUSH, flush_addr, size, riscv_cbom_block_size);
}
-void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
- bool coherent)
+void arch_setup_dma_ops(struct device *dev, bool coherent)
{
WARN_TAINT(!coherent && riscv_cbom_block_size > ARCH_DMA_MINALIGN,
TAINT_CPU_OUT_OF_SPEC,
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 3ba1d4dde5dd..0194324a0c50 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -22,6 +22,57 @@
#include "../kernel/head.h"
+static void show_pte(unsigned long addr)
+{
+ pgd_t *pgdp, pgd;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+ struct mm_struct *mm = current->mm;
+
+ if (!mm)
+ mm = &init_mm;
+
+ pr_alert("Current %s pgtable: %luK pagesize, %d-bit VAs, pgdp=0x%016llx\n",
+ current->comm, PAGE_SIZE / SZ_1K, VA_BITS,
+ mm == &init_mm ? (u64)__pa_symbol(mm->pgd) : virt_to_phys(mm->pgd));
+
+ pgdp = pgd_offset(mm, addr);
+ pgd = pgdp_get(pgdp);
+ pr_alert("[%016lx] pgd=%016lx", addr, pgd_val(pgd));
+ if (pgd_none(pgd) || pgd_bad(pgd) || pgd_leaf(pgd))
+ goto out;
+
+ p4dp = p4d_offset(pgdp, addr);
+ p4d = p4dp_get(p4dp);
+ pr_cont(", p4d=%016lx", p4d_val(p4d));
+ if (p4d_none(p4d) || p4d_bad(p4d) || p4d_leaf(p4d))
+ goto out;
+
+ pudp = pud_offset(p4dp, addr);
+ pud = pudp_get(pudp);
+ pr_cont(", pud=%016lx", pud_val(pud));
+ if (pud_none(pud) || pud_bad(pud) || pud_leaf(pud))
+ goto out;
+
+ pmdp = pmd_offset(pudp, addr);
+ pmd = pmdp_get(pmdp);
+ pr_cont(", pmd=%016lx", pmd_val(pmd));
+ if (pmd_none(pmd) || pmd_bad(pmd) || pmd_leaf(pmd))
+ goto out;
+
+ ptep = pte_offset_map(pmdp, addr);
+ if (!ptep)
+ goto out;
+
+ pte = ptep_get(ptep);
+ pr_cont(", pte=%016lx", pte_val(pte));
+ pte_unmap(ptep);
+out:
+ pr_cont("\n");
+}
+
static void die_kernel_fault(const char *msg, unsigned long addr,
struct pt_regs *regs)
{
@@ -31,6 +82,7 @@ static void die_kernel_fault(const char *msg, unsigned long addr,
addr);
bust_spinlocks(0);
+ show_pte(addr);
die(regs, "Oops");
make_task_dead(SIGKILL);
}
@@ -61,26 +113,27 @@ static inline void no_context(struct pt_regs *regs, unsigned long addr)
static inline void mm_fault_error(struct pt_regs *regs, unsigned long addr, vm_fault_t fault)
{
+ if (!user_mode(regs)) {
+ no_context(regs, addr);
+ return;
+ }
+
if (fault & VM_FAULT_OOM) {
/*
* We ran out of memory, call the OOM killer, and return the userspace
* (which will retry the fault, or kill us if we got oom-killed).
*/
- if (!user_mode(regs)) {
- no_context(regs, addr);
- return;
- }
pagefault_out_of_memory();
return;
} else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) {
/* Kernel mode? Handle exceptions or die */
- if (!user_mode(regs)) {
- no_context(regs, addr);
- return;
- }
do_trap(regs, SIGBUS, BUS_ADRERR, addr);
return;
+ } else if (fault & VM_FAULT_SIGSEGV) {
+ do_trap(regs, SIGSEGV, SEGV_MAPERR, addr);
+ return;
}
+
BUG();
}
@@ -292,7 +345,10 @@ void handle_page_fault(struct pt_regs *regs)
if (unlikely(access_error(cause, vma))) {
vma_end_read(vma);
- goto lock_mmap;
+ count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
+ tsk->thread.bad_cause = cause;
+ bad_area_nosemaphore(regs, SEGV_ACCERR, addr);
+ return;
}
fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs);
diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c
index 5ef2a6891158..375dd96bb4a0 100644
--- a/arch/riscv/mm/hugetlbpage.c
+++ b/arch/riscv/mm/hugetlbpage.c
@@ -3,7 +3,7 @@
#include <linux/err.h>
#ifdef CONFIG_RISCV_ISA_SVNAPOT
-pte_t huge_ptep_get(pte_t *ptep)
+pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
unsigned long pte_num;
int i;
@@ -148,22 +148,25 @@ unsigned long hugetlb_mask_last_page(struct hstate *h)
static pte_t get_clear_contig(struct mm_struct *mm,
unsigned long addr,
pte_t *ptep,
- unsigned long pte_num)
+ unsigned long ncontig)
{
- pte_t orig_pte = ptep_get(ptep);
- unsigned long i;
-
- for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++) {
- pte_t pte = ptep_get_and_clear(mm, addr, ptep);
-
- if (pte_dirty(pte))
- orig_pte = pte_mkdirty(orig_pte);
-
- if (pte_young(pte))
- orig_pte = pte_mkyoung(orig_pte);
+ pte_t pte, tmp_pte;
+ bool present;
+
+ pte = ptep_get_and_clear(mm, addr, ptep);
+ present = pte_present(pte);
+ while (--ncontig) {
+ ptep++;
+ addr += PAGE_SIZE;
+ tmp_pte = ptep_get_and_clear(mm, addr, ptep);
+ if (present) {
+ if (pte_dirty(tmp_pte))
+ pte = pte_mkdirty(pte);
+ if (pte_young(tmp_pte))
+ pte = pte_mkyoung(pte);
+ }
}
-
- return orig_pte;
+ return pte;
}
static pte_t get_clear_contig_flush(struct mm_struct *mm,
@@ -212,6 +215,26 @@ static void clear_flush(struct mm_struct *mm,
flush_tlb_range(&vma, saddr, addr);
}
+static int num_contig_ptes_from_size(unsigned long sz, size_t *pgsize)
+{
+ unsigned long hugepage_shift;
+
+ if (sz >= PGDIR_SIZE)
+ hugepage_shift = PGDIR_SHIFT;
+ else if (sz >= P4D_SIZE)
+ hugepage_shift = P4D_SHIFT;
+ else if (sz >= PUD_SIZE)
+ hugepage_shift = PUD_SHIFT;
+ else if (sz >= PMD_SIZE)
+ hugepage_shift = PMD_SHIFT;
+ else
+ hugepage_shift = PAGE_SHIFT;
+
+ *pgsize = 1 << hugepage_shift;
+
+ return sz >> hugepage_shift;
+}
+
/*
* When dealing with NAPOT mappings, the privileged specification indicates that
* "if an update needs to be made, the OS generally should first mark all of the
@@ -226,22 +249,10 @@ void set_huge_pte_at(struct mm_struct *mm,
pte_t pte,
unsigned long sz)
{
- unsigned long hugepage_shift, pgsize;
+ size_t pgsize;
int i, pte_num;
- if (sz >= PGDIR_SIZE)
- hugepage_shift = PGDIR_SHIFT;
- else if (sz >= P4D_SIZE)
- hugepage_shift = P4D_SHIFT;
- else if (sz >= PUD_SIZE)
- hugepage_shift = PUD_SHIFT;
- else if (sz >= PMD_SIZE)
- hugepage_shift = PMD_SHIFT;
- else
- hugepage_shift = PAGE_SHIFT;
-
- pte_num = sz >> hugepage_shift;
- pgsize = 1 << hugepage_shift;
+ pte_num = num_contig_ptes_from_size(sz, &pgsize);
if (!pte_present(pte)) {
for (i = 0; i < pte_num; i++, ptep++, addr += pgsize)
@@ -293,15 +304,16 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma,
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr,
- pte_t *ptep)
+ pte_t *ptep, unsigned long sz)
{
+ size_t pgsize;
pte_t orig_pte = ptep_get(ptep);
int pte_num;
if (!pte_napot(orig_pte))
return ptep_get_and_clear(mm, addr, ptep);
- pte_num = napot_pte_num(napot_cont_order(orig_pte));
+ pte_num = num_contig_ptes_from_size(sz, &pgsize);
return get_clear_contig(mm, addr, ptep, pte_num);
}
@@ -351,6 +363,7 @@ void huge_pte_clear(struct mm_struct *mm,
pte_t *ptep,
unsigned long sz)
{
+ size_t pgsize;
pte_t pte = ptep_get(ptep);
int i, pte_num;
@@ -359,8 +372,9 @@ void huge_pte_clear(struct mm_struct *mm,
return;
}
- pte_num = napot_pte_num(napot_cont_order(pte));
- for (i = 0; i < pte_num; i++, addr += PAGE_SIZE, ptep++)
+ pte_num = num_contig_ptes_from_size(sz, &pgsize);
+
+ for (i = 0; i < pte_num; i++, addr += pgsize, ptep++)
pte_clear(mm, addr, ptep);
}
@@ -399,16 +413,6 @@ static bool is_napot_size(unsigned long size)
#endif /*CONFIG_RISCV_ISA_SVNAPOT*/
-int pud_huge(pud_t pud)
-{
- return pud_leaf(pud);
-}
-
-int pmd_huge(pmd_t pmd)
-{
- return pmd_leaf(pmd);
-}
-
static bool __hugetlb_valid_size(unsigned long size)
{
if (size == HPAGE_SIZE)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index b5ffb2ef54ad..8d0374d7ce8e 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -20,21 +20,24 @@
#include <linux/dma-map-ops.h>
#include <linux/crash_dump.h>
#include <linux/hugetlb.h>
-#ifdef CONFIG_RELOCATABLE
-#include <linux/elf.h>
-#endif
#include <linux/kfence.h>
+#include <linux/execmem.h>
#include <asm/fixmap.h>
#include <asm/io.h>
+#include <asm/kasan.h>
+#include <asm/module.h>
#include <asm/numa.h>
#include <asm/pgtable.h>
#include <asm/sections.h>
#include <asm/soc.h>
+#include <asm/sparsemem.h>
#include <asm/tlbflush.h>
#include "../kernel/head.h"
+u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
+
struct kernel_mapping kernel_map __ro_after_init;
EXPORT_SYMBOL(kernel_map);
#ifdef CONFIG_XIP_KERNEL
@@ -49,8 +52,8 @@ u64 satp_mode __ro_after_init = SATP_MODE_32;
EXPORT_SYMBOL(satp_mode);
#ifdef CONFIG_64BIT
-bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL);
-bool pgtable_l5_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL);
+bool pgtable_l4_enabled __ro_after_init = !IS_ENABLED(CONFIG_XIP_KERNEL);
+bool pgtable_l5_enabled __ro_after_init = !IS_ENABLED(CONFIG_XIP_KERNEL);
EXPORT_SYMBOL(pgtable_l4_enabled);
EXPORT_SYMBOL(pgtable_l5_enabled);
#endif
@@ -58,6 +61,13 @@ EXPORT_SYMBOL(pgtable_l5_enabled);
phys_addr_t phys_ram_base __ro_after_init;
EXPORT_SYMBOL(phys_ram_base);
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+#define VMEMMAP_ADDR_ALIGN (1ULL << SECTION_SIZE_BITS)
+
+unsigned long vmemmap_start_pfn __ro_after_init;
+EXPORT_SYMBOL(vmemmap_start_pfn);
+#endif
+
unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
__page_aligned_bss;
EXPORT_SYMBOL(empty_zero_page);
@@ -159,14 +169,27 @@ static void __init print_vm_layout(void)
static void print_vm_layout(void) { }
#endif /* CONFIG_DEBUG_VM */
-void __init mem_init(void)
+void __init arch_mm_preinit(void)
{
+ bool swiotlb = max_pfn > PFN_DOWN(dma32_phys_limit);
#ifdef CONFIG_FLATMEM
BUG_ON(!mem_map);
#endif /* CONFIG_FLATMEM */
- swiotlb_init(max_pfn > PFN_DOWN(dma32_phys_limit), SWIOTLB_VERBOSE);
- memblock_free_all();
+ if (IS_ENABLED(CONFIG_DMA_BOUNCE_UNALIGNED_KMALLOC) && !swiotlb &&
+ dma_cache_alignment != 1) {
+ /*
+ * If no bouncing needed for ZONE_DMA, allocate 1MB swiotlb
+ * buffer per 1GB of RAM for kmalloc() bouncing on
+ * non-coherent platforms.
+ */
+ unsigned long size =
+ DIV_ROUND_UP(memblock_phys_mem_size(), 1024);
+ swiotlb_adjust_size(min(swiotlb_size_or_default(), size));
+ swiotlb = true;
+ }
+
+ swiotlb_init(swiotlb, SWIOTLB_VERBOSE);
print_vm_layout();
}
@@ -218,43 +241,59 @@ static void __init setup_bootmem(void)
*/
memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
- phys_ram_end = memblock_end_of_DRAM();
-
/*
* Make sure we align the start of the memory on a PMD boundary so that
* at worst, we map the linear mapping with PMD mappings.
*/
- if (!IS_ENABLED(CONFIG_XIP_KERNEL))
+ if (!IS_ENABLED(CONFIG_XIP_KERNEL)) {
phys_ram_base = memblock_start_of_DRAM() & PMD_MASK;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ vmemmap_start_pfn = round_down(phys_ram_base, VMEMMAP_ADDR_ALIGN) >> PAGE_SHIFT;
+#endif
+ }
/*
* In 64-bit, any use of __va/__pa before this point is wrong as we
* did not know the start of DRAM before.
*/
- if (IS_ENABLED(CONFIG_64BIT))
+ if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU))
kernel_map.va_pa_offset = PAGE_OFFSET - phys_ram_base;
/*
- * memblock allocator is not aware of the fact that last 4K bytes of
- * the addressable memory can not be mapped because of IS_ERR_VALUE
- * macro. Make sure that last 4k bytes are not usable by memblock
- * if end of dram is equal to maximum addressable memory. For 64-bit
- * kernel, this problem can't happen here as the end of the virtual
- * address space is occupied by the kernel mapping then this check must
- * be done as soon as the kernel mapping base address is determined.
+ * The size of the linear page mapping may restrict the amount of
+ * usable RAM.
+ */
+ if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_MMU)) {
+ max_mapped_addr = __pa(PAGE_OFFSET) + KERN_VIRT_SIZE;
+ if (memblock_end_of_DRAM() > max_mapped_addr) {
+ memblock_cap_memory_range(phys_ram_base,
+ max_mapped_addr - phys_ram_base);
+ pr_warn("Physical memory overflows the linear mapping size: region above %pa removed",
+ &max_mapped_addr);
+ }
+ }
+
+ /*
+ * Reserve physical address space that would be mapped to virtual
+ * addresses greater than (void *)(-PAGE_SIZE) because:
+ * - This memory would overlap with ERR_PTR
+ * - This memory belongs to high memory, which is not supported
+ *
+ * This is not applicable to 64-bit kernel, because virtual addresses
+ * after (void *)(-PAGE_SIZE) are not linearly mapped: they are
+ * occupied by kernel mapping. Also it is unrealistic for high memory
+ * to exist on 64-bit platforms.
*/
if (!IS_ENABLED(CONFIG_64BIT)) {
- max_mapped_addr = __pa(~(ulong)0);
- if (max_mapped_addr == (phys_ram_end - 1))
- memblock_set_current_limit(max_mapped_addr - 4096);
+ max_mapped_addr = __va_to_pa_nodebug(-PAGE_SIZE);
+ memblock_reserve(max_mapped_addr, (phys_addr_t)-max_mapped_addr);
}
+ phys_ram_end = memblock_end_of_DRAM();
min_low_pfn = PFN_UP(phys_ram_base);
max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end);
- high_memory = (void *)(__va(PFN_PHYS(max_low_pfn)));
dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn));
- set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET);
reserve_initrd_mem();
@@ -279,8 +318,46 @@ static void __init setup_bootmem(void)
hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
}
+#ifdef CONFIG_RELOCATABLE
+extern unsigned long __rela_dyn_start, __rela_dyn_end;
+
+static void __init relocate_kernel(void)
+{
+ Elf_Rela *rela = (Elf_Rela *)&__rela_dyn_start;
+ /*
+ * This holds the offset between the linked virtual address and the
+ * relocated virtual address.
+ */
+ uintptr_t reloc_offset = kernel_map.virt_addr - KERNEL_LINK_ADDR;
+ /*
+ * This holds the offset between kernel linked virtual address and
+ * physical address.
+ */
+ uintptr_t va_kernel_link_pa_offset = KERNEL_LINK_ADDR - kernel_map.phys_addr;
+
+ for ( ; rela < (Elf_Rela *)&__rela_dyn_end; rela++) {
+ Elf_Addr addr = (rela->r_offset - va_kernel_link_pa_offset);
+ Elf_Addr relocated_addr = rela->r_addend;
+
+ if (rela->r_info != R_RISCV_RELATIVE)
+ continue;
+
+ /*
+ * Make sure to not relocate vdso symbols like rt_sigreturn
+ * which are linked from the address 0 in vmlinux since
+ * vdso symbol addresses are actually used as an offset from
+ * mm->context.vdso in VDSO_OFFSET macro.
+ */
+ if (relocated_addr >= KERNEL_LINK_ADDR)
+ relocated_addr += reloc_offset;
+
+ *(Elf_Addr *)addr = relocated_addr;
+ }
+}
+#endif /* CONFIG_RELOCATABLE */
+
#ifdef CONFIG_MMU
-struct pt_alloc_ops pt_ops __initdata;
+struct pt_alloc_ops pt_ops __meminitdata;
pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
@@ -342,7 +419,7 @@ static inline pte_t *__init get_pte_virt_fixmap(phys_addr_t pa)
return (pte_t *)set_fixmap_offset(FIX_PTE, pa);
}
-static inline pte_t *__init get_pte_virt_late(phys_addr_t pa)
+static inline pte_t *__meminit get_pte_virt_late(phys_addr_t pa)
{
return (pte_t *) __va(pa);
}
@@ -361,17 +438,21 @@ static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t va)
return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
}
-static phys_addr_t __init alloc_pte_late(uintptr_t va)
+static phys_addr_t __meminit alloc_pte_late(uintptr_t va)
{
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
- BUG_ON(!ptdesc || !pagetable_pte_ctor(ptdesc));
+ /*
+ * We do not know which mm the PTE page is associated to at this point.
+ * Passing NULL to the ctor is the safe option, though it may result
+ * in unnecessary work (e.g. initialising the ptlock for init_mm).
+ */
+ BUG_ON(!ptdesc || !pagetable_pte_ctor(NULL, ptdesc));
return __pa((pte_t *)ptdesc_address(ptdesc));
}
-static void __init create_pte_mapping(pte_t *ptep,
- uintptr_t va, phys_addr_t pa,
- phys_addr_t sz, pgprot_t prot)
+static void __meminit create_pte_mapping(pte_t *ptep, uintptr_t va, phys_addr_t pa, phys_addr_t sz,
+ pgprot_t prot)
{
uintptr_t pte_idx = pte_index(va);
@@ -425,7 +506,7 @@ static pmd_t *__init get_pmd_virt_fixmap(phys_addr_t pa)
return (pmd_t *)set_fixmap_offset(FIX_PMD, pa);
}
-static pmd_t *__init get_pmd_virt_late(phys_addr_t pa)
+static pmd_t *__meminit get_pmd_virt_late(phys_addr_t pa)
{
return (pmd_t *) __va(pa);
}
@@ -442,17 +523,18 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va)
return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
}
-static phys_addr_t __init alloc_pmd_late(uintptr_t va)
+static phys_addr_t __meminit alloc_pmd_late(uintptr_t va)
{
struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0);
- BUG_ON(!ptdesc || !pagetable_pmd_ctor(ptdesc));
+ /* See comment in alloc_pte_late() regarding NULL passed the ctor */
+ BUG_ON(!ptdesc || !pagetable_pmd_ctor(NULL, ptdesc));
return __pa((pmd_t *)ptdesc_address(ptdesc));
}
-static void __init create_pmd_mapping(pmd_t *pmdp,
- uintptr_t va, phys_addr_t pa,
- phys_addr_t sz, pgprot_t prot)
+static void __meminit create_pmd_mapping(pmd_t *pmdp,
+ uintptr_t va, phys_addr_t pa,
+ phys_addr_t sz, pgprot_t prot)
{
pte_t *ptep;
phys_addr_t pte_phys;
@@ -488,7 +570,7 @@ static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa)
return (pud_t *)set_fixmap_offset(FIX_PUD, pa);
}
-static pud_t *__init get_pud_virt_late(phys_addr_t pa)
+static pud_t *__meminit get_pud_virt_late(phys_addr_t pa)
{
return (pud_t *)__va(pa);
}
@@ -506,13 +588,13 @@ static phys_addr_t __init alloc_pud_fixmap(uintptr_t va)
return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
}
-static phys_addr_t alloc_pud_late(uintptr_t va)
+static phys_addr_t __meminit alloc_pud_late(uintptr_t va)
{
- unsigned long vaddr;
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0);
- vaddr = __get_free_page(GFP_KERNEL);
- BUG_ON(!vaddr);
- return __pa(vaddr);
+ BUG_ON(!ptdesc);
+ pagetable_pud_ctor(ptdesc);
+ return __pa((pud_t *)ptdesc_address(ptdesc));
}
static p4d_t *__init get_p4d_virt_early(phys_addr_t pa)
@@ -526,7 +608,7 @@ static p4d_t *__init get_p4d_virt_fixmap(phys_addr_t pa)
return (p4d_t *)set_fixmap_offset(FIX_P4D, pa);
}
-static p4d_t *__init get_p4d_virt_late(phys_addr_t pa)
+static p4d_t *__meminit get_p4d_virt_late(phys_addr_t pa)
{
return (p4d_t *)__va(pa);
}
@@ -544,18 +626,17 @@ static phys_addr_t __init alloc_p4d_fixmap(uintptr_t va)
return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
}
-static phys_addr_t alloc_p4d_late(uintptr_t va)
+static phys_addr_t __meminit alloc_p4d_late(uintptr_t va)
{
- unsigned long vaddr;
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0);
- vaddr = __get_free_page(GFP_KERNEL);
- BUG_ON(!vaddr);
- return __pa(vaddr);
+ BUG_ON(!ptdesc);
+ pagetable_p4d_ctor(ptdesc);
+ return __pa((p4d_t *)ptdesc_address(ptdesc));
}
-static void __init create_pud_mapping(pud_t *pudp,
- uintptr_t va, phys_addr_t pa,
- phys_addr_t sz, pgprot_t prot)
+static void __meminit create_pud_mapping(pud_t *pudp, uintptr_t va, phys_addr_t pa, phys_addr_t sz,
+ pgprot_t prot)
{
pmd_t *nextp;
phys_addr_t next_phys;
@@ -580,9 +661,8 @@ static void __init create_pud_mapping(pud_t *pudp,
create_pmd_mapping(nextp, va, pa, sz, prot);
}
-static void __init create_p4d_mapping(p4d_t *p4dp,
- uintptr_t va, phys_addr_t pa,
- phys_addr_t sz, pgprot_t prot)
+static void __meminit create_p4d_mapping(p4d_t *p4dp, uintptr_t va, phys_addr_t pa, phys_addr_t sz,
+ pgprot_t prot)
{
pud_t *nextp;
phys_addr_t next_phys;
@@ -638,9 +718,8 @@ static void __init create_p4d_mapping(p4d_t *p4dp,
#define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot) do {} while(0)
#endif /* __PAGETABLE_PMD_FOLDED */
-void __init create_pgd_mapping(pgd_t *pgdp,
- uintptr_t va, phys_addr_t pa,
- phys_addr_t sz, pgprot_t prot)
+void __meminit create_pgd_mapping(pgd_t *pgdp, uintptr_t va, phys_addr_t pa, phys_addr_t sz,
+ pgprot_t prot)
{
pgd_next_t *nextp;
phys_addr_t next_phys;
@@ -665,9 +744,11 @@ void __init create_pgd_mapping(pgd_t *pgdp,
create_pgd_next_mapping(nextp, va, pa, sz, prot);
}
-static uintptr_t __init best_map_size(phys_addr_t pa, uintptr_t va,
- phys_addr_t size)
+static uintptr_t __meminit best_map_size(phys_addr_t pa, uintptr_t va, phys_addr_t size)
{
+ if (debug_pagealloc_enabled())
+ return PAGE_SIZE;
+
if (pgtable_l5_enabled &&
!(pa & (P4D_SIZE - 1)) && !(va & (P4D_SIZE - 1)) && size >= P4D_SIZE)
return P4D_SIZE;
@@ -699,7 +780,7 @@ asmlinkage void __init __copy_data(void)
#endif
#ifdef CONFIG_STRICT_KERNEL_RWX
-static __init pgprot_t pgprot_from_va(uintptr_t va)
+static __meminit pgprot_t pgprot_from_va(uintptr_t va)
{
if (is_va_kernel_text(va))
return PAGE_KERNEL_READ_EXEC;
@@ -724,7 +805,7 @@ void mark_rodata_ro(void)
set_memory_ro);
}
#else
-static __init pgprot_t pgprot_from_va(uintptr_t va)
+static __meminit pgprot_t pgprot_from_va(uintptr_t va)
{
if (IS_ENABLED(CONFIG_64BIT) && !is_kernel_mapping(va))
return PAGE_KERNEL;
@@ -764,6 +845,11 @@ static int __init print_no5lvl(char *p)
}
early_param("no5lvl", print_no5lvl);
+static void __init set_mmap_rnd_bits_max(void)
+{
+ mmap_rnd_bits_max = MMAP_VA_BITS - PAGE_SHIFT - 3;
+}
+
/*
* There is a simple way to determine if 4-level is supported by the
* underlying hardware: establish 1:1 mapping in 4-level page table mode
@@ -776,6 +862,8 @@ static __init void set_satp_mode(uintptr_t dtb_pa)
uintptr_t set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK;
u64 satp_mode_cmdline = __pi_set_satp_mode_from_cmdline(dtb_pa);
+ kernel_map.page_offset = PAGE_OFFSET_L5;
+
if (satp_mode_cmdline == SATP_MODE_57) {
disable_pgtable_l5();
} else if (satp_mode_cmdline == SATP_MODE_48) {
@@ -846,49 +934,11 @@ retry:
#error "setup_vm() is called from head.S before relocate so it should not use absolute addressing."
#endif
-#ifdef CONFIG_RELOCATABLE
-extern unsigned long __rela_dyn_start, __rela_dyn_end;
-
-static void __init relocate_kernel(void)
-{
- Elf64_Rela *rela = (Elf64_Rela *)&__rela_dyn_start;
- /*
- * This holds the offset between the linked virtual address and the
- * relocated virtual address.
- */
- uintptr_t reloc_offset = kernel_map.virt_addr - KERNEL_LINK_ADDR;
- /*
- * This holds the offset between kernel linked virtual address and
- * physical address.
- */
- uintptr_t va_kernel_link_pa_offset = KERNEL_LINK_ADDR - kernel_map.phys_addr;
-
- for ( ; rela < (Elf64_Rela *)&__rela_dyn_end; rela++) {
- Elf64_Addr addr = (rela->r_offset - va_kernel_link_pa_offset);
- Elf64_Addr relocated_addr = rela->r_addend;
-
- if (rela->r_info != R_RISCV_RELATIVE)
- continue;
-
- /*
- * Make sure to not relocate vdso symbols like rt_sigreturn
- * which are linked from the address 0 in vmlinux since
- * vdso symbol addresses are actually used as an offset from
- * mm->context.vdso in VDSO_OFFSET macro.
- */
- if (relocated_addr >= KERNEL_LINK_ADDR)
- relocated_addr += reloc_offset;
-
- *(Elf64_Addr *)addr = relocated_addr;
- }
-}
-#endif /* CONFIG_RELOCATABLE */
-
#ifdef CONFIG_XIP_KERNEL
static void __init create_kernel_page_table(pgd_t *pgdir,
__always_unused bool early)
{
- uintptr_t va, end_va;
+ uintptr_t va, start_va, end_va;
/* Map the flash resident part */
end_va = kernel_map.virt_addr + kernel_map.xiprom_sz;
@@ -898,10 +948,11 @@ static void __init create_kernel_page_table(pgd_t *pgdir,
PMD_SIZE, PAGE_KERNEL_EXEC);
/* Map the data in RAM */
- end_va = kernel_map.virt_addr + XIP_OFFSET + kernel_map.size;
- for (va = kernel_map.virt_addr + XIP_OFFSET; va < end_va; va += PMD_SIZE)
+ start_va = kernel_map.virt_addr + (uintptr_t)&_sdata - (uintptr_t)&_start;
+ end_va = kernel_map.virt_addr + kernel_map.size;
+ for (va = start_va; va < end_va; va += PMD_SIZE)
create_pgd_mapping(pgdir, va,
- kernel_map.phys_addr + (va - (kernel_map.virt_addr + XIP_OFFSET)),
+ kernel_map.phys_addr + (va - start_va),
PMD_SIZE, PAGE_KERNEL);
}
#else
@@ -1019,6 +1070,7 @@ static void __init pt_ops_set_late(void)
#ifdef CONFIG_RANDOMIZE_BASE
extern bool __init __pi_set_nokaslr_from_cmdline(uintptr_t dtb_pa);
extern u64 __init __pi_get_kaslr_seed(uintptr_t dtb_pa);
+extern u64 __init __pi_get_kaslr_seed_zkr(const uintptr_t dtb_pa);
static int __init print_nokaslr(char *p)
{
@@ -1039,10 +1091,12 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
#ifdef CONFIG_RANDOMIZE_BASE
if (!__pi_set_nokaslr_from_cmdline(dtb_pa)) {
- u64 kaslr_seed = __pi_get_kaslr_seed(dtb_pa);
+ u64 kaslr_seed = __pi_get_kaslr_seed_zkr(dtb_pa);
u32 kernel_size = (uintptr_t)(&_end) - (uintptr_t)(&_start);
u32 nr_pos;
+ if (kaslr_seed == 0)
+ kaslr_seed = __pi_get_kaslr_seed(dtb_pa);
/*
* Compute the number of positions available: we are limited
* by the early page table that only has one PUD and we must
@@ -1057,27 +1111,28 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
kernel_map.virt_addr = KERNEL_LINK_ADDR + kernel_map.virt_offset;
#ifdef CONFIG_XIP_KERNEL
-#ifdef CONFIG_64BIT
- kernel_map.page_offset = PAGE_OFFSET_L3;
-#else
- kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL);
-#endif
kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR;
kernel_map.xiprom_sz = (uintptr_t)(&_exiprom) - (uintptr_t)(&_xiprom);
phys_ram_base = CONFIG_PHYS_RAM_BASE;
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+ vmemmap_start_pfn = round_down(phys_ram_base, VMEMMAP_ADDR_ALIGN) >> PAGE_SHIFT;
+#endif
kernel_map.phys_addr = (uintptr_t)CONFIG_PHYS_RAM_BASE;
- kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_sdata);
+ kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_start);
- kernel_map.va_kernel_xip_pa_offset = kernel_map.virt_addr - kernel_map.xiprom;
+ kernel_map.va_kernel_xip_text_pa_offset = kernel_map.virt_addr - kernel_map.xiprom;
+ kernel_map.va_kernel_xip_data_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr
+ + (uintptr_t)&_sdata - (uintptr_t)&_start;
#else
- kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL);
kernel_map.phys_addr = (uintptr_t)(&_start);
kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr;
+ kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr;
#endif
#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL)
set_satp_mode(dtb_pa);
+ set_mmap_rnd_bits_max();
#endif
/*
@@ -1094,15 +1149,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
*/
kernel_map.va_pa_offset = IS_ENABLED(CONFIG_64BIT) ?
0UL : PAGE_OFFSET - kernel_map.phys_addr;
- kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr;
- /*
- * The default maximal physical memory size is KERN_VIRT_SIZE for 32-bit
- * kernel, whereas for 64-bit kernel, the end of the virtual address
- * space is occupied by the modules/BPF/kernel mappings which reduces
- * the available size of the linear mapping.
- */
- memory_limit = KERN_VIRT_SIZE - (IS_ENABLED(CONFIG_64BIT) ? SZ_4G : 0);
+ memory_limit = KERN_VIRT_SIZE;
/* Sanity check alignment and size */
BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0);
@@ -1123,7 +1171,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
* makes the kernel cross over a PUD_SIZE boundary, raise a bug
* since a part of the kernel would not get mapped.
*/
- BUG_ON(PUD_SIZE - (kernel_map.virt_addr & (PUD_SIZE - 1)) < kernel_map.size);
+ if (IS_ENABLED(CONFIG_64BIT))
+ BUG_ON(PUD_SIZE - (kernel_map.virt_addr & (PUD_SIZE - 1)) < kernel_map.size);
relocate_kernel();
#endif
@@ -1210,9 +1259,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
pt_ops_set_fixmap();
}
-static void __init create_linear_mapping_range(phys_addr_t start,
- phys_addr_t end,
- uintptr_t fixed_map_size)
+static void __meminit create_linear_mapping_range(phys_addr_t start, phys_addr_t end,
+ uintptr_t fixed_map_size, const pgprot_t *pgprot)
{
phys_addr_t pa;
uintptr_t va, map_size;
@@ -1223,7 +1271,7 @@ static void __init create_linear_mapping_range(phys_addr_t start,
best_map_size(pa, va, end - pa);
create_pgd_mapping(swapper_pg_dir, va, pa, map_size,
- pgprot_from_va(va));
+ pgprot ? *pgprot : pgprot_from_va(va));
}
}
@@ -1264,25 +1312,20 @@ static void __init create_linear_mapping_page_table(void)
if (start <= __pa(PAGE_OFFSET) &&
__pa(PAGE_OFFSET) < end)
start = __pa(PAGE_OFFSET);
- if (end >= __pa(PAGE_OFFSET) + memory_limit)
- end = __pa(PAGE_OFFSET) + memory_limit;
- create_linear_mapping_range(start, end, 0);
+ create_linear_mapping_range(start, end, 0, NULL);
}
#ifdef CONFIG_STRICT_KERNEL_RWX
- create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0);
- create_linear_mapping_range(krodata_start,
- krodata_start + krodata_size, 0);
+ create_linear_mapping_range(ktext_start, ktext_start + ktext_size, 0, NULL);
+ create_linear_mapping_range(krodata_start, krodata_start + krodata_size, 0, NULL);
memblock_clear_nomap(ktext_start, ktext_size);
memblock_clear_nomap(krodata_start, krodata_size);
#endif
#ifdef CONFIG_KFENCE
- create_linear_mapping_range(kfence_pool,
- kfence_pool + KFENCE_POOL_SIZE,
- PAGE_SIZE);
+ create_linear_mapping_range(kfence_pool, kfence_pool + KFENCE_POOL_SIZE, PAGE_SIZE, NULL);
memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
#endif
@@ -1333,6 +1376,12 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
{
dtb_early_va = (void *)dtb_pa;
dtb_early_pa = dtb_pa;
+
+#ifdef CONFIG_RELOCATABLE
+ kernel_map.virt_addr = (uintptr_t)_start;
+ kernel_map.phys_addr = (uintptr_t)_start;
+ relocate_kernel();
+#endif
}
static inline void setup_vm_final(void)
@@ -1351,21 +1400,19 @@ static void __init arch_reserve_crashkernel(void)
{
unsigned long long low_size = 0;
unsigned long long crash_base, crash_size;
- char *cmdline = boot_command_line;
bool high = false;
int ret;
if (!IS_ENABLED(CONFIG_CRASH_RESERVE))
return;
- ret = parse_crashkernel(cmdline, memblock_phys_mem_size(),
+ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
&crash_size, &crash_base,
&low_size, &high);
if (ret)
return;
- reserve_crashkernel_generic(cmdline, crash_size, crash_base,
- low_size, high);
+ reserve_crashkernel_generic(crash_size, crash_base, low_size, high);
}
void __init paging_init(void)
@@ -1414,7 +1461,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
* memory hotplug, we are not able to update all the page tables with
* the new PMDs.
*/
- return vmemmap_populate_hugepages(start, end, node, NULL);
+ return vmemmap_populate_hugepages(start, end, node, altmap);
}
#endif
@@ -1468,10 +1515,320 @@ failed:
panic("Failed to pre-allocate %s pages for %s area\n", lvl, area);
}
+#define PAGE_END KASAN_SHADOW_START
+
void __init pgtable_cache_init(void)
{
preallocate_pgd_pages_range(VMALLOC_START, VMALLOC_END, "vmalloc");
if (IS_ENABLED(CONFIG_MODULES))
preallocate_pgd_pages_range(MODULES_VADDR, MODULES_END, "bpf/modules");
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
+ preallocate_pgd_pages_range(VMEMMAP_START, VMEMMAP_END, "vmemmap");
+ preallocate_pgd_pages_range(PAGE_OFFSET, PAGE_END, "direct map");
+ if (IS_ENABLED(CONFIG_KASAN))
+ preallocate_pgd_pages_range(KASAN_SHADOW_START, KASAN_SHADOW_END, "kasan");
+ }
}
#endif
+
+#ifdef CONFIG_EXECMEM
+#ifdef CONFIG_MMU
+static struct execmem_info execmem_info __ro_after_init;
+
+struct execmem_info __init *execmem_arch_setup(void)
+{
+ execmem_info = (struct execmem_info){
+ .ranges = {
+ [EXECMEM_DEFAULT] = {
+ .start = MODULES_VADDR,
+ .end = MODULES_END,
+ .pgprot = PAGE_KERNEL,
+ .alignment = 1,
+ },
+ [EXECMEM_KPROBES] = {
+ .start = VMALLOC_START,
+ .end = VMALLOC_END,
+ .pgprot = PAGE_KERNEL_READ_EXEC,
+ .alignment = 1,
+ },
+ [EXECMEM_BPF] = {
+ .start = BPF_JIT_REGION_START,
+ .end = BPF_JIT_REGION_END,
+ .pgprot = PAGE_KERNEL,
+ .alignment = PAGE_SIZE,
+ },
+ },
+ };
+
+ return &execmem_info;
+}
+#endif /* CONFIG_MMU */
+#endif /* CONFIG_EXECMEM */
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+ struct page *page = pmd_page(*pmd);
+ struct ptdesc *ptdesc = page_ptdesc(page);
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (!pte_none(*pte))
+ return;
+ }
+
+ pagetable_dtor(ptdesc);
+ if (PageReserved(page))
+ free_reserved_page(page);
+ else
+ pagetable_free(ptdesc);
+ pmd_clear(pmd);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud, bool is_vmemmap)
+{
+ struct page *page = pud_page(*pud);
+ struct ptdesc *ptdesc = page_ptdesc(page);
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (!pmd_none(*pmd))
+ return;
+ }
+
+ if (!is_vmemmap)
+ pagetable_dtor(ptdesc);
+ if (PageReserved(page))
+ free_reserved_page(page);
+ else
+ pagetable_free(ptdesc);
+ pud_clear(pud);
+}
+
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+ struct page *page = p4d_page(*p4d);
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (!pud_none(*pud))
+ return;
+ }
+
+ if (PageReserved(page))
+ free_reserved_page(page);
+ else
+ free_pages((unsigned long)page_address(page), 0);
+ p4d_clear(p4d);
+}
+
+static void __meminit free_vmemmap_storage(struct page *page, size_t size,
+ struct vmem_altmap *altmap)
+{
+ int order = get_order(size);
+
+ if (altmap) {
+ vmem_altmap_free(altmap, size >> PAGE_SHIFT);
+ return;
+ }
+
+ if (PageReserved(page)) {
+ unsigned int nr_pages = 1 << order;
+
+ while (nr_pages--)
+ free_reserved_page(page++);
+ return;
+ }
+
+ free_pages((unsigned long)page_address(page), order);
+}
+
+static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end,
+ bool is_vmemmap, struct vmem_altmap *altmap)
+{
+ unsigned long next;
+ pte_t *ptep, pte;
+
+ for (; addr < end; addr = next) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ ptep = pte_base + pte_index(addr);
+ pte = ptep_get(ptep);
+ if (!pte_present(*ptep))
+ continue;
+
+ pte_clear(&init_mm, addr, ptep);
+ if (is_vmemmap)
+ free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
+ }
+}
+
+static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end,
+ bool is_vmemmap, struct vmem_altmap *altmap)
+{
+ unsigned long next;
+ pte_t *pte_base;
+ pmd_t *pmdp, pmd;
+
+ for (; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
+ pmdp = pmd_base + pmd_index(addr);
+ pmd = pmdp_get(pmdp);
+ if (!pmd_present(pmd))
+ continue;
+
+ if (pmd_leaf(pmd)) {
+ pmd_clear(pmdp);
+ if (is_vmemmap)
+ free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, altmap);
+ continue;
+ }
+
+ pte_base = (pte_t *)pmd_page_vaddr(*pmdp);
+ remove_pte_mapping(pte_base, addr, next, is_vmemmap, altmap);
+ free_pte_table(pte_base, pmdp);
+ }
+}
+
+static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, unsigned long end,
+ bool is_vmemmap, struct vmem_altmap *altmap)
+{
+ unsigned long next;
+ pud_t *pudp, pud;
+ pmd_t *pmd_base;
+
+ for (; addr < end; addr = next) {
+ next = pud_addr_end(addr, end);
+ pudp = pud_base + pud_index(addr);
+ pud = pudp_get(pudp);
+ if (!pud_present(pud))
+ continue;
+
+ if (pud_leaf(pud)) {
+ if (pgtable_l4_enabled) {
+ pud_clear(pudp);
+ if (is_vmemmap)
+ free_vmemmap_storage(pud_page(pud), PUD_SIZE, altmap);
+ }
+ continue;
+ }
+
+ pmd_base = pmd_offset(pudp, 0);
+ remove_pmd_mapping(pmd_base, addr, next, is_vmemmap, altmap);
+
+ if (pgtable_l4_enabled)
+ free_pmd_table(pmd_base, pudp, is_vmemmap);
+ }
+}
+
+static void __meminit remove_p4d_mapping(p4d_t *p4d_base, unsigned long addr, unsigned long end,
+ bool is_vmemmap, struct vmem_altmap *altmap)
+{
+ unsigned long next;
+ p4d_t *p4dp, p4d;
+ pud_t *pud_base;
+
+ for (; addr < end; addr = next) {
+ next = p4d_addr_end(addr, end);
+ p4dp = p4d_base + p4d_index(addr);
+ p4d = p4dp_get(p4dp);
+ if (!p4d_present(p4d))
+ continue;
+
+ if (p4d_leaf(p4d)) {
+ if (pgtable_l5_enabled) {
+ p4d_clear(p4dp);
+ if (is_vmemmap)
+ free_vmemmap_storage(p4d_page(p4d), P4D_SIZE, altmap);
+ }
+ continue;
+ }
+
+ pud_base = pud_offset(p4dp, 0);
+ remove_pud_mapping(pud_base, addr, next, is_vmemmap, altmap);
+
+ if (pgtable_l5_enabled)
+ free_pud_table(pud_base, p4dp);
+ }
+}
+
+static void __meminit remove_pgd_mapping(unsigned long va, unsigned long end, bool is_vmemmap,
+ struct vmem_altmap *altmap)
+{
+ unsigned long addr, next;
+ p4d_t *p4d_base;
+ pgd_t *pgd;
+
+ for (addr = va; addr < end; addr = next) {
+ next = pgd_addr_end(addr, end);
+ pgd = pgd_offset_k(addr);
+
+ if (!pgd_present(*pgd))
+ continue;
+
+ if (pgd_leaf(*pgd))
+ continue;
+
+ p4d_base = p4d_offset(pgd, 0);
+ remove_p4d_mapping(p4d_base, addr, next, is_vmemmap, altmap);
+ }
+
+ flush_tlb_all();
+}
+
+static void __meminit remove_linear_mapping(phys_addr_t start, u64 size)
+{
+ unsigned long va = (unsigned long)__va(start);
+ unsigned long end = (unsigned long)__va(start + size);
+
+ remove_pgd_mapping(va, end, false, NULL);
+}
+
+struct range arch_get_mappable_range(void)
+{
+ struct range mhp_range;
+
+ mhp_range.start = __pa(PAGE_OFFSET);
+ mhp_range.end = __pa(PAGE_END - 1);
+ return mhp_range;
+}
+
+int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
+{
+ int ret = 0;
+
+ create_linear_mapping_range(start, start + size, 0, &params->pgprot);
+ ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
+ if (ret) {
+ remove_linear_mapping(start, size);
+ goto out;
+ }
+
+ max_pfn = PFN_UP(start + size);
+ max_low_pfn = max_pfn;
+
+ out:
+ flush_tlb_all();
+ return ret;
+}
+
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+{
+ __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
+ remove_linear_mapping(start, size);
+ flush_tlb_all();
+}
+
+void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
+{
+ remove_pgd_mapping(start, end, true, altmap);
+}
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
index c301c8d291d2..41c635d6aca4 100644
--- a/arch/riscv/mm/kasan_init.c
+++ b/arch/riscv/mm/kasan_init.c
@@ -32,7 +32,7 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned
pte_t *ptep, *p;
if (pmd_none(pmdp_get(pmd))) {
- p = memblock_alloc(PTRS_PER_PTE * sizeof(pte_t), PAGE_SIZE);
+ p = memblock_alloc_or_panic(PTRS_PER_PTE * sizeof(pte_t), PAGE_SIZE);
set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(p)), PAGE_TABLE));
}
@@ -54,7 +54,7 @@ static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned
unsigned long next;
if (pud_none(pudp_get(pud))) {
- p = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE);
+ p = memblock_alloc_or_panic(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE);
set_pud(pud, pfn_pud(PFN_DOWN(__pa(p)), PAGE_TABLE));
}
@@ -85,7 +85,7 @@ static void __init kasan_populate_pud(p4d_t *p4d,
unsigned long next;
if (p4d_none(p4dp_get(p4d))) {
- p = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE);
+ p = memblock_alloc_or_panic(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE);
set_p4d(p4d, pfn_p4d(PFN_DOWN(__pa(p)), PAGE_TABLE));
}
@@ -116,7 +116,7 @@ static void __init kasan_populate_p4d(pgd_t *pgd,
unsigned long next;
if (pgd_none(pgdp_get(pgd))) {
- p = memblock_alloc(PTRS_PER_P4D * sizeof(p4d_t), PAGE_SIZE);
+ p = memblock_alloc_or_panic(PTRS_PER_P4D * sizeof(p4d_t), PAGE_SIZE);
set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE));
}
@@ -385,7 +385,7 @@ static void __init kasan_shallow_populate_pud(p4d_t *p4d,
next = pud_addr_end(vaddr, end);
if (pud_none(pudp_get(pud_k))) {
- p = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+ p = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
set_pud(pud_k, pfn_pud(PFN_DOWN(__pa(p)), PAGE_TABLE));
continue;
}
@@ -405,7 +405,7 @@ static void __init kasan_shallow_populate_p4d(pgd_t *pgd,
next = p4d_addr_end(vaddr, end);
if (p4d_none(p4dp_get(p4d_k))) {
- p = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+ p = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
set_p4d(p4d_k, pfn_p4d(PFN_DOWN(__pa(p)), PAGE_TABLE));
continue;
}
@@ -424,7 +424,7 @@ static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long
next = pgd_addr_end(vaddr, end);
if (pgd_none(pgdp_get(pgd_k))) {
- p = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+ p = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE);
set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE));
continue;
}
diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c
index 410056a50aa9..d815448758a1 100644
--- a/arch/riscv/mm/pageattr.c
+++ b/arch/riscv/mm/pageattr.c
@@ -386,18 +386,49 @@ int set_direct_map_default_noflush(struct page *page)
PAGE_KERNEL, __pgprot(_PAGE_EXEC));
}
+int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid)
+{
+ pgprot_t set, clear;
+
+ if (valid) {
+ set = PAGE_KERNEL;
+ clear = __pgprot(_PAGE_EXEC);
+ } else {
+ set = __pgprot(0);
+ clear = __pgprot(_PAGE_PRESENT);
+ }
+
+ return __set_memory((unsigned long)page_address(page), nr, set, clear);
+}
+
#ifdef CONFIG_DEBUG_PAGEALLOC
+static int debug_pagealloc_set_page(pte_t *pte, unsigned long addr, void *data)
+{
+ int enable = *(int *)data;
+
+ unsigned long val = pte_val(ptep_get(pte));
+
+ if (enable)
+ val |= _PAGE_PRESENT;
+ else
+ val &= ~_PAGE_PRESENT;
+
+ set_pte(pte, __pte(val));
+
+ return 0;
+}
+
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
if (!debug_pagealloc_enabled())
return;
- if (enable)
- __set_memory((unsigned long)page_address(page), numpages,
- __pgprot(_PAGE_PRESENT), __pgprot(0));
- else
- __set_memory((unsigned long)page_address(page), numpages,
- __pgprot(0), __pgprot(_PAGE_PRESENT));
+ unsigned long start = (unsigned long)page_address(page);
+ unsigned long size = PAGE_SIZE * numpages;
+
+ apply_to_existing_page_range(&init_mm, start, size, debug_pagealloc_set_page, &enable);
+
+ flush_tlb_kernel_range(start, start + size);
}
#endif
diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c
index ef887efcb679..8b6c0a112a8d 100644
--- a/arch/riscv/mm/pgtable.c
+++ b/arch/riscv/mm/pgtable.c
@@ -9,13 +9,26 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
pte_t entry, int dirty)
{
+ asm goto(ALTERNATIVE("nop", "j %l[svvptc]", 0, RISCV_ISA_EXT_SVVPTC, 1)
+ : : : : svvptc);
+
if (!pte_same(ptep_get(ptep), entry))
- __set_pte_at(ptep, entry);
+ __set_pte_at(vma->vm_mm, ptep, entry);
/*
* update_mmu_cache will unconditionally execute, handling both
* the case that the PTE changed and the spurious fault case.
*/
return true;
+
+svvptc:
+ if (!pte_same(ptep_get(ptep), entry)) {
+ __set_pte_at(vma->vm_mm, ptep, entry);
+ /* Here only not svadu is impacted */
+ flush_tlb_page(vma, address);
+ return true;
+ }
+
+ return false;
}
int ptep_test_and_clear_young(struct vm_area_struct *vma,
@@ -141,4 +154,14 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma,
flush_tlb_mm(vma->vm_mm);
return pmd;
}
+
+pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address,
+ pud_t *pudp)
+{
+ VM_WARN_ON_ONCE(!pud_present(*pudp));
+ pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp));
+
+ flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
+ return old;
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
diff --git a/arch/riscv/mm/physaddr.c b/arch/riscv/mm/physaddr.c
index 18706f457da7..559d291fac5c 100644
--- a/arch/riscv/mm/physaddr.c
+++ b/arch/riscv/mm/physaddr.c
@@ -12,7 +12,7 @@ phys_addr_t __virt_to_phys(unsigned long x)
* Boundary checking aginst the kernel linear mapping space.
*/
WARN(!is_linear_mapping(x) && !is_kernel_mapping(x),
- "virt_to_phys used for non-linear address: %pK (%pS)\n",
+ "virt_to_phys used for non-linear address: %p (%pS)\n",
(void *)x, (void *)x);
return __va_to_pa_nodebug(x);
diff --git a/arch/riscv/mm/ptdump.c b/arch/riscv/mm/ptdump.c
index 1289cc6d3700..32922550a50a 100644
--- a/arch/riscv/mm/ptdump.c
+++ b/arch/riscv/mm/ptdump.c
@@ -6,6 +6,7 @@
#include <linux/efi.h>
#include <linux/init.h>
#include <linux/debugfs.h>
+#include <linux/memory_hotplug.h>
#include <linux/seq_file.h>
#include <linux/ptdump.h>
@@ -317,6 +318,38 @@ static void note_page(struct ptdump_state *pt_st, unsigned long addr,
}
}
+static void note_page_pte(struct ptdump_state *pt_st, unsigned long addr, pte_t pte)
+{
+ note_page(pt_st, addr, 4, pte_val(pte));
+}
+
+static void note_page_pmd(struct ptdump_state *pt_st, unsigned long addr, pmd_t pmd)
+{
+ note_page(pt_st, addr, 3, pmd_val(pmd));
+}
+
+static void note_page_pud(struct ptdump_state *pt_st, unsigned long addr, pud_t pud)
+{
+ note_page(pt_st, addr, 2, pud_val(pud));
+}
+
+static void note_page_p4d(struct ptdump_state *pt_st, unsigned long addr, p4d_t p4d)
+{
+ note_page(pt_st, addr, 1, p4d_val(p4d));
+}
+
+static void note_page_pgd(struct ptdump_state *pt_st, unsigned long addr, pgd_t pgd)
+{
+ note_page(pt_st, addr, 0, pgd_val(pgd));
+}
+
+static void note_page_flush(struct ptdump_state *pt_st)
+{
+ pte_t pte_zero = {0};
+
+ note_page(pt_st, 0, -1, pte_val(pte_zero));
+}
+
static void ptdump_walk(struct seq_file *s, struct ptd_mm_info *pinfo)
{
struct pg_state st = {
@@ -324,7 +357,12 @@ static void ptdump_walk(struct seq_file *s, struct ptd_mm_info *pinfo)
.marker = pinfo->markers,
.level = -1,
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = (struct ptdump_range[]) {
{pinfo->base_addr, pinfo->end},
{0, 0}
@@ -346,7 +384,12 @@ bool ptdump_check_wx(void)
.level = -1,
.check_wx = true,
.ptdump = {
- .note_page = note_page,
+ .note_page_pte = note_page_pte,
+ .note_page_pmd = note_page_pmd,
+ .note_page_pud = note_page_pud,
+ .note_page_p4d = note_page_p4d,
+ .note_page_pgd = note_page_pgd,
+ .note_page_flush = note_page_flush,
.range = (struct ptdump_range[]) {
{KERN_VIRT_START, ULONG_MAX},
{0, 0}
@@ -370,7 +413,9 @@ bool ptdump_check_wx(void)
static int ptdump_show(struct seq_file *m, void *v)
{
+ get_online_mems();
ptdump_walk(m, m->private);
+ put_online_mems();
return 0;
}
diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
index 893566e004b7..e737ba7949b1 100644
--- a/arch/riscv/mm/tlbflush.c
+++ b/arch/riscv/mm/tlbflush.c
@@ -4,37 +4,36 @@
#include <linux/smp.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
+#include <linux/mmu_notifier.h>
#include <asm/sbi.h>
#include <asm/mmu_context.h>
+#include <asm/cpufeature.h>
-static inline void local_flush_tlb_all_asid(unsigned long asid)
+#define has_svinval() riscv_has_extension_unlikely(RISCV_ISA_EXT_SVINVAL)
+
+static inline void local_sfence_inval_ir(void)
{
- if (asid != FLUSH_TLB_NO_ASID)
- __asm__ __volatile__ ("sfence.vma x0, %0"
- :
- : "r" (asid)
- : "memory");
- else
- local_flush_tlb_all();
+ asm volatile(SFENCE_INVAL_IR() ::: "memory");
}
-static inline void local_flush_tlb_page_asid(unsigned long addr,
- unsigned long asid)
+static inline void local_sfence_w_inval(void)
+{
+ asm volatile(SFENCE_W_INVAL() ::: "memory");
+}
+
+static inline void local_sinval_vma(unsigned long vma, unsigned long asid)
{
if (asid != FLUSH_TLB_NO_ASID)
- __asm__ __volatile__ ("sfence.vma %0, %1"
- :
- : "r" (addr), "r" (asid)
- : "memory");
+ asm volatile(SINVAL_VMA(%0, %1) : : "r" (vma), "r" (asid) : "memory");
else
- local_flush_tlb_page(addr);
+ asm volatile(SINVAL_VMA(%0, zero) : : "r" (vma) : "memory");
}
/*
* Flush entire TLB if number of entries to be flushed is greater
* than the threshold below.
*/
-static unsigned long tlb_flush_all_threshold __read_mostly = 64;
+unsigned long tlb_flush_all_threshold __read_mostly = 64;
static void local_flush_tlb_range_threshold_asid(unsigned long start,
unsigned long size,
@@ -49,6 +48,16 @@ static void local_flush_tlb_range_threshold_asid(unsigned long start,
return;
}
+ if (has_svinval()) {
+ local_sfence_w_inval();
+ for (i = 0; i < nr_ptes_in_range; ++i) {
+ local_sinval_vma(start, asid);
+ start += stride;
+ }
+ local_sfence_inval_ir();
+ return;
+ }
+
for (i = 0; i < nr_ptes_in_range; ++i) {
local_flush_tlb_page_asid(start, asid);
start += stride;
@@ -79,10 +88,12 @@ static void __ipi_flush_tlb_all(void *info)
void flush_tlb_all(void)
{
- if (riscv_use_ipi_for_rfence())
- on_each_cpu(__ipi_flush_tlb_all, NULL, 1);
- else
+ if (num_online_cpus() < 2)
+ local_flush_tlb_all();
+ else if (riscv_use_sbi_for_rfence())
sbi_remote_sfence_vma_asid(NULL, 0, FLUSH_TLB_MAX_SIZE, FLUSH_TLB_NO_ASID);
+ else
+ on_each_cpu(__ipi_flush_tlb_all, NULL, 1);
}
struct flush_tlb_range_data {
@@ -99,69 +110,60 @@ static void __ipi_flush_tlb_range_asid(void *info)
local_flush_tlb_range_asid(d->start, d->size, d->stride, d->asid);
}
-static void __flush_tlb_range(struct cpumask *cmask, unsigned long asid,
+static inline unsigned long get_mm_asid(struct mm_struct *mm)
+{
+ return mm ? cntx2asid(atomic_long_read(&mm->context.id)) : FLUSH_TLB_NO_ASID;
+}
+
+static void __flush_tlb_range(struct mm_struct *mm,
+ const struct cpumask *cmask,
unsigned long start, unsigned long size,
unsigned long stride)
{
- struct flush_tlb_range_data ftd;
- bool broadcast;
+ unsigned long asid = get_mm_asid(mm);
+ unsigned int cpu;
if (cpumask_empty(cmask))
return;
- if (cmask != cpu_online_mask) {
- unsigned int cpuid;
+ cpu = get_cpu();
- cpuid = get_cpu();
- /* check if the tlbflush needs to be sent to other CPUs */
- broadcast = cpumask_any_but(cmask, cpuid) < nr_cpu_ids;
+ /* Check if the TLB flush needs to be sent to other CPUs. */
+ if (cpumask_any_but(cmask, cpu) >= nr_cpu_ids) {
+ local_flush_tlb_range_asid(start, size, stride, asid);
+ } else if (riscv_use_sbi_for_rfence()) {
+ sbi_remote_sfence_vma_asid(cmask, start, size, asid);
} else {
- broadcast = true;
- }
+ struct flush_tlb_range_data ftd;
- if (broadcast) {
- if (riscv_use_ipi_for_rfence()) {
- ftd.asid = asid;
- ftd.start = start;
- ftd.size = size;
- ftd.stride = stride;
- on_each_cpu_mask(cmask,
- __ipi_flush_tlb_range_asid,
- &ftd, 1);
- } else
- sbi_remote_sfence_vma_asid(cmask,
- start, size, asid);
- } else {
- local_flush_tlb_range_asid(start, size, stride, asid);
+ ftd.asid = asid;
+ ftd.start = start;
+ ftd.size = size;
+ ftd.stride = stride;
+ on_each_cpu_mask(cmask, __ipi_flush_tlb_range_asid, &ftd, 1);
}
- if (cmask != cpu_online_mask)
- put_cpu();
-}
+ put_cpu();
-static inline unsigned long get_mm_asid(struct mm_struct *mm)
-{
- return static_branch_unlikely(&use_asid_allocator) ?
- atomic_long_read(&mm->context.id) & asid_mask : FLUSH_TLB_NO_ASID;
+ if (mm)
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, start + size);
}
void flush_tlb_mm(struct mm_struct *mm)
{
- __flush_tlb_range(mm_cpumask(mm), get_mm_asid(mm),
- 0, FLUSH_TLB_MAX_SIZE, PAGE_SIZE);
+ __flush_tlb_range(mm, mm_cpumask(mm), 0, FLUSH_TLB_MAX_SIZE, PAGE_SIZE);
}
void flush_tlb_mm_range(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned int page_size)
{
- __flush_tlb_range(mm_cpumask(mm), get_mm_asid(mm),
- start, end - start, page_size);
+ __flush_tlb_range(mm, mm_cpumask(mm), start, end - start, page_size);
}
void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
{
- __flush_tlb_range(mm_cpumask(vma->vm_mm), get_mm_asid(vma->vm_mm),
+ __flush_tlb_range(vma->vm_mm, mm_cpumask(vma->vm_mm),
addr, PAGE_SIZE, PAGE_SIZE);
}
@@ -194,13 +196,13 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
}
}
- __flush_tlb_range(mm_cpumask(vma->vm_mm), get_mm_asid(vma->vm_mm),
+ __flush_tlb_range(vma->vm_mm, mm_cpumask(vma->vm_mm),
start, end - start, stride_size);
}
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- __flush_tlb_range((struct cpumask *)cpu_online_mask, FLUSH_TLB_NO_ASID,
+ __flush_tlb_range(NULL, cpu_online_mask,
start, end - start, PAGE_SIZE);
}
@@ -208,9 +210,16 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
- __flush_tlb_range(mm_cpumask(vma->vm_mm), get_mm_asid(vma->vm_mm),
+ __flush_tlb_range(vma->vm_mm, mm_cpumask(vma->vm_mm),
start, end - start, PMD_SIZE);
}
+
+void flush_pud_tlb_range(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end)
+{
+ __flush_tlb_range(vma->vm_mm, mm_cpumask(vma->vm_mm),
+ start, end - start, PUD_SIZE);
+}
#endif
bool arch_tlbbatch_should_defer(struct mm_struct *mm)
@@ -219,10 +228,10 @@ bool arch_tlbbatch_should_defer(struct mm_struct *mm)
}
void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch,
- struct mm_struct *mm,
- unsigned long uaddr)
+ struct mm_struct *mm, unsigned long start, unsigned long end)
{
cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+ mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end);
}
void arch_flush_tlb_batched_pending(struct mm_struct *mm)
@@ -232,7 +241,7 @@ void arch_flush_tlb_batched_pending(struct mm_struct *mm)
void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
{
- __flush_tlb_range(&batch->cpumask, FLUSH_TLB_NO_ASID, 0,
- FLUSH_TLB_MAX_SIZE, PAGE_SIZE);
+ __flush_tlb_range(NULL, &batch->cpumask,
+ 0, FLUSH_TLB_MAX_SIZE, PAGE_SIZE);
cpumask_clear(&batch->cpumask);
}