aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/mm')
-rw-r--r--arch/x86/mm/Makefile5
-rw-r--r--arch/x86/mm/extable.c108
-rw-r--r--arch/x86/mm/fault.c150
-rw-r--r--arch/x86/mm/gup.c47
-rw-r--r--arch/x86/mm/mpx.c8
-rw-r--r--arch/x86/mm/pageattr.c34
-rw-r--r--arch/x86/mm/pat.c2
-rw-r--r--arch/x86/mm/pkeys.c101
8 files changed, 291 insertions, 164 deletions
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index f9d38a48e3c8..f98913258c63 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,3 +1,6 @@
+# Kernel does not boot with instrumentation of tlb.c.
+KCOV_INSTRUMENT_tlb.o := n
+
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
pat.o pgtable.o physaddr.o gup.o setup_nx.o
@@ -34,3 +37,5 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
+obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
+
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 9dd7e4b7fcde..82447b3fba38 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -1,17 +1,10 @@
#include <linux/module.h>
-#include <linux/spinlock.h>
-#include <linux/sort.h>
#include <asm/uaccess.h>
typedef bool (*ex_handler_t)(const struct exception_table_entry *,
struct pt_regs *, int);
static inline unsigned long
-ex_insn_addr(const struct exception_table_entry *x)
-{
- return (unsigned long)&x->insn + x->insn;
-}
-static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
return (unsigned long)&x->fixup + x->fixup;
@@ -110,104 +103,3 @@ int __init early_fixup_exception(unsigned long *ip)
*ip = new_ip;
return 1;
}
-
-/*
- * Search one exception table for an entry corresponding to the
- * given instruction address, and return the address of the entry,
- * or NULL if none is found.
- * We use a binary search, and thus we assume that the table is
- * already sorted.
- */
-const struct exception_table_entry *
-search_extable(const struct exception_table_entry *first,
- const struct exception_table_entry *last,
- unsigned long value)
-{
- while (first <= last) {
- const struct exception_table_entry *mid;
- unsigned long addr;
-
- mid = ((last - first) >> 1) + first;
- addr = ex_insn_addr(mid);
- if (addr < value)
- first = mid + 1;
- else if (addr > value)
- last = mid - 1;
- else
- return mid;
- }
- return NULL;
-}
-
-/*
- * The exception table needs to be sorted so that the binary
- * search that we use to find entries in it works properly.
- * This is used both for the kernel exception table and for
- * the exception tables of modules that get loaded.
- *
- */
-static int cmp_ex(const void *a, const void *b)
-{
- const struct exception_table_entry *x = a, *y = b;
-
- /*
- * This value will always end up fittin in an int, because on
- * both i386 and x86-64 the kernel symbol-reachable address
- * space is < 2 GiB.
- *
- * This compare is only valid after normalization.
- */
- return x->insn - y->insn;
-}
-
-void sort_extable(struct exception_table_entry *start,
- struct exception_table_entry *finish)
-{
- struct exception_table_entry *p;
- int i;
-
- /* Convert all entries to being relative to the start of the section */
- i = 0;
- for (p = start; p < finish; p++) {
- p->insn += i;
- i += 4;
- p->fixup += i;
- i += 4;
- p->handler += i;
- i += 4;
- }
-
- sort(start, finish - start, sizeof(struct exception_table_entry),
- cmp_ex, NULL);
-
- /* Denormalize all entries */
- i = 0;
- for (p = start; p < finish; p++) {
- p->insn -= i;
- i += 4;
- p->fixup -= i;
- i += 4;
- p->handler -= i;
- i += 4;
- }
-}
-
-#ifdef CONFIG_MODULES
-/*
- * If the exception table is sorted, any referring to the module init
- * will be at the beginning or the end.
- */
-void trim_init_extable(struct module *m)
-{
- /*trim the beginning*/
- while (m->num_exentries &&
- within_module_init(ex_insn_addr(&m->extable[0]), m)) {
- m->extable++;
- m->num_exentries--;
- }
- /*trim the end*/
- while (m->num_exentries &&
- within_module_init(ex_insn_addr(&m->extable[m->num_exentries-1]), m))
- m->num_exentries--;
-}
-#endif /* CONFIG_MODULES */
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 03898aea6e0f..5ce1ed02f7e8 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -15,12 +15,14 @@
#include <linux/context_tracking.h> /* exception_enter(), ... */
#include <linux/uaccess.h> /* faulthandler_disabled() */
+#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
+#include <asm/mmu_context.h> /* vma_pkey() */
#define CREATE_TRACE_POINTS
#include <asm/trace/exceptions.h>
@@ -33,6 +35,7 @@
* bit 2 == 0: kernel-mode access 1: user-mode access
* bit 3 == 1: use of reserved bit detected
* bit 4 == 1: fault was an instruction fetch
+ * bit 5 == 1: protection keys block access
*/
enum x86_pf_error_code {
@@ -41,6 +44,7 @@ enum x86_pf_error_code {
PF_USER = 1 << 2,
PF_RSVD = 1 << 3,
PF_INSTR = 1 << 4,
+ PF_PK = 1 << 5,
};
/*
@@ -167,9 +171,60 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
return prefetch;
}
+/*
+ * A protection key fault means that the PKRU value did not allow
+ * access to some PTE. Userspace can figure out what PKRU was
+ * from the XSAVE state, and this function fills out a field in
+ * siginfo so userspace can discover which protection key was set
+ * on the PTE.
+ *
+ * If we get here, we know that the hardware signaled a PF_PK
+ * fault and that there was a VMA once we got in the fault
+ * handler. It does *not* guarantee that the VMA we find here
+ * was the one that we faulted on.
+ *
+ * 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
+ * 2. T1 : set PKRU to deny access to pkey=4, touches page
+ * 3. T1 : faults...
+ * 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
+ * 5. T1 : enters fault handler, takes mmap_sem, etc...
+ * 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
+ * faulted on a pte with its pkey=4.
+ */
+static void fill_sig_info_pkey(int si_code, siginfo_t *info,
+ struct vm_area_struct *vma)
+{
+ /* This is effectively an #ifdef */
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return;
+
+ /* Fault not from Protection Keys: nothing to do */
+ if (si_code != SEGV_PKUERR)
+ return;
+ /*
+ * force_sig_info_fault() is called from a number of
+ * contexts, some of which have a VMA and some of which
+ * do not. The PF_PK handing happens after we have a
+ * valid VMA, so we should never reach this without a
+ * valid VMA.
+ */
+ if (!vma) {
+ WARN_ONCE(1, "PKU fault with no VMA passed in");
+ info->si_pkey = 0;
+ return;
+ }
+ /*
+ * si_pkey should be thought of as a strong hint, but not
+ * absolutely guranteed to be 100% accurate because of
+ * the race explained above.
+ */
+ info->si_pkey = vma_pkey(vma);
+}
+
static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
- struct task_struct *tsk, int fault)
+ struct task_struct *tsk, struct vm_area_struct *vma,
+ int fault)
{
unsigned lsb = 0;
siginfo_t info;
@@ -184,6 +239,8 @@ force_sig_info_fault(int si_signo, int si_code, unsigned long address,
lsb = PAGE_SHIFT;
info.si_addr_lsb = lsb;
+ fill_sig_info_pkey(si_code, &info, vma);
+
force_sig_info(si_signo, &info, tsk);
}
@@ -661,6 +718,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
struct task_struct *tsk = current;
unsigned long flags;
int sig;
+ /* No context means no VMA to pass down */
+ struct vm_area_struct *vma = NULL;
/* Are we prepared to handle this kernel fault? */
if (fixup_exception(regs, X86_TRAP_PF)) {
@@ -684,7 +743,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,
tsk->thread.cr2 = address;
/* XXX: hwpoison faults will set the wrong code. */
- force_sig_info_fault(signal, si_code, address, tsk, 0);
+ force_sig_info_fault(signal, si_code, address,
+ tsk, vma, 0);
}
/*
@@ -761,7 +821,8 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
static void
__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, int si_code)
+ unsigned long address, struct vm_area_struct *vma,
+ int si_code)
{
struct task_struct *tsk = current;
@@ -804,7 +865,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_PF;
- force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
+ force_sig_info_fault(SIGSEGV, si_code, address, tsk, vma, 0);
return;
}
@@ -817,14 +878,14 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
static noinline void
bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+ unsigned long address, struct vm_area_struct *vma)
{
- __bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
+ __bad_area_nosemaphore(regs, error_code, address, vma, SEGV_MAPERR);
}
static void
__bad_area(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, int si_code)
+ unsigned long address, struct vm_area_struct *vma, int si_code)
{
struct mm_struct *mm = current->mm;
@@ -834,25 +895,50 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
*/
up_read(&mm->mmap_sem);
- __bad_area_nosemaphore(regs, error_code, address, si_code);
+ __bad_area_nosemaphore(regs, error_code, address, vma, si_code);
}
static noinline void
bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
- __bad_area(regs, error_code, address, SEGV_MAPERR);
+ __bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
+}
+
+static inline bool bad_area_access_from_pkeys(unsigned long error_code,
+ struct vm_area_struct *vma)
+{
+ /* This code is always called on the current mm */
+ bool foreign = false;
+
+ if (!boot_cpu_has(X86_FEATURE_OSPKE))
+ return false;
+ if (error_code & PF_PK)
+ return true;
+ /* this checks permission keys on the VMA: */
+ if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
+ (error_code & PF_INSTR), foreign))
+ return true;
+ return false;
}
static noinline void
bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address)
+ unsigned long address, struct vm_area_struct *vma)
{
- __bad_area(regs, error_code, address, SEGV_ACCERR);
+ /*
+ * This OSPKE check is not strictly necessary at runtime.
+ * But, doing it this way allows compiler optimizations
+ * if pkeys are compiled out.
+ */
+ if (bad_area_access_from_pkeys(error_code, vma))
+ __bad_area(regs, error_code, address, vma, SEGV_PKUERR);
+ else
+ __bad_area(regs, error_code, address, vma, SEGV_ACCERR);
}
static void
do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
- unsigned int fault)
+ struct vm_area_struct *vma, unsigned int fault)
{
struct task_struct *tsk = current;
int code = BUS_ADRERR;
@@ -879,12 +965,13 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
code = BUS_MCEERR_AR;
}
#endif
- force_sig_info_fault(SIGBUS, code, address, tsk, fault);
+ force_sig_info_fault(SIGBUS, code, address, tsk, vma, fault);
}
static noinline void
mm_fault_error(struct pt_regs *regs, unsigned long error_code,
- unsigned long address, unsigned int fault)
+ unsigned long address, struct vm_area_struct *vma,
+ unsigned int fault)
{
if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
no_context(regs, error_code, address, 0, 0);
@@ -908,9 +995,9 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
- do_sigbus(regs, error_code, address, fault);
+ do_sigbus(regs, error_code, address, vma, fault);
else if (fault & VM_FAULT_SIGSEGV)
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, vma);
else
BUG();
}
@@ -923,6 +1010,12 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
if ((error_code & PF_INSTR) && !pte_exec(*pte))
return 0;
+ /*
+ * Note: We do not do lazy flushing on protection key
+ * changes, so no spurious fault will ever set PF_PK.
+ */
+ if ((error_code & PF_PK))
+ return 1;
return 1;
}
@@ -1012,6 +1105,17 @@ int show_unhandled_signals = 1;
static inline int
access_error(unsigned long error_code, struct vm_area_struct *vma)
{
+ /* This is only called for the current mm, so: */
+ bool foreign = false;
+ /*
+ * Make sure to check the VMA so that we do not perform
+ * faults just to hit a PF_PK as soon as we fill in a
+ * page.
+ */
+ if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
+ (error_code & PF_INSTR), foreign))
+ return 1;
+
if (error_code & PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
@@ -1118,7 +1222,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* Don't take the mm semaphore here. If we fixup a prefetch
* fault we could otherwise deadlock:
*/
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@@ -1131,7 +1235,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
pgtable_bad(regs, error_code, address);
if (unlikely(smap_violation(error_code, regs))) {
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@@ -1140,7 +1244,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* in a region with pagefaults disabled then we must not take the fault
*/
if (unlikely(faulthandler_disabled() || !mm)) {
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
@@ -1164,6 +1268,8 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (error_code & PF_WRITE)
flags |= FAULT_FLAG_WRITE;
+ if (error_code & PF_INSTR)
+ flags |= FAULT_FLAG_INSTRUCTION;
/*
* When running in the kernel we expect faults to occur only to
@@ -1184,7 +1290,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if ((error_code & PF_USER) == 0 &&
!search_exception_tables(regs->ip)) {
- bad_area_nosemaphore(regs, error_code, address);
+ bad_area_nosemaphore(regs, error_code, address, NULL);
return;
}
retry:
@@ -1232,7 +1338,7 @@ retry:
*/
good_area:
if (unlikely(access_error(error_code, vma))) {
- bad_area_access_error(regs, error_code, address);
+ bad_area_access_error(regs, error_code, address, vma);
return;
}
@@ -1270,7 +1376,7 @@ good_area:
up_read(&mm->mmap_sem);
if (unlikely(fault & VM_FAULT_ERROR)) {
- mm_fault_error(regs, error_code, address, fault);
+ mm_fault_error(regs, error_code, address, vma, fault);
return;
}
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index d8a798d8bf50..b8b6a60b32cf 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -11,6 +11,7 @@
#include <linux/swap.h>
#include <linux/memremap.h>
+#include <asm/mmu_context.h>
#include <asm/pgtable.h>
static inline pte_t gup_get_pte(pte_t *ptep)
@@ -75,6 +76,28 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
}
/*
+ * 'pteval' can come from a pte, pmd or pud. We only check
+ * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the
+ * same value on all 3 types.
+ */
+static inline int pte_allows_gup(unsigned long pteval, int write)
+{
+ unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER;
+
+ if (write)
+ need_pte_bits |= _PAGE_RW;
+
+ if ((pteval & need_pte_bits) != need_pte_bits)
+ return 0;
+
+ /* Check memory protection keys permissions. */
+ if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write))
+ return 0;
+
+ return 1;
+}
+
+/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
* register pressure.
@@ -83,14 +106,9 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
- unsigned long mask;
int nr_start = *nr;
pte_t *ptep;
- mask = _PAGE_PRESENT|_PAGE_USER;
- if (write)
- mask |= _PAGE_RW;
-
ptep = pte_offset_map(&pmd, addr);
do {
pte_t pte = gup_get_pte(ptep);
@@ -109,7 +127,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
pte_unmap(ptep);
return 0;
}
- } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+ } else if (!pte_allows_gup(pte_val(pte), write) ||
+ pte_special(pte)) {
pte_unmap(ptep);
return 0;
}
@@ -131,7 +150,7 @@ static inline void get_head_page_multiple(struct page *page, int nr)
{
VM_BUG_ON_PAGE(page != compound_head(page), page);
VM_BUG_ON_PAGE(page_count(page) == 0, page);
- atomic_add(nr, &page->_count);
+ page_ref_add(page, nr);
SetPageReferenced(page);
}
@@ -164,14 +183,10 @@ static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- unsigned long mask;
struct page *head, *page;
int refs;
- mask = _PAGE_PRESENT|_PAGE_USER;
- if (write)
- mask |= _PAGE_RW;
- if ((pmd_flags(pmd) & mask) != mask)
+ if (!pte_allows_gup(pmd_val(pmd), write))
return 0;
VM_BUG_ON(!pfn_valid(pmd_pfn(pmd)));
@@ -231,14 +246,10 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
- unsigned long mask;
struct page *head, *page;
int refs;
- mask = _PAGE_PRESENT|_PAGE_USER;
- if (write)
- mask |= _PAGE_RW;
- if ((pud_flags(pud) & mask) != mask)
+ if (!pte_allows_gup(pud_val(pud), write))
return 0;
/* hugepages are never "special" */
VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL);
@@ -422,7 +433,7 @@ slow_irqon:
start += nr << PAGE_SHIFT;
pages += nr;
- ret = get_user_pages_unlocked(current, mm, start,
+ ret = get_user_pages_unlocked(start,
(end - start) >> PAGE_SHIFT,
write, 0, pages);
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index ef05755a1900..80476878eb4c 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -546,8 +546,8 @@ static int mpx_resolve_fault(long __user *addr, int write)
int nr_pages = 1;
int force = 0;
- gup_ret = get_user_pages(current, current->mm, (unsigned long)addr,
- nr_pages, write, force, NULL, NULL);
+ gup_ret = get_user_pages((unsigned long)addr, nr_pages, write,
+ force, NULL, NULL);
/*
* get_user_pages() returns number of pages gotten.
* 0 means we failed to fault in and get anything,
@@ -728,14 +728,14 @@ static inline unsigned long bd_entry_virt_space(struct mm_struct *mm)
/*
* This covers 32-bit emulation as well as 32-bit kernels
- * running on 64-bit harware.
+ * running on 64-bit hardware.
*/
if (!is_64bit_mm(mm))
return (4ULL * GB) / MPX_BD_NR_ENTRIES_32;
/*
* 'x86_virt_bits' returns what the hardware is capable
- * of, and returns the full >32-bit adddress space when
+ * of, and returns the full >32-bit address space when
* running 32-bit kernels on 64-bit hardware.
*/
virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 4d0b26253042..01be9ec3bf79 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -909,16 +909,25 @@ static void populate_pte(struct cpa_data *cpa,
pte = pte_offset_kernel(pmd, start);
- while (num_pages-- && start < end) {
+ /*
+ * Set the GLOBAL flags only if the PRESENT flag is
+ * set otherwise pte_present will return true even on
+ * a non present pte. The canon_pgprot will clear
+ * _PAGE_GLOBAL for the ancient hardware that doesn't
+ * support it.
+ */
+ if (pgprot_val(pgprot) & _PAGE_PRESENT)
+ pgprot_val(pgprot) |= _PAGE_GLOBAL;
+ else
+ pgprot_val(pgprot) &= ~_PAGE_GLOBAL;
- /* deal with the NX bit */
- if (!(pgprot_val(pgprot) & _PAGE_NX))
- cpa->pfn &= ~_PAGE_NX;
+ pgprot = canon_pgprot(pgprot);
- set_pte(pte, pfn_pte(cpa->pfn >> PAGE_SHIFT, pgprot));
+ while (num_pages-- && start < end) {
+ set_pte(pte, pfn_pte(cpa->pfn, pgprot));
start += PAGE_SIZE;
- cpa->pfn += PAGE_SIZE;
+ cpa->pfn++;
pte++;
}
}
@@ -974,11 +983,11 @@ static int populate_pmd(struct cpa_data *cpa,
pmd = pmd_offset(pud, start);
- set_pmd(pmd, __pmd(cpa->pfn | _PAGE_PSE |
+ set_pmd(pmd, __pmd(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
massage_pgprot(pmd_pgprot)));
start += PMD_SIZE;
- cpa->pfn += PMD_SIZE;
+ cpa->pfn += PMD_SIZE >> PAGE_SHIFT;
cur_pages += PMD_SIZE >> PAGE_SHIFT;
}
@@ -1046,12 +1055,12 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd,
/*
* Map everything starting from the Gb boundary, possibly with 1G pages
*/
- while (end - start >= PUD_SIZE) {
- set_pud(pud, __pud(cpa->pfn | _PAGE_PSE |
+ while (cpu_has_gbpages && end - start >= PUD_SIZE) {
+ set_pud(pud, __pud(cpa->pfn << PAGE_SHIFT | _PAGE_PSE |
massage_pgprot(pud_pgprot)));
start += PUD_SIZE;
- cpa->pfn += PUD_SIZE;
+ cpa->pfn += PUD_SIZE >> PAGE_SHIFT;
cur_pages += PUD_SIZE >> PAGE_SHIFT;
pud++;
}
@@ -1964,6 +1973,9 @@ int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
if (!(page_flags & _PAGE_NX))
cpa.mask_clr = __pgprot(_PAGE_NX);
+ if (!(page_flags & _PAGE_RW))
+ cpa.mask_clr = __pgprot(_PAGE_RW);
+
cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags);
retval = __change_page_attr_set_clr(&cpa, 0);
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 04e2e7144bee..faec01e7a17d 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -149,7 +149,7 @@ enum {
PAT_WT = 4, /* Write Through */
PAT_WP = 5, /* Write Protected */
PAT_WB = 6, /* Write Back (default) */
- PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
+ PAT_UC_MINUS = 7, /* UC, but can be overridden by MTRR */
};
#define CM(c) (_PAGE_CACHE_MODE_ ## c)
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
new file mode 100644
index 000000000000..e8c474451928
--- /dev/null
+++ b/arch/x86/mm/pkeys.c
@@ -0,0 +1,101 @@
+/*
+ * Intel Memory Protection Keys management
+ * Copyright (c) 2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+#include <linux/mm_types.h> /* mm_struct, vma, etc... */
+#include <linux/pkeys.h> /* PKEY_* */
+#include <uapi/asm-generic/mman-common.h>
+
+#include <asm/cpufeature.h> /* boot_cpu_has, ... */
+#include <asm/mmu_context.h> /* vma_pkey() */
+#include <asm/fpu/internal.h> /* fpregs_active() */
+
+int __execute_only_pkey(struct mm_struct *mm)
+{
+ int ret;
+
+ /*
+ * We do not want to go through the relatively costly
+ * dance to set PKRU if we do not need to. Check it
+ * first and assume that if the execute-only pkey is
+ * write-disabled that we do not have to set it
+ * ourselves. We need preempt off so that nobody
+ * can make fpregs inactive.
+ */
+ preempt_disable();
+ if (fpregs_active() &&
+ !__pkru_allows_read(read_pkru(), PKEY_DEDICATED_EXECUTE_ONLY)) {
+ preempt_enable();
+ return PKEY_DEDICATED_EXECUTE_ONLY;
+ }
+ preempt_enable();
+ ret = arch_set_user_pkey_access(current, PKEY_DEDICATED_EXECUTE_ONLY,
+ PKEY_DISABLE_ACCESS);
+ /*
+ * If the PKRU-set operation failed somehow, just return
+ * 0 and effectively disable execute-only support.
+ */
+ if (ret)
+ return 0;
+
+ return PKEY_DEDICATED_EXECUTE_ONLY;
+}
+
+static inline bool vma_is_pkey_exec_only(struct vm_area_struct *vma)
+{
+ /* Do this check first since the vm_flags should be hot */
+ if ((vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) != VM_EXEC)
+ return false;
+ if (vma_pkey(vma) != PKEY_DEDICATED_EXECUTE_ONLY)
+ return false;
+
+ return true;
+}
+
+/*
+ * This is only called for *plain* mprotect calls.
+ */
+int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey)
+{
+ /*
+ * Is this an mprotect_pkey() call? If so, never
+ * override the value that came from the user.
+ */
+ if (pkey != -1)
+ return pkey;
+ /*
+ * Look for a protection-key-drive execute-only mapping
+ * which is now being given permissions that are not
+ * execute-only. Move it back to the default pkey.
+ */
+ if (vma_is_pkey_exec_only(vma) &&
+ (prot & (PROT_READ|PROT_WRITE))) {
+ return 0;
+ }
+ /*
+ * The mapping is execute-only. Go try to get the
+ * execute-only protection key. If we fail to do that,
+ * fall through as if we do not have execute-only
+ * support.
+ */
+ if (prot == PROT_EXEC) {
+ pkey = execute_only_pkey(vma->vm_mm);
+ if (pkey > 0)
+ return pkey;
+ }
+ /*
+ * This is a vanilla, non-pkey mprotect (or we failed to
+ * setup execute-only), inherit the pkey from the VMA we
+ * are working on.
+ */
+ return vma_pkey(vma);
+}