Diffstat (limited to 'arch/x86/mm/init.c')
-rw-r--r--  arch/x86/mm/init.c  158
1 file changed, 122 insertions, 36 deletions
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index e7bb483557c9..9121bc1b9453 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -25,12 +25,12 @@
#include <asm/cpufeature.h>
#include <asm/pti.h>
#include <asm/text-patching.h>
+#include <asm/memtype.h>
/*
* We need to define the tracepoints somewhere, and tlb.c
- * is only compied when SMP=y.
+ * is only compiled when SMP=y.
*/
-#define CREATE_TRACE_POINTS
#include <trace/events/tlb.h>
#include "mm_internal.h"
@@ -49,7 +49,7 @@
* Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
* (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
*/
-uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
+static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
[_PAGE_CACHE_MODE_WB ] = 0 | 0 ,
[_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD,
@@ -57,9 +57,16 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
[_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD,
[_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD,
};
-EXPORT_SYMBOL(__cachemode2pte_tbl);
-uint8_t __pte2cachemode_tbl[8] = {
+unsigned long cachemode2protval(enum page_cache_mode pcm)
+{
+ if (likely(pcm == 0))
+ return 0;
+ return __cachemode2pte_tbl[pcm];
+}
+EXPORT_SYMBOL(cachemode2protval);
+
+static uint8_t __pte2cachemode_tbl[8] = {
[__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB,
[__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS,
@@ -69,7 +76,32 @@ uint8_t __pte2cachemode_tbl[8] = {
[__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
};
-EXPORT_SYMBOL(__pte2cachemode_tbl);
+
+/*
+ * Check that the write-protect PAT entry is set for write-protect.
+ * To do this without making assumptions how PAT has been set up (Xen has
+ * another layout than the kernel), translate the _PAGE_CACHE_MODE_WP cache
+ * mode via the __cachemode2pte_tbl[] into protection bits (those protection
+ * bits will select a cache mode of WP or better), and then translate the
+ * protection bits back into the cache mode using __pte2cm_idx() and the
+ * __pte2cachemode_tbl[] array. This will return the really used cache mode.
+ */
+bool x86_has_pat_wp(void)
+{
+ uint16_t prot = __cachemode2pte_tbl[_PAGE_CACHE_MODE_WP];
+
+ return __pte2cachemode_tbl[__pte2cm_idx(prot)] == _PAGE_CACHE_MODE_WP;
+}
+
+enum page_cache_mode pgprot2cachemode(pgprot_t pgprot)
+{
+ unsigned long masked;
+
+ masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK;
+ if (likely(masked == 0))
+ return 0;
+ return __pte2cachemode_tbl[__pte2cm_idx(masked)];
+}
static unsigned long __initdata pgt_buf_start;
static unsigned long __initdata pgt_buf_end;
@@ -104,14 +136,12 @@ __ref void *alloc_low_pages(unsigned int num)
unsigned long ret = 0;
if (min_pfn_mapped < max_pfn_mapped) {
- ret = memblock_find_in_range(
+ ret = memblock_phys_alloc_range(
+ PAGE_SIZE * num, PAGE_SIZE,
min_pfn_mapped << PAGE_SHIFT,
- max_pfn_mapped << PAGE_SHIFT,
- PAGE_SIZE * num , PAGE_SIZE);
+ max_pfn_mapped << PAGE_SHIFT);
}
- if (ret)
- memblock_reserve(ret, PAGE_SIZE * num);
- else if (can_use_brk_pgt)
+ if (!ret && can_use_brk_pgt)
ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));
if (!ret)
@@ -121,8 +151,6 @@ __ref void *alloc_low_pages(unsigned int num)
} else {
pfn = pgt_buf_end;
pgt_buf_end += num;
- printk(KERN_DEBUG "BRK [%#010lx, %#010lx] PGTABLE\n",
- pfn << PAGE_SHIFT, (pgt_buf_end << PAGE_SHIFT) - 1);
}
for (i = 0; i < num; i++) {
@@ -136,16 +164,25 @@ __ref void *alloc_low_pages(unsigned int num)
}
/*
- * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS.
- * With KASLR memory randomization, depending on the machine e820 memory
- * and the PUD alignment. We may need twice more pages when KASLR memory
+ * By default need to be able to allocate page tables below PGD firstly for
+ * the 0-ISA_END_ADDRESS range and secondly for the initial PMD_SIZE mapping.
+ * With KASLR memory randomization, depending on the machine e820 memory and the
+ * PUD alignment, twice that many pages may be needed when KASLR memory
* randomization is enabled.
*/
+
+#ifndef CONFIG_X86_5LEVEL
+#define INIT_PGD_PAGE_TABLES 3
+#else
+#define INIT_PGD_PAGE_TABLES 4
+#endif
+
#ifndef CONFIG_RANDOMIZE_MEMORY
-#define INIT_PGD_PAGE_COUNT 6
+#define INIT_PGD_PAGE_COUNT (2 * INIT_PGD_PAGE_TABLES)
#else
-#define INIT_PGD_PAGE_COUNT 12
+#define INIT_PGD_PAGE_COUNT (4 * INIT_PGD_PAGE_TABLES)
#endif
+
#define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE)
RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
void __init early_alloc_pgt_buf(void)
@@ -172,6 +209,19 @@ struct map_range {
static int page_size_mask;
+/*
+ * Save some of cr4 feature set we're using (e.g. Pentium 4MB
+ * enable and PPro Global page enable), so that any CPU's that boot
+ * up after us can get the correct flags. Invoked on the boot CPU.
+ */
+static inline void cr4_set_bits_and_update_boot(unsigned long mask)
+{
+ mmu_cr4_features |= mask;
+ if (trampoline_cr4_features)
+ *trampoline_cr4_features = mmu_cr4_features;
+ cr4_set_bits(mask);
+}
+
static void __init probe_page_size_mask(void)
{
/*
@@ -467,7 +517,7 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn)
* the physical memory. To access them they are temporarily mapped.
*/
unsigned long __ref init_memory_mapping(unsigned long start,
- unsigned long end)
+ unsigned long end, pgprot_t prot)
{
struct map_range mr[NR_RANGE_MR];
unsigned long ret = 0;
@@ -481,7 +531,8 @@ unsigned long __ref init_memory_mapping(unsigned long start,
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
- mr[i].page_size_mask);
+ mr[i].page_size_mask,
+ prot);
add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
@@ -521,7 +572,7 @@ static unsigned long __init init_range_memory_mapping(
*/
can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
- init_memory_mapping(start, end);
+ init_memory_mapping(start, end, PAGE_KERNEL);
mapped_ram_size += end - start;
can_use_brk_pgt = true;
}
@@ -561,20 +612,29 @@ static unsigned long __init get_new_step_size(unsigned long step_size)
static void __init memory_map_top_down(unsigned long map_start,
unsigned long map_end)
{
- unsigned long real_end, start, last_start;
+ unsigned long real_end, last_start;
unsigned long step_size;
unsigned long addr;
unsigned long mapped_ram_size = 0;
- /* xen has big range in reserved near end of ram, skip it at first.*/
- addr = memblock_find_in_range(map_start, map_end, PMD_SIZE, PMD_SIZE);
+ /*
+ * Systems that have many reserved areas near top of the memory,
+ * e.g. QEMU with less than 1G RAM and EFI enabled, or Xen, will
+ * require lots of 4K mappings which may exhaust pgt_buf.
+ * Start with top-most PMD_SIZE range aligned at PMD_SIZE to ensure
+ * there is enough mapped memory that can be allocated from
+ * memblock.
+ */
+ addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start,
+ map_end);
+ memblock_phys_free(addr, PMD_SIZE);
real_end = addr + PMD_SIZE;
/* step_size need to be small so pgt_buf from BRK could cover it */
step_size = PMD_SIZE;
max_pfn_mapped = 0; /* will get exact value next */
min_pfn_mapped = real_end >> PAGE_SHIFT;
- last_start = start = real_end;
+ last_start = real_end;
/*
* We start from the top (end of memory) and go to the bottom.
@@ -583,6 +643,8 @@ static void __init memory_map_top_down(unsigned long map_start,
* for page table.
*/
while (last_start > map_start) {
+ unsigned long start;
+
if (last_start > step_size) {
start = round_down(last_start - 1, step_size);
if (start < map_start)
@@ -646,6 +708,33 @@ static void __init memory_map_bottom_up(unsigned long map_start,
}
}
+/*
+ * The real mode trampoline, which is required for bootstrapping CPUs
+ * occupies only a small area under the low 1MB. See reserve_real_mode()
+ * for details.
+ *
+ * If KASLR is disabled the first PGD entry of the direct mapping is copied
+ * to map the real mode trampoline.
+ *
+ * If KASLR is enabled, copy only the PUD which covers the low 1MB
+ * area. This limits the randomization granularity to 1GB for both 4-level
+ * and 5-level paging.
+ */
+static void __init init_trampoline(void)
+{
+#ifdef CONFIG_X86_64
+ /*
+ * The code below will alias kernel page-tables in the user-range of the
+ * address space, including the Global bit. So global TLB entries will
+ * be created when using the trampoline page-table.
+ */
+ if (!kaslr_memory_enabled())
+ trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)];
+ else
+ init_trampoline_kaslr();
+#endif
+}
+
void __init init_mem_mapping(void)
{
unsigned long end;
@@ -661,7 +750,7 @@ void __init init_mem_mapping(void)
#endif
/* the ISA range is always mapped regardless of memory holes */
- init_memory_mapping(0, ISA_END_ADDRESS);
+ init_memory_mapping(0, ISA_END_ADDRESS, PAGE_KERNEL);
/* Init the trampoline, possibly with KASLR memory offset */
init_trampoline();
@@ -688,7 +777,7 @@ void __init init_mem_mapping(void)
#ifdef CONFIG_X86_64
if (max_pfn > max_low_pfn) {
- /* can we preseve max_low_pfn ?*/
+ /* can we preserve max_low_pfn ?*/
max_low_pfn = max_pfn;
}
#else
@@ -767,7 +856,7 @@ int devmem_is_allowed(unsigned long pagenr)
/*
* This must follow RAM test, since System RAM is considered a
- * restricted resource under CONFIG_STRICT_IOMEM.
+ * restricted resource under CONFIG_STRICT_DEVMEM.
*/
if (iomem_is_exclusive(pagenr << PAGE_SHIFT)) {
/* Low 1MB bypasses iomem restrictions. */
@@ -856,8 +945,6 @@ void free_kernel_image_pages(const char *what, void *begin, void *end)
set_memory_np_noalias(begin_ul, len_pages);
}
-void __weak mem_encrypt_free_decrypted_mem(void) { }
-
void __ref free_initmem(void)
{
e820__reallocate_tables();
@@ -873,7 +960,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
{
/*
* end could be not aligned, and We can not align that,
- * decompresser could be confused by aligned initrd_end
+ * decompressor could be confused by aligned initrd_end
* We already reserve the end partial page before in
* - i386_start_kernel()
* - x86_64_start_kernel()
@@ -948,15 +1035,14 @@ void __init zone_sizes_init(void)
max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
#endif
- free_area_init_nodes(max_zone_pfns);
+ free_area_init(max_zone_pfns);
}
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
.next_asid = 1,
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
-EXPORT_PER_CPU_SYMBOL(cpu_tlbstate);
void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
{
@@ -968,7 +1054,7 @@ void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache)
}
#ifdef CONFIG_SWAP
-unsigned long max_swapfile_size(void)
+unsigned long arch_max_swapfile_size(void)
{
unsigned long pages;