diff options
Diffstat (limited to 'arch/powerpc')
92 files changed, 2093 insertions, 1103 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 97a8bc8a095c..9ff731f50a29 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -586,7 +586,7 @@ config ARCH_SPARSEMEM_ENABLE config ARCH_SPARSEMEM_DEFAULT def_bool y - depends on (SMP && PPC_PSERIES) || PPC_PS3 + depends on PPC_BOOK3S_64 config SYS_SUPPORTS_HUGETLBFS bool @@ -678,6 +678,16 @@ config PPC_256K_PAGES endchoice +config THREAD_SHIFT + int "Thread shift" if EXPERT + range 13 15 + default "15" if PPC_256K_PAGES + default "14" if PPC64 + default "13" + help + Used to define the stack size. The default is almost always what you + want. Only change this if you know what you are doing. + config FORCE_MAX_ZONEORDER int "Maximum zone order" range 8 9 if PPC64 && PPC_64K_PAGES diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index ac8b8332ed82..0695ce047d56 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -33,7 +33,7 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_BPF_SYSCALL=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y -CONFIG_OPROFILE=y +CONFIG_OPROFILE=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_MODULES=y @@ -261,7 +261,7 @@ CONFIG_NILFS2_FS=m CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=m CONFIG_OVERLAY_FS=m -CONFIG_ISO9660_FS=m +CONFIG_ISO9660_FS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=m @@ -306,7 +306,7 @@ CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPT_CRC32C_VPMSUM=m +CONFIG_CRYPTO_CRC32C_VPMSUM=m CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_SHA256=y diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 4f1288b04303..e353168f98a7 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -19,7 +19,7 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_BPF_SYSCALL=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y -CONFIG_OPROFILE=y +CONFIG_OPROFILE=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_MODULES=y @@ -291,7 +291,7 @@ CONFIG_NILFS2_FS=m CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=m CONFIG_OVERLAY_FS=m -CONFIG_ISO9660_FS=m +CONFIG_ISO9660_FS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=m @@ -340,7 +340,7 @@ CONFIG_PPC_EARLY_DEBUG=y CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPT_CRC32C_VPMSUM=m +CONFIG_CRYPTO_CRC32C_VPMSUM=m CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_SHA256=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 4ff68b752618..1a61aa20dfba 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -34,7 +34,7 @@ CONFIG_BLK_DEV_INITRD=y CONFIG_BPF_SYSCALL=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y -CONFIG_OPROFILE=y +CONFIG_OPROFILE=m CONFIG_KPROBES=y CONFIG_JUMP_LABEL=y CONFIG_MODULES=y @@ -259,7 +259,7 @@ CONFIG_NILFS2_FS=m CONFIG_AUTOFS4_FS=m CONFIG_FUSE_FS=m CONFIG_OVERLAY_FS=m -CONFIG_ISO9660_FS=m +CONFIG_ISO9660_FS=y CONFIG_UDF_FS=m CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=m @@ -303,7 +303,7 @@ CONFIG_XMON=y CONFIG_CRYPTO_TEST=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPT_CRC32C_VPMSUM=m +CONFIG_CRYPTO_CRC32C_VPMSUM=m CONFIG_CRYPTO_MD5_PPC=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_SHA256=y diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index f6c5264287e5..7330150bfe34 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -17,6 +17,8 @@ #include <asm/checksum.h> #include <linux/uaccess.h> #include <asm/epapr_hcalls.h> +#include <asm/dcr.h> +#include <asm/mmu_context.h> #include <uapi/asm/ucontext.h> @@ -120,6 +122,8 @@ extern s64 __ashrdi3(s64, int); extern int __cmpdi2(s64, s64); extern int __ucmpdi2(u64, u64); +/* tracing */ void _mcount(void); +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip); #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 0c4e470571ca..b4b5e6b671ca 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -8,7 +8,7 @@ #define H_PTE_INDEX_SIZE 9 #define H_PMD_INDEX_SIZE 7 #define H_PUD_INDEX_SIZE 9 -#define H_PGD_INDEX_SIZE 9 +#define H_PGD_INDEX_SIZE 12 #ifndef __ASSEMBLY__ #define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE) diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index f3dd21efa2ea..214219dff87c 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -4,10 +4,14 @@ #define H_PTE_INDEX_SIZE 8 #define H_PMD_INDEX_SIZE 5 #define H_PUD_INDEX_SIZE 5 -#define H_PGD_INDEX_SIZE 12 +#define H_PGD_INDEX_SIZE 15 -#define H_PAGE_COMBO 0x00001000 /* this is a combo 4k page */ -#define H_PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */ +/* + * 64k aligned address free up few of the lower bits of RPN for us + * We steal that here. For more deatils look at pte_pfn/pfn_pte() + */ +#define H_PAGE_COMBO _RPAGE_RPN0 /* this is a combo 4k page */ +#define H_PAGE_4K_PFN _RPAGE_RPN1 /* PFN is for a single 4k page */ /* * We need to differentiate between explicit huge page and THP huge * page, since THP huge page also need to track real subpage details diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index f7b721bbf918..4e957b027fe0 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -6,19 +6,13 @@ * Common bits between 4K and 64K pages in a linux-style PTE. * Additional bits may be defined in pgtable-hash64-*.h * - * Note: We only support user read/write permissions. Supervisor always - * have full read/write to pages above PAGE_OFFSET (pages below that - * always use the user access permissions). - * - * We could create separate kernel read-only if we used the 3 PP bits - * combinations that newer processors provide but we currently don't. */ -#define H_PAGE_BUSY 0x00800 /* software: PTE & hash are busy */ #define H_PTE_NONE_MASK _PAGE_HPTEFLAGS -#define H_PAGE_F_GIX_SHIFT 57 -#define H_PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */ -#define H_PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */ -#define H_PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */ +#define H_PAGE_F_GIX_SHIFT 56 +#define H_PAGE_BUSY _RPAGE_RSV1 /* software: PTE & hash are busy */ +#define H_PAGE_F_SECOND _RPAGE_RSV2 /* HPTE is in 2ndary HPTEG */ +#define H_PAGE_F_GIX (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44) +#define H_PAGE_HASHPTE _RPAGE_RPN43 /* PTE has associated HPTE */ #ifdef CONFIG_PPC_64K_PAGES #include <asm/book3s/64/hash-64k.h> diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h index c62f14d0bec1..6666cd366596 100644 --- a/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -46,7 +46,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, */ VM_WARN_ON(page_shift == mmu_psize_defs[MMU_PAGE_1G].shift); if (page_shift == mmu_psize_defs[MMU_PAGE_2M].shift) - return __pte(pte_val(entry) | _PAGE_LARGE); + return __pte(pte_val(entry) | R_PAGE_LARGE); else return entry; } diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 52d8d1e4b772..6d56974adf28 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -39,6 +39,7 @@ /* Bits in the SLB VSID word */ #define SLB_VSID_SHIFT 12 +#define SLB_VSID_SHIFT_256M SLB_VSID_SHIFT #define SLB_VSID_SHIFT_1T 24 #define SLB_VSID_SSIZE_SHIFT 62 #define SLB_VSID_B ASM_CONST(0xc000000000000000) @@ -408,7 +409,7 @@ static inline unsigned long hpt_vpn(unsigned long ea, static inline unsigned long hpt_hash(unsigned long vpn, unsigned int shift, int ssize) { - int mask; + unsigned long mask; unsigned long hash, vsid; /* VPN_SHIFT can be atmost 12 */ @@ -491,13 +492,14 @@ extern void slb_set_size(u16 size); * We first generate a 37-bit "proto-VSID". Proto-VSIDs are generated * from mmu context id and effective segment id of the address. * - * For user processes max context id is limited to ((1ul << 19) - 5) - * for kernel space, we use the top 4 context ids to map address as below + * For user processes max context id is limited to MAX_USER_CONTEXT. + + * For kernel space, we use context ids 1-5 to map address as below: * NOTE: each context only support 64TB now. - * 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ] - * 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ] - * 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ] - * 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ] + * 0x00001 - [ 0xc000000000000000 - 0xc0003fffffffffff ] + * 0x00002 - [ 0xd000000000000000 - 0xd0003fffffffffff ] + * 0x00003 - [ 0xe000000000000000 - 0xe0003fffffffffff ] + * 0x00004 - [ 0xf000000000000000 - 0xf0003fffffffffff ] * * The proto-VSIDs are then scrambled into real VSIDs with the * multiplicative hash: @@ -511,20 +513,28 @@ extern void slb_set_size(u16 size); * robust scattering in the hash table (at least based on some initial * results). * - * We also consider VSID 0 special. We use VSID 0 for slb entries mapping - * bad address. This enables us to consolidate bad address handling in - * hash_page. + * We use VSID 0 to indicate an invalid VSID. The means we can't use context id + * 0, because a context id of 0 and an EA of 0 gives a proto-VSID of 0, which + * will produce a VSID of 0. * * We also need to avoid the last segment of the last context, because that * would give a protovsid of 0x1fffffffff. That will result in a VSID 0 - * because of the modulo operation in vsid scramble. But the vmemmap - * (which is what uses region 0xf) will never be close to 64TB in size - * (it's 56 bytes per page of system memory). + * because of the modulo operation in vsid scramble. */ +/* + * Max Va bits we support as of now is 68 bits. We want 19 bit + * context ID. + * Restrictions: + * GPU has restrictions of not able to access beyond 128TB + * (47 bit effective address). We also cannot do more than 20bit PID. + * For p4 and p5 which can only do 65 bit VA, we restrict our CONTEXT_BITS + * to 16 bits (ie, we can only have 2^16 pids at the same time). + */ +#define VA_BITS 68 #define CONTEXT_BITS 19 -#define ESID_BITS 18 -#define ESID_BITS_1T 6 +#define ESID_BITS (VA_BITS - (SID_SHIFT + CONTEXT_BITS)) +#define ESID_BITS_1T (VA_BITS - (SID_SHIFT_1T + CONTEXT_BITS)) #define ESID_BITS_MASK ((1 << ESID_BITS) - 1) #define ESID_BITS_1T_MASK ((1 << ESID_BITS_1T) - 1) @@ -532,63 +542,70 @@ extern void slb_set_size(u16 size); /* * 256MB segment * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments - * available for user + kernel mapping. The top 4 contexts are used for - * kernel mapping. Each segment contains 2^28 bytes. Each - * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts - * (19 == 37 + 28 - 46). + * available for user + kernel mapping. VSID 0 is reserved as invalid, contexts + * 1-4 are used for kernel mapping. Each segment contains 2^28 bytes. Each + * context maps 2^49 bytes (512TB). + * + * We also need to avoid the last segment of the last context, because that + * would give a protovsid of 0x1fffffffff. That will result in a VSID 0 + * because of the modulo operation in vsid scramble. + */ +#define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 2) +#define MIN_USER_CONTEXT (5) + +/* Would be nice to use KERNEL_REGION_ID here */ +#define KERNEL_REGION_CONTEXT_OFFSET (0xc - 1) + +/* + * For platforms that support on 65bit VA we limit the context bits */ -#define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 5) +#define MAX_USER_CONTEXT_65BIT_VA ((ASM_CONST(1) << (65 - (SID_SHIFT + ESID_BITS))) - 2) /* * This should be computed such that protovosid * vsid_mulitplier - * doesn't overflow 64 bits. It should also be co-prime to vsid_modulus + * doesn't overflow 64 bits. The vsid_mutliplier should also be + * co-prime to vsid_modulus. We also need to make sure that number + * of bits in multiplied result (dividend) is less than twice the number of + * protovsid bits for our modulus optmization to work. + * + * The below table shows the current values used. + * |-------+------------+----------------------+------------+-------------------| + * | | Prime Bits | proto VSID_BITS_65VA | Total Bits | 2* prot VSID_BITS | + * |-------+------------+----------------------+------------+-------------------| + * | 1T | 24 | 25 | 49 | 50 | + * |-------+------------+----------------------+------------+-------------------| + * | 256MB | 24 | 37 | 61 | 74 | + * |-------+------------+----------------------+------------+-------------------| + * + * |-------+------------+----------------------+------------+--------------------| + * | | Prime Bits | proto VSID_BITS_68VA | Total Bits | 2* proto VSID_BITS | + * |-------+------------+----------------------+------------+--------------------| + * | 1T | 24 | 28 | 52 | 56 | + * |-------+------------+----------------------+------------+--------------------| + * | 256MB | 24 | 40 | 64 | 80 | + * |-------+------------+----------------------+------------+--------------------| + * */ #define VSID_MULTIPLIER_256M ASM_CONST(12538073) /* 24-bit prime */ -#define VSID_BITS_256M (CONTEXT_BITS + ESID_BITS) -#define VSID_MODULUS_256M ((1UL<<VSID_BITS_256M)-1) +#define VSID_BITS_256M (VA_BITS - SID_SHIFT) +#define VSID_BITS_65_256M (65 - SID_SHIFT) +/* + * Modular multiplicative inverse of VSID_MULTIPLIER under modulo VSID_MODULUS + */ +#define VSID_MULINV_256M ASM_CONST(665548017062) #define VSID_MULTIPLIER_1T ASM_CONST(12538073) /* 24-bit prime */ -#define VSID_BITS_1T (CONTEXT_BITS + ESID_BITS_1T) -#define VSID_MODULUS_1T ((1UL<<VSID_BITS_1T)-1) - +#define VSID_BITS_1T (VA_BITS - SID_SHIFT_1T) +#define VSID_BITS_65_1T (65 - SID_SHIFT_1T) +#define VSID_MULINV_1T ASM_CONST(209034062) +/* 1TB VSID reserved for VRMA */ +#define VRMA_VSID 0x1ffffffUL #define USER_VSID_RANGE (1UL << (ESID_BITS + SID_SHIFT)) -/* - * This macro generates asm code to compute the VSID scramble - * function. Used in slb_allocate() and do_stab_bolted. The function - * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS - * - * rt = register containing the proto-VSID and into which the - * VSID will be stored - * rx = scratch register (clobbered) - * - * - rt and rx must be different registers - * - The answer will end up in the low VSID_BITS bits of rt. The higher - * bits may contain other garbage, so you may need to mask the - * result. - */ -#define ASM_VSID_SCRAMBLE(rt, rx, size) \ - lis rx,VSID_MULTIPLIER_##size@h; \ - ori rx,rx,VSID_MULTIPLIER_##size@l; \ - mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \ - \ - srdi rx,rt,VSID_BITS_##size; \ - clrldi rt,rt,(64-VSID_BITS_##size); \ - add rt,rt,rx; /* add high and low bits */ \ - /* NOTE: explanation based on VSID_BITS_##size = 36 \ - * Now, r3 == VSID (mod 2^36-1), and lies between 0 and \ - * 2^36-1+2^28-1. That in particular means that if r3 >= \ - * 2^36-1, then r3+1 has the 2^36 bit set. So, if r3+1 has \ - * the bit clear, r3 already has the answer we want, if it \ - * doesn't, the answer is the low 36 bits of r3+1. So in all \ - * cases the answer is the low 36 bits of (r3 + ((r3+1) >> 36))*/\ - addi rx,rt,1; \ - srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \ - add rt,rt,rx - /* 4 bits per slice and we have one slice per 1TB */ -#define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) +#define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41) +#define TASK_SLICE_ARRAY_SZ(x) ((x)->context.addr_limit >> 41) #ifndef __ASSEMBLY__ @@ -634,7 +651,7 @@ static inline void subpage_prot_init_new_context(struct mm_struct *mm) { } #define vsid_scramble(protovsid, size) \ ((((protovsid) * VSID_MULTIPLIER_##size) % VSID_MODULUS_##size)) -#else /* 1 */ +/* simplified form avoiding mod operation */ #define vsid_scramble(protovsid, size) \ ({ \ unsigned long x; \ @@ -642,6 +659,21 @@ static inline void subpage_prot_init_new_context(struct mm_struct *mm) { } x = (x >> VSID_BITS_##size) + (x & VSID_MODULUS_##size); \ (x + ((x+1) >> VSID_BITS_##size)) & VSID_MODULUS_##size; \ }) + +#else /* 1 */ +static inline unsigned long vsid_scramble(unsigned long protovsid, + unsigned long vsid_multiplier, int vsid_bits) +{ + unsigned long vsid; + unsigned long vsid_modulus = ((1UL << vsid_bits) - 1); + /* + * We have same multipler for both 256 and 1T segements now + */ + vsid = protovsid * vsid_multiplier; + vsid = (vsid >> vsid_bits) + (vsid & vsid_modulus); + return (vsid + ((vsid + 1) >> vsid_bits)) & vsid_modulus; +} + #endif /* 1 */ /* Returns the segment size indicator for a user address */ @@ -656,36 +688,56 @@ static inline int user_segment_size(unsigned long addr) static inline unsigned long get_vsid(unsigned long context, unsigned long ea, int ssize) { + unsigned long va_bits = VA_BITS; + unsigned long vsid_bits; + unsigned long protovsid; + /* * Bad address. We return VSID 0 for that */ if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE) return 0; - if (ssize == MMU_SEGSIZE_256M) - return vsid_scramble((context << ESID_BITS) - | ((ea >> SID_SHIFT) & ESID_BITS_MASK), 256M); - return vsid_scramble((context << ESID_BITS_1T) - | ((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK), 1T); + if (!mmu_has_feature(MMU_FTR_68_BIT_VA)) + va_bits = 65; + + if (ssize == MMU_SEGSIZE_256M) { + vsid_bits = va_bits - SID_SHIFT; + protovsid = (context << ESID_BITS) | + ((ea >> SID_SHIFT) & ESID_BITS_MASK); + return vsid_scramble(protovsid, VSID_MULTIPLIER_256M, vsid_bits); + } + /* 1T segment */ + vsid_bits = va_bits - SID_SHIFT_1T; + protovsid = (context << ESID_BITS_1T) | + ((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK); + return vsid_scramble(protovsid, VSID_MULTIPLIER_1T, vsid_bits); } /* * This is only valid for addresses >= PAGE_OFFSET - * - * For kernel space, we use the top 4 context ids to map address as below - * 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ] - * 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ] - * 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ] - * 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ] */ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize) { unsigned long context; + if (!is_kernel_addr(ea)) + return 0; + /* - * kernel take the top 4 context from the available range + * For kernel space, we use context ids 1-4 to map the address space as + * below: + * + * 0x00001 - [ 0xc000000000000000 - 0xc0003fffffffffff ] + * 0x00002 - [ 0xd000000000000000 - 0xd0003fffffffffff ] + * 0x00003 - [ 0xe000000000000000 - 0xe0003fffffffffff ] + * 0x00004 - [ 0xf000000000000000 - 0xf0003fffffffffff ] + * + * So we can compute the context from the region (top nibble) by + * subtracting 11, or 0xc - 1. */ - context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1; + context = (ea >> 60) - KERNEL_REGION_CONTEXT_OFFSET; + return get_vsid(context, ea, ssize); } diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 805d4105e9bb..77529a3e3811 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -65,6 +65,8 @@ extern struct patb_entry *partition_tb; * MAX_USER_CONTEXT * 16 bytes of space. */ #define PRTB_SIZE_SHIFT (CONTEXT_BITS + 4) +#define PRTB_ENTRIES (1ul << CONTEXT_BITS) + /* * Power9 currently only support 64K partition table size. */ @@ -73,13 +75,20 @@ extern struct patb_entry *partition_tb; typedef unsigned long mm_context_id_t; struct spinlock; +/* Maximum possible number of NPUs in a system. */ +#define NV_MAX_NPUS 8 + typedef struct { mm_context_id_t id; u16 user_psize; /* page size index */ + /* NPU NMMU context */ + struct npu_context *npu_context; + #ifdef CONFIG_PPC_MM_SLICES u64 low_slices_psize; /* SLB page size encodings */ unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; + unsigned long addr_limit; #else u16 sllp; /* SLB page size encoding */ #endif diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 8f4d41936e5a..fb72ff6b98e6 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -37,21 +37,47 @@ #define _RPAGE_RSV3 0x0400000000000000UL #define _RPAGE_RSV4 0x0200000000000000UL -#ifdef CONFIG_MEM_SOFT_DIRTY -#define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */ -#else -#define _PAGE_SOFT_DIRTY 0x00000 -#endif -#define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */ +#define _PAGE_PTE 0x4000000000000000UL /* distinguishes PTEs from pointers */ +#define _PAGE_PRESENT 0x8000000000000000UL /* pte contains a translation */ /* - * For P9 DD1 only, we need to track whether the pte's huge. + * Top and bottom bits of RPN which can be used by hash + * translation mode, because we expect them to be zero + * otherwise. */ -#define _PAGE_LARGE _RPAGE_RSV1 +#define _RPAGE_RPN0 0x01000 +#define _RPAGE_RPN1 0x02000 +#define _RPAGE_RPN44 0x0100000000000000UL +#define _RPAGE_RPN43 0x0080000000000000UL +#define _RPAGE_RPN42 0x0040000000000000UL +#define _RPAGE_RPN41 0x0020000000000000UL + +/* Max physical address bit as per radix table */ +#define _RPAGE_PA_MAX 57 +/* + * Max physical address bit we will use for now. + * + * This is mostly a hardware limitation and for now Power9 has + * a 51 bit limit. + * + * This is different from the number of physical bit required to address + * the last byte of memory. That is defined by MAX_PHYSMEM_BITS. + * MAX_PHYSMEM_BITS is a linux limitation imposed by the maximum + * number of sections we can support (SECTIONS_SHIFT). + * + * This is different from Radix page table limitation above and + * should always be less than that. The limit is done such that + * we can overload the bits between _RPAGE_PA_MAX and _PAGE_PA_MAX + * for hash linux page table specific bits. + * + * In order to be compatible with future hardware generations we keep + * some offsets and limit this for now to 53 + */ +#define _PAGE_PA_MAX 53 -#define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */ -#define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */ +#define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */ +#define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */ /* * Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE * Instead of fixing all of them, add an alternate define which @@ -59,10 +85,11 @@ */ #define _PAGE_NO_CACHE _PAGE_TOLERANT /* - * We support 57 bit real address in pte. Clear everything above 57, and - * every thing below PAGE_SHIFT; + * We support _RPAGE_PA_MAX bit real address in pte. On the linux side + * we are limited by _PAGE_PA_MAX. Clear everything above _PAGE_PA_MAX + * and every thing below PAGE_SHIFT; */ -#define PTE_RPN_MASK (((1UL << 57) - 1) & (PAGE_MASK)) +#define PTE_RPN_MASK (((1UL << _PAGE_PA_MAX) - 1) & (PAGE_MASK)) /* * set of bits not changed in pmd_modify. Even though we have hash specific bits * in here, on radix we expect them to be zero. diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 9e0bb7cd6e22..ac16d1943022 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -11,6 +11,12 @@ #include <asm/book3s/64/radix-4k.h> #endif +/* + * For P9 DD1 only, we need to track whether the pte's huge. + */ +#define R_PAGE_LARGE _RPAGE_RSV1 + + #ifndef __ASSEMBLY__ #include <asm/book3s/64/tlbflush-radix.h> #include <asm/cpu_has_feature.h> @@ -252,7 +258,7 @@ static inline int radix__pmd_trans_huge(pmd_t pmd) static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) { if (cpu_has_feature(CPU_FTR_POWER9_DD1)) - return __pmd(pmd_val(pmd) | _PAGE_PTE | _PAGE_LARGE); + return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE); return __pmd(pmd_val(pmd) | _PAGE_PTE); } static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma, diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 155731557c9b..4649ca0d28e3 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -46,6 +46,7 @@ extern u32 pnv_fastsleep_workaround_at_exit[]; extern u64 pnv_first_deep_stop_state; +unsigned long pnv_cpu_offline(unsigned int cpu); int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags); static inline void report_invalid_psscr_val(u64 psscr_val, int err) { diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h index 86308f177f2d..5d5af3fddfd8 100644 --- a/arch/powerpc/include/asm/debug.h +++ b/arch/powerpc/include/asm/debug.h @@ -8,8 +8,6 @@ struct pt_regs; -extern struct dentry *powerpc_debugfs_root; - #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE) extern int (*__debugger)(struct pt_regs *regs); diff --git a/arch/powerpc/include/asm/debugfs.h b/arch/powerpc/include/asm/debugfs.h new file mode 100644 index 000000000000..4f3b39f3e3d2 --- /dev/null +++ b/arch/powerpc/include/asm/debugfs.h @@ -0,0 +1,17 @@ +#ifndef _ASM_POWERPC_DEBUGFS_H +#define _ASM_POWERPC_DEBUGFS_H + +/* + * Copyright 2017, Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/debugfs.h> + +extern struct dentry *powerpc_debugfs_root; + +#endif /* _ASM_POWERPC_DEBUGFS_H */ diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 3cc12a86ef5d..d73755fafbb0 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -377,16 +377,6 @@ long plpar_hcall_raw(unsigned long opcode, unsigned long *retbuf, ...); long plpar_hcall9(unsigned long opcode, unsigned long *retbuf, ...); long plpar_hcall9_raw(unsigned long opcode, unsigned long *retbuf, ...); -/* For hcall instrumentation. One structure per-hcall, per-CPU */ -struct hcall_stats { - unsigned long num_calls; /* number of calls (on this CPU) */ - unsigned long tb_total; /* total wall time (mftb) of calls. */ - unsigned long purr_total; /* total cpu time (PURR) of calls. */ - unsigned long tb_start; - unsigned long purr_start; -}; -#define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1) - struct hvcall_mpp_data { unsigned long entitled_mem; unsigned long mapped_mem; diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index d9b48f5bb606..d55c7f881ce7 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -49,8 +49,6 @@ static inline bool kvm_is_radix(struct kvm *kvm) #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ #endif -#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */ - /* * We use a lock bit in HPTE dword 0 to synchronize updates and * accesses to each HPTE, and another bit to indicate non-present diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index ed62efe01e49..81eff8631434 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -24,97 +24,6 @@ #include <linux/bitops.h> -/* - * Machine Check bits on power7 and power8 - */ -#define P7_SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42)) /* P8 too */ - -/* SRR1 bits for machine check (On Power7 and Power8) */ -#define P7_SRR1_MC_IFETCH(srr1) ((srr1) & PPC_BITMASK(43, 45)) /* P8 too */ - -#define P7_SRR1_MC_IFETCH_UE (0x1 << PPC_BITLSHIFT(45)) /* P8 too */ -#define P7_SRR1_MC_IFETCH_SLB_PARITY (0x2 << PPC_BITLSHIFT(45)) /* P8 too */ -#define P7_SRR1_MC_IFETCH_SLB_MULTIHIT (0x3 << PPC_BITLSHIFT(45)) /* P8 too */ -#define P7_SRR1_MC_IFETCH_SLB_BOTH (0x4 << PPC_BITLSHIFT(45)) -#define P7_SRR1_MC_IFETCH_TLB_MULTIHIT (0x5 << PPC_BITLSHIFT(45)) /* P8 too */ -#define P7_SRR1_MC_IFETCH_UE_TLB_RELOAD (0x6 << PPC_BITLSHIFT(45)) /* P8 too */ -#define P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL (0x7 << PPC_BITLSHIFT(45)) - -/* SRR1 bits for machine check (On Power8) */ -#define P8_SRR1_MC_IFETCH_ERAT_MULTIHIT (0x4 << PPC_BITLSHIFT(45)) - -/* DSISR bits for machine check (On Power7 and Power8) */ -#define P7_DSISR_MC_UE (PPC_BIT(48)) /* P8 too */ -#define P7_DSISR_MC_UE_TABLEWALK (PPC_BIT(49)) /* P8 too */ -#define P7_DSISR_MC_ERAT_MULTIHIT (PPC_BIT(52)) /* P8 too */ -#define P7_DSISR_MC_TLB_MULTIHIT_MFTLB (PPC_BIT(53)) /* P8 too */ -#define P7_DSISR_MC_SLB_PARITY_MFSLB (PPC_BIT(55)) /* P8 too */ -#define P7_DSISR_MC_SLB_MULTIHIT (PPC_BIT(56)) /* P8 too */ -#define P7_DSISR_MC_SLB_MULTIHIT_PARITY (PPC_BIT(57)) /* P8 too */ - -/* - * DSISR bits for machine check (Power8) in addition to above. - * Secondary DERAT Multihit - */ -#define P8_DSISR_MC_ERAT_MULTIHIT_SEC (PPC_BIT(54)) - -/* SLB error bits */ -#define P7_DSISR_MC_SLB_ERRORS (P7_DSISR_MC_ERAT_MULTIHIT | \ - P7_DSISR_MC_SLB_PARITY_MFSLB | \ - P7_DSISR_MC_SLB_MULTIHIT | \ - P7_DSISR_MC_SLB_MULTIHIT_PARITY) - -#define P8_DSISR_MC_SLB_ERRORS (P7_DSISR_MC_SLB_ERRORS | \ - P8_DSISR_MC_ERAT_MULTIHIT_SEC) - -/* - * Machine Check bits on power9 - */ -#define P9_SRR1_MC_LOADSTORE(srr1) (((srr1) >> PPC_BITLSHIFT(42)) & 1) - -#define P9_SRR1_MC_IFETCH(srr1) ( \ - PPC_BITEXTRACT(srr1, 45, 0) | \ - PPC_BITEXTRACT(srr1, 44, 1) | \ - PPC_BITEXTRACT(srr1, 43, 2) | \ - PPC_BITEXTRACT(srr1, 36, 3) ) - -/* 0 is reserved */ -#define P9_SRR1_MC_IFETCH_UE 1 -#define P9_SRR1_MC_IFETCH_SLB_PARITY 2 -#define P9_SRR1_MC_IFETCH_SLB_MULTIHIT 3 -#define P9_SRR1_MC_IFETCH_ERAT_MULTIHIT 4 -#define P9_SRR1_MC_IFETCH_TLB_MULTIHIT 5 -#define P9_SRR1_MC_IFETCH_UE_TLB_RELOAD 6 -/* 7 is reserved */ -#define P9_SRR1_MC_IFETCH_LINK_TIMEOUT 8 -#define P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT 9 -/* 10 ? */ -#define P9_SRR1_MC_IFETCH_RA 11 -#define P9_SRR1_MC_IFETCH_RA_TABLEWALK 12 -#define P9_SRR1_MC_IFETCH_RA_ASYNC_STORE 13 -#define P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT 14 -#define P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN 15 - -/* DSISR bits for machine check (On Power9) */ -#define P9_DSISR_MC_UE (PPC_BIT(48)) -#define P9_DSISR_MC_UE_TABLEWALK (PPC_BIT(49)) -#define P9_DSISR_MC_LINK_LOAD_TIMEOUT (PPC_BIT(50)) -#define P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT (PPC_BIT(51)) -#define P9_DSISR_MC_ERAT_MULTIHIT (PPC_BIT(52)) -#define P9_DSISR_MC_TLB_MULTIHIT_MFTLB (PPC_BIT(53)) -#define P9_DSISR_MC_USER_TLBIE (PPC_BIT(54)) -#define P9_DSISR_MC_SLB_PARITY_MFSLB (PPC_BIT(55)) -#define P9_DSISR_MC_SLB_MULTIHIT_MFSLB (PPC_BIT(56)) -#define P9_DSISR_MC_RA_LOAD (PPC_BIT(57)) -#define P9_DSISR_MC_RA_TABLEWALK (PPC_BIT(58)) -#define P9_DSISR_MC_RA_TABLEWALK_FOREIGN (PPC_BIT(59)) -#define P9_DSISR_MC_RA_FOREIGN (PPC_BIT(60)) - -/* SLB error bits */ -#define P9_DSISR_MC_SLB_ERRORS (P9_DSISR_MC_ERAT_MULTIHIT | \ - P9_DSISR_MC_SLB_PARITY_MFSLB | \ - P9_DSISR_MC_SLB_MULTIHIT_MFSLB) - enum MCE_Version { MCE_V1 = 1, }; @@ -298,7 +207,8 @@ extern void save_mce_event(struct pt_regs *regs, long handled, extern int get_mce_event(struct machine_check_event *mce, bool release); extern void release_mce_event(void); extern void machine_check_queue_event(void); -extern void machine_check_print_event_info(struct machine_check_event *evt); +extern void machine_check_print_event_info(struct machine_check_event *evt, + bool user_mode); extern uint64_t get_mce_fault_addr(struct machine_check_event *evt); #endif /* __ASM_PPC64_MCE_H__ */ diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h index b62a8d43a06c..7ca8d8e80ffa 100644 --- a/arch/powerpc/include/asm/mmu-book3e.h +++ b/arch/powerpc/include/asm/mmu-book3e.h @@ -229,11 +229,6 @@ typedef struct { unsigned int id; unsigned int active; unsigned long vdso_base; -#ifdef CONFIG_PPC_MM_SLICES - u64 low_slices_psize; /* SLB page size encodings */ - u64 high_slices_psize; /* 4 bits per slice for now */ - u16 user_psize; /* page size index */ -#endif #ifdef CONFIG_PPC_64K_PAGES /* for 4K PTE fragment support */ void *pte_frag; diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 065e762fae85..78260409dc9c 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -29,6 +29,10 @@ */ /* + * Support for 68 bit VA space. We added that from ISA 2.05 + */ +#define MMU_FTR_68_BIT_VA ASM_CONST(0x00002000) +/* * Kernel read only support. * We added the ppp value 0b110 in ISA 2.04. */ @@ -109,10 +113,10 @@ #define MMU_FTRS_POWER4 MMU_FTRS_DEFAULT_HPTE_ARCH_V2 #define MMU_FTRS_PPC970 MMU_FTRS_POWER4 | MMU_FTR_TLBIE_CROP_VA #define MMU_FTRS_POWER5 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE -#define MMU_FTRS_POWER6 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO -#define MMU_FTRS_POWER7 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO -#define MMU_FTRS_POWER8 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO -#define MMU_FTRS_POWER9 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO +#define MMU_FTRS_POWER6 MMU_FTRS_POWER5 | MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA +#define MMU_FTRS_POWER7 MMU_FTRS_POWER6 +#define MMU_FTRS_POWER8 MMU_FTRS_POWER6 +#define MMU_FTRS_POWER9 MMU_FTRS_POWER6 #define MMU_FTRS_CELL MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \ MMU_FTR_CI_LARGE_PAGE #define MMU_FTRS_PA6T MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \ @@ -136,7 +140,7 @@ enum { MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE | MMU_FTR_1T_SEGMENT | MMU_FTR_TLBIE_CROP_VA | - MMU_FTR_KERNEL_RO | + MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA | #ifdef CONFIG_PPC_RADIX_MMU MMU_FTR_TYPE_RADIX | #endif @@ -290,7 +294,10 @@ static inline bool early_radix_enabled(void) #define MMU_PAGE_16G 14 #define MMU_PAGE_64G 15 -/* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */ +/* + * N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 + * Also we need to change he type of mm_context.low/high_slices_psize. + */ #define MMU_PAGE_COUNT 16 #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index b9e3f0aca261..78803a7ebdd9 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -51,7 +51,8 @@ static inline void switch_mmu_context(struct mm_struct *prev, return switch_slb(tsk, next); } -extern int __init_new_context(void); +extern int hash__alloc_context_id(void); +extern void hash__reserve_context_id(int id); extern void __destroy_context(int context_id); static inline void mmu_context_init(void) { } #else diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index c7f927e67d14..f0ff384d4ca5 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -88,11 +88,6 @@ #include <asm/nohash/pte-book3e.h> #include <asm/pte-common.h> -#ifdef CONFIG_PPC_MM_SLICES -#define HAVE_ARCH_UNMAPPED_AREA -#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#endif /* CONFIG_PPC_MM_SLICES */ - #ifndef __ASSEMBLY__ /* pte_clear moved to later in this file */ diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index bc8ac3c0e649..cb3e6242a78c 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -187,7 +187,10 @@ #define OPAL_XIVE_DUMP 142 #define OPAL_XIVE_RESERVED3 143 #define OPAL_XIVE_RESERVED4 144 -#define OPAL_LAST 144 +#define OPAL_NPU_INIT_CONTEXT 146 +#define OPAL_NPU_DESTROY_CONTEXT 147 +#define OPAL_NPU_MAP_LPAR 148 +#define OPAL_LAST 148 /* Device tree flags */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index cb7d6078b03a..588fb1c23af9 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -29,6 +29,11 @@ extern struct device_node *opal_node; /* API functions */ int64_t opal_invalid_call(void); +int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid, uint64_t bdf); +int64_t opal_npu_init_context(uint64_t phb_id, int pasid, uint64_t msr, + uint64_t bdf); +int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid, + uint64_t lpcr); int64_t opal_console_write(int64_t term_number, __be64 *length, const uint8_t *buffer); int64_t opal_console_read(int64_t term_number, __be64 *length, diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 708c3e592eeb..140ddb9ae5a8 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -139,6 +139,7 @@ struct paca_struct { #ifdef CONFIG_PPC_MM_SLICES u64 mm_ctx_low_slices_psize; unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE]; + unsigned long addr_limit; #else u16 mm_ctx_user_psize; u16 mm_ctx_sllp; @@ -172,6 +173,11 @@ struct paca_struct { u8 thread_mask; /* Mask to denote subcore sibling threads */ u8 subcore_sibling_mask; + /* + * Pointer to an array which contains pointer + * to the sibling threads' paca. + */ + struct paca_struct **thread_sibling_pacas; #endif #ifdef CONFIG_PPC_BOOK3S_64 @@ -206,23 +212,7 @@ struct paca_struct { #endif }; -#ifdef CONFIG_PPC_BOOK3S -static inline void copy_mm_to_paca(mm_context_t *context) -{ - get_paca()->mm_ctx_id = context->id; -#ifdef CONFIG_PPC_MM_SLICES - get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize; - memcpy(&get_paca()->mm_ctx_high_slices_psize, - &context->high_slices_psize, SLICE_ARRAY_SIZE); -#else - get_paca()->mm_ctx_user_psize = context->user_psize; - get_paca()->mm_ctx_sllp = context->sllp; -#endif -} -#else -static inline void copy_mm_to_paca(mm_context_t *context){} -#endif - +extern void copy_mm_to_paca(struct mm_struct *mm); extern struct paca_struct *paca; extern void initialise_paca(struct paca_struct *new_paca, int cpu); extern void setup_paca(struct paca_struct *new_paca); diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index 3e83d2a20b6f..c4d9654bd637 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -98,21 +98,7 @@ extern u64 ppc64_pft_size; #define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT) #define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT) -/* - * 1 bit per slice and we have one slice per 1TB - * Right now we support only 64TB. - * IF we change this we will have to change the type - * of high_slices - */ -#define SLICE_MASK_SIZE 8 - #ifndef __ASSEMBLY__ - -struct slice_mask { - u16 low_slices; - u64 high_slices; -}; - struct mm_struct; extern unsigned long slice_get_unmapped_area(unsigned long addr, diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h index 0e9c2402dd20..f62797702300 100644 --- a/arch/powerpc/include/asm/powernv.h +++ b/arch/powerpc/include/asm/powernv.h @@ -11,9 +11,31 @@ #define _ASM_POWERNV_H #ifdef CONFIG_PPC_POWERNV +#define NPU2_WRITE 1 extern void powernv_set_nmmu_ptcr(unsigned long ptcr); +extern struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, + unsigned long flags, + struct npu_context *(*cb)(struct npu_context *, void *), + void *priv); +extern void pnv_npu2_destroy_context(struct npu_context *context, + struct pci_dev *gpdev); +extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, + unsigned long *flags, unsigned long *status, + int count); #else static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { } +static inline struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, + unsigned long flags, + struct npu_context *(*cb)(struct npu_context *, void *), + void *priv) { return ERR_PTR(-ENODEV); } +static inline void pnv_npu2_destroy_context(struct npu_context *context, + struct pci_dev *gpdev) { } + +static inline int pnv_npu2_handle_fault(struct npu_context *context, + uintptr_t *ea, unsigned long *flags, + unsigned long *status, int count) { + return -ENODEV; +} #endif #endif /* _ASM_POWERNV_H */ diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index e0fecbcea2a2..a4b1d8d6b793 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -102,11 +102,25 @@ void release_thread(struct task_struct *); #endif #ifdef CONFIG_PPC64 -/* 64-bit user address space is 46-bits (64TB user VM) */ -#define TASK_SIZE_USER64 (0x0000400000000000UL) +/* + * 64-bit user address space can have multiple limits + * For now supported values are: + */ +#define TASK_SIZE_64TB (0x0000400000000000UL) +#define TASK_SIZE_128TB (0x0000800000000000UL) +#define TASK_SIZE_512TB (0x0002000000000000UL) + +#ifdef CONFIG_PPC_BOOK3S_64 +/* + * Max value currently used: + */ +#define TASK_SIZE_USER64 TASK_SIZE_512TB +#else +#define TASK_SIZE_USER64 TASK_SIZE_64TB +#endif -/* - * 32-bit user address space is 4GB - 1 page +/* + * 32-bit user address space is 4GB - 1 page * (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT */ #define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE)) @@ -114,26 +128,37 @@ void release_thread(struct task_struct *); #define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \ TASK_SIZE_USER32 : TASK_SIZE_USER64) #define TASK_SIZE TASK_SIZE_OF(current) - /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ #define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4)) -#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_USER64 / 4)) +#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_128TB / 4)) #define TASK_UNMAPPED_BASE ((is_32bit_task()) ? \ TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 ) #endif +/* + * Initial task size value for user applications. For book3s 64 we start + * with 128TB and conditionally enable upto 512TB + */ +#ifdef CONFIG_PPC_BOOK3S_64 +#define DEFAULT_MAP_WINDOW ((is_32bit_task()) ? \ + TASK_SIZE_USER32 : TASK_SIZE_128TB) +#else +#define DEFAULT_MAP_WINDOW TASK_SIZE +#endif + #ifdef __powerpc64__ -#define STACK_TOP_USER64 TASK_SIZE_USER64 +/* Limit stack to 128TB */ +#define STACK_TOP_USER64 TASK_SIZE_128TB #define STACK_TOP_USER32 TASK_SIZE_USER32 #define STACK_TOP (is_32bit_task() ? \ STACK_TOP_USER32 : STACK_TOP_USER64) -#define STACK_TOP_MAX STACK_TOP_USER64 +#define STACK_TOP_MAX TASK_SIZE_USER64 #else /* __powerpc64__ */ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 87e4b2d8dcd4..2e17d668c472 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -10,15 +10,7 @@ #ifdef __KERNEL__ -/* We have 8k stacks on ppc32 and 16k on ppc64 */ - -#if defined(CONFIG_PPC64) -#define THREAD_SHIFT 14 -#elif defined(CONFIG_PPC_256K_PAGES) -#define THREAD_SHIFT 15 -#else -#define THREAD_SHIFT 13 -#endif +#define THREAD_SHIFT CONFIG_THREAD_SHIFT #define THREAD_SIZE (1 << THREAD_SHIFT) diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h index 03c06ba7464f..ab45cc2f3101 100644 --- a/arch/powerpc/include/uapi/asm/mman.h +++ b/arch/powerpc/include/uapi/asm/mman.h @@ -29,4 +29,20 @@ #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */ +/* + * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2), + * encode the log2 of the huge page size. A value of zero indicates that the + * default huge page size should be used. To use a non-default huge page size, + * one of these defines can be used, or the size can be encoded by hand. Note + * that on most systems only a subset, or possibly none, of these sizes will be + * available. + */ +#define MAP_HUGE_512KB (19 << MAP_HUGE_SHIFT) /* 512KB HugeTLB Page */ +#define MAP_HUGE_1MB (20 << MAP_HUGE_SHIFT) /* 1MB HugeTLB Page */ +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) /* 2MB HugeTLB Page */ +#define MAP_HUGE_8MB (23 << MAP_HUGE_SHIFT) /* 8MB HugeTLB Page */ +#define MAP_HUGE_16MB (24 << MAP_HUGE_SHIFT) /* 16MB HugeTLB Page */ +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) /* 1GB HugeTLB Page */ +#define MAP_HUGE_16GB (34 << MAP_HUGE_SHIFT) /* 16GB HugeTLB Page */ + #endif /* _UAPI_ASM_POWERPC_MMAN_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 4367e7df51a1..e7c8229a8812 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -185,6 +185,7 @@ int main(void) #ifdef CONFIG_PPC_MM_SLICES OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize); OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize); + DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit)); DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def)); #endif /* CONFIG_PPC_MM_SLICES */ #endif @@ -727,6 +728,7 @@ int main(void) OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state); OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask); OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); + OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas); #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 9de7f79e702b..63992b2d8e15 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -22,7 +22,6 @@ */ #include <linux/delay.h> -#include <linux/debugfs.h> #include <linux/sched.h> #include <linux/init.h> #include <linux/list.h> @@ -37,7 +36,7 @@ #include <linux/of.h> #include <linux/atomic.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/eeh.h> #include <asm/eeh_event.h> #include <asm/io.h> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8ff0dd4e77a7..243dbef7e926 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -30,17 +30,16 @@ #include <linux/string.h> #include <linux/memblock.h> #include <linux/delay.h> -#include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/crash_dump.h> #include <linux/kobject.h> #include <linux/sysfs.h> +#include <asm/debugfs.h> #include <asm/page.h> #include <asm/prom.h> #include <asm/rtas.h> #include <asm/fadump.h> -#include <asm/debug.h> #include <asm/setup.h> static struct fw_dump fw_dump; @@ -319,15 +318,34 @@ int __init fadump_reserve_mem(void) pr_debug("fadumphdr_addr = %p\n", (void *) fw_dump.fadumphdr_addr); } else { - /* Reserve the memory at the top of memory. */ size = get_fadump_area_size(); - base = memory_boundary - size; - memblock_reserve(base, size); - printk(KERN_INFO "Reserved %ldMB of memory at %ldMB " - "for firmware-assisted dump\n", - (unsigned long)(size >> 20), - (unsigned long)(base >> 20)); + + /* + * Reserve memory at an offset closer to bottom of the RAM to + * minimize the impact of memory hot-remove operation. We can't + * use memblock_find_in_range() here since it doesn't allocate + * from bottom to top. + */ + for (base = fw_dump.boot_memory_size; + base <= (memory_boundary - size); + base += size) { + if (memblock_is_region_memory(base, size) && + !memblock_is_region_reserved(base, size)) + break; + } + if ((base > (memory_boundary - size)) || + memblock_reserve(base, size)) { + pr_err("Failed to reserve memory\n"); + return 0; + } + + pr_info("Reserved %ldMB of memory at %ldMB for firmware-" + "assisted dump (System RAM: %ldMB)\n", + (unsigned long)(size >> 20), + (unsigned long)(base >> 20), + (unsigned long)(memblock_phys_mem_size() >> 20)); } + fw_dump.reserve_dump_area_start = base; fw_dump.reserve_dump_area_size = size; return 1; diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 5c9f50c1aa99..32509de6ce4c 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -21,6 +21,7 @@ #include <linux/init.h> #include <linux/list.h> +#include <asm/asm-prototypes.h> #include <asm/cacheflush.h> #include <asm/code-patching.h> #include <asm/ftrace.h> diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 1607be7c0ef2..e22734278458 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -735,11 +735,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU) EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_EE) EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_EE) EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_EE) - EXCEPTION(0x2f00, MOLTrampoline, unknown_exception, EXC_XFER_EE_LITE) - - .globl mol_trampoline - .set mol_trampoline, i0x2f00 - EXPORT_SYMBOL(mol_trampoline) + EXCEPTION(0x2f00, Trap_2f, unknown_exception, EXC_XFER_EE) . = 0x3000 @@ -1278,16 +1274,6 @@ EXPORT_SYMBOL(empty_zero_page) swapper_pg_dir: .space PGD_TABLE_SIZE - .globl intercept_table -intercept_table: - .long 0, 0, i0x200, i0x300, i0x400, 0, i0x600, i0x700 - .long i0x800, 0, 0, 0, 0, i0xd00, 0, 0 - .long 0, 0, 0, i0x1300, 0, 0, 0, 0 - .long 0, 0, 0, 0, 0, 0, 0, 0 - .long 0, 0, 0, 0, 0, 0, 0, 0 - .long 0, 0, 0, 0, 0, 0, 0, 0 -EXPORT_SYMBOL(intercept_table) - /* Room for two PTE pointers, usually the kernel and current user pointers * to their respective root page table. */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 1dc5eae2ced3..0ddc602b33a4 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -949,7 +949,8 @@ start_here_multiplatform: LOAD_REG_ADDR(r3,init_thread_union) /* set up a stack pointer */ - addi r1,r3,THREAD_SIZE + LOAD_REG_IMMEDIATE(r1,THREAD_SIZE) + add r1,r3,r1 li r0,0 stdu r0,-STACK_FRAME_OVERHEAD(r1) diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 995728736677..24717a73b6bb 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -375,6 +375,46 @@ _GLOBAL(power9_idle_stop) li r4,1 b pnv_powersave_common /* No return */ + + +/* + * On waking up from stop 0,1,2 with ESL=1 on POWER9 DD1, + * HSPRG0 will be set to the HSPRG0 value of one of the + * threads in this core. Thus the value we have in r13 + * may not be this thread's paca pointer. + * + * Fortunately, the TIR remains invariant. Since this thread's + * paca pointer is recorded in all its sibling's paca, we can + * correctly recover this thread's paca pointer if we + * know the index of this thread in the core. + * + * This index can be obtained from the TIR. + * + * i.e, thread's position in the core = TIR. + * If this value is i, then this thread's paca is + * paca->thread_sibling_pacas[i]. + */ +power9_dd1_recover_paca: + mfspr r4, SPRN_TIR + /* + * Since each entry in thread_sibling_pacas is 8 bytes + * we need to left-shift by 3 bits. Thus r4 = i * 8 + */ + sldi r4, r4, 3 + /* Get &paca->thread_sibling_pacas[0] in r5 */ + ld r5, PACA_SIBLING_PACA_PTRS(r13) + /* Load paca->thread_sibling_pacas[i] into r13 */ + ldx r13, r4, r5 + SET_PACA(r13) + ld r2, PACATOC(r13) + /* + * Indicate that we have lost NVGPR state + * which needs to be restored from the stack. + */ + li r3, 1 + stb r0,PACA_NAPSTATELOST(r13) + blr + /* * Called from reset vector. Check whether we have woken up with * hypervisor state loss. If yes, restore hypervisor state and return @@ -385,7 +425,13 @@ _GLOBAL(power9_idle_stop) */ _GLOBAL(pnv_restore_hyp_resource) BEGIN_FTR_SECTION - ld r2,PACATOC(r13); +BEGIN_FTR_SECTION_NESTED(70) + mflr r6 + bl power9_dd1_recover_paca + mtlr r6 +FTR_SECTION_ELSE_NESTED(70) + ld r2, PACATOC(r13) +ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_POWER9_DD1, 70) /* * POWER ISA 3. Use PSSCR to determine if we * are waking up from deep idle state diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 8ee7b44450eb..5c291df30fe3 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -65,7 +65,6 @@ #include <asm/machdep.h> #include <asm/udbg.h> #include <asm/smp.h> -#include <asm/debug.h> #include <asm/livepatch.h> #include <asm/asm-prototypes.h> diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index a1475e6aef3a..16eb0b508761 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -228,12 +228,13 @@ static void machine_check_process_queued_event(struct irq_work *work) while (__this_cpu_read(mce_queue_count) > 0) { index = __this_cpu_read(mce_queue_count) - 1; machine_check_print_event_info( - this_cpu_ptr(&mce_event_queue[index])); + this_cpu_ptr(&mce_event_queue[index]), false); __this_cpu_dec(mce_queue_count); } } -void machine_check_print_event_info(struct machine_check_event *evt) +void machine_check_print_event_info(struct machine_check_event *evt, + bool user_mode) { const char *level, *sevstr, *subtype; static const char *mc_ue_types[] = { @@ -310,7 +311,16 @@ void machine_check_print_event_info(struct machine_check_event *evt) printk("%s%s Machine check interrupt [%s]\n", level, sevstr, evt->disposition == MCE_DISPOSITION_RECOVERED ? - "Recovered" : "[Not recovered"); + "Recovered" : "Not recovered"); + + if (user_mode) { + printk("%s NIP: [%016llx] PID: %d Comm: %s\n", level, + evt->srr0, current->pid, current->comm); + } else { + printk("%s NIP [%016llx]: %pS\n", level, evt->srr0, + (void *)evt->srr0); + } + printk("%s Initiator: %s\n", level, evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown"); switch (evt->error_type) { diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index 763d6f58caa8..de242b4bbd20 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -147,159 +147,365 @@ static int mce_flush(int what) return 0; } -static int mce_handle_flush_derrors(uint64_t dsisr, uint64_t slb, uint64_t tlb, uint64_t erat) -{ - if ((dsisr & slb) && mce_flush(MCE_FLUSH_SLB)) - dsisr &= ~slb; - if ((dsisr & erat) && mce_flush(MCE_FLUSH_ERAT)) - dsisr &= ~erat; - if ((dsisr & tlb) && mce_flush(MCE_FLUSH_TLB)) - dsisr &= ~tlb; - /* Any other errors we don't understand? */ - if (dsisr) - return 0; - return 1; -} - -static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits) +#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42)) + +struct mce_ierror_table { + unsigned long srr1_mask; + unsigned long srr1_value; + bool nip_valid; /* nip is a valid indicator of faulting address */ + unsigned int error_type; + unsigned int error_subtype; + unsigned int initiator; + unsigned int severity; +}; + +static const struct mce_ierror_table mce_p7_ierror_table[] = { +{ 0x00000000001c0000, 0x0000000000040000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000001c0000, 0x0000000000080000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000001c0000, 0x00000000000c0000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000001c0000, 0x0000000000100000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000001c0000, 0x0000000000140000, true, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000001c0000, 0x0000000000180000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000001c0000, 0x00000000001c0000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0, 0, 0, 0, 0, 0 } }; + +static const struct mce_ierror_table mce_p8_ierror_table[] = { +{ 0x00000000081c0000, 0x0000000000040000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000080000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x00000000000c0000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000100000, true, + MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000140000, true, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000180000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x00000000001c0000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000008000000, true, + MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000008040000, true, + MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0, 0, 0, 0, 0, 0 } }; + +static const struct mce_ierror_table mce_p9_ierror_table[] = { +{ 0x00000000081c0000, 0x0000000000040000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000080000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x00000000000c0000, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000100000, true, + MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000140000, true, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000000180000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000008000000, true, + MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000008040000, true, + MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x00000000080c0000, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000008100000, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x0000000008140000, false, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_STORE, + MCE_INITIATOR_CPU, MCE_SEV_FATAL, }, /* ASYNC is fatal */ +{ 0x00000000081c0000, 0x0000000008180000, false, + MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_STORE_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_FATAL, }, /* ASYNC is fatal */ +{ 0x00000000081c0000, 0x00000000081c0000, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0, 0, 0, 0, 0, 0 } }; + +struct mce_derror_table { + unsigned long dsisr_value; + bool dar_valid; /* dar is a valid indicator of faulting address */ + unsigned int error_type; + unsigned int error_subtype; + unsigned int initiator; + unsigned int severity; +}; + +static const struct mce_derror_table mce_p7_derror_table[] = { +{ 0x00008000, false, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00004000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000800, true, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000400, true, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000100, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000080, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000040, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */ + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0, false, 0, 0, 0, 0 } }; + +static const struct mce_derror_table mce_p8_derror_table[] = { +{ 0x00008000, false, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00004000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00002000, true, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00001000, true, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000800, true, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000400, true, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000200, true, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */ + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000100, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000080, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0, false, 0, 0, 0, 0 } }; + +static const struct mce_derror_table mce_p9_derror_table[] = { +{ 0x00008000, false, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00004000, true, + MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00002000, true, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00001000, true, + MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000800, true, + MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000400, true, + MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000200, false, + MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000100, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000080, true, + MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000040, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000020, false, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000010, false, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000008, false, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD_STORE_FOREIGN, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0, false, 0, 0, 0, 0 } }; + +static int mce_handle_ierror(struct pt_regs *regs, + const struct mce_ierror_table table[], + struct mce_error_info *mce_err, uint64_t *addr) { - long handled = 1; + uint64_t srr1 = regs->msr; + int handled = 0; + int i; + + *addr = 0; + + for (i = 0; table[i].srr1_mask; i++) { + if ((srr1 & table[i].srr1_mask) != table[i].srr1_value) + continue; + + /* attempt to correct the error */ + switch (table[i].error_type) { + case MCE_ERROR_TYPE_SLB: + handled = mce_flush(MCE_FLUSH_SLB); + break; + case MCE_ERROR_TYPE_ERAT: + handled = mce_flush(MCE_FLUSH_ERAT); + break; + case MCE_ERROR_TYPE_TLB: + handled = mce_flush(MCE_FLUSH_TLB); + break; + } - /* - * flush and reload SLBs for SLB errors and flush TLBs for TLB errors. - * reset the error bits whenever we handle them so that at the end - * we can check whether we handled all of them or not. - * */ -#ifdef CONFIG_PPC_STD_MMU_64 - if (dsisr & slb_error_bits) { - flush_and_reload_slb(); - /* reset error bits */ - dsisr &= ~(slb_error_bits); - } - if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) { - if (cur_cpu_spec && cur_cpu_spec->flush_tlb) - cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL); - /* reset error bits */ - dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB; + /* now fill in mce_error_info */ + mce_err->error_type = table[i].error_type; + switch (table[i].error_type) { + case MCE_ERROR_TYPE_UE: + mce_err->u.ue_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_SLB: + mce_err->u.slb_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_ERAT: + mce_err->u.erat_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_TLB: + mce_err->u.tlb_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_USER: + mce_err->u.user_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_RA: + mce_err->u.ra_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_LINK: + mce_err->u.link_error_type = table[i].error_subtype; + break; + } + mce_err->severity = table[i].severity; + mce_err->initiator = table[i].initiator; + if (table[i].nip_valid) + *addr = regs->nip; + return handled; } -#endif - /* Any other errors we don't understand? */ - if (dsisr & 0xffffffffUL) - handled = 0; - return handled; -} + mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN; + mce_err->severity = MCE_SEV_ERROR_SYNC; + mce_err->initiator = MCE_INITIATOR_CPU; -static long mce_handle_derror_p7(uint64_t dsisr) -{ - return mce_handle_derror(dsisr, P7_DSISR_MC_SLB_ERRORS); + return 0; } -static long mce_handle_common_ierror(uint64_t srr1) +static int mce_handle_derror(struct pt_regs *regs, + const struct mce_derror_table table[], + struct mce_error_info *mce_err, uint64_t *addr) { - long handled = 0; - - switch (P7_SRR1_MC_IFETCH(srr1)) { - case 0: - break; -#ifdef CONFIG_PPC_STD_MMU_64 - case P7_SRR1_MC_IFETCH_SLB_PARITY: - case P7_SRR1_MC_IFETCH_SLB_MULTIHIT: - /* flush and reload SLBs for SLB errors. */ - flush_and_reload_slb(); - handled = 1; - break; - case P7_SRR1_MC_IFETCH_TLB_MULTIHIT: - if (cur_cpu_spec && cur_cpu_spec->flush_tlb) { - cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL); - handled = 1; + uint64_t dsisr = regs->dsisr; + int handled = 0; + int found = 0; + int i; + + *addr = 0; + + for (i = 0; table[i].dsisr_value; i++) { + if (!(dsisr & table[i].dsisr_value)) + continue; + + /* attempt to correct the error */ + switch (table[i].error_type) { + case MCE_ERROR_TYPE_SLB: + if (mce_flush(MCE_FLUSH_SLB)) + handled = 1; + break; + case MCE_ERROR_TYPE_ERAT: + if (mce_flush(MCE_FLUSH_ERAT)) + handled = 1; + break; + case MCE_ERROR_TYPE_TLB: + if (mce_flush(MCE_FLUSH_TLB)) + handled = 1; + break; } - break; -#endif - default: - break; - } - - return handled; -} - -static long mce_handle_ierror_p7(uint64_t srr1) -{ - long handled = 0; - handled = mce_handle_common_ierror(srr1); + /* + * Attempt to handle multiple conditions, but only return + * one. Ensure uncorrectable errors are first in the table + * to match. + */ + if (found) + continue; + + /* now fill in mce_error_info */ + mce_err->error_type = table[i].error_type; + switch (table[i].error_type) { + case MCE_ERROR_TYPE_UE: + mce_err->u.ue_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_SLB: + mce_err->u.slb_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_ERAT: + mce_err->u.erat_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_TLB: + mce_err->u.tlb_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_USER: + mce_err->u.user_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_RA: + mce_err->u.ra_error_type = table[i].error_subtype; + break; + case MCE_ERROR_TYPE_LINK: + mce_err->u.link_error_type = table[i].error_subtype; + break; + } + mce_err->severity = table[i].severity; + mce_err->initiator = table[i].initiator; + if (table[i].dar_valid) + *addr = regs->dar; -#ifdef CONFIG_PPC_STD_MMU_64 - if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) { - flush_and_reload_slb(); - handled = 1; + found = 1; } -#endif - return handled; -} -static void mce_get_common_ierror(struct mce_error_info *mce_err, uint64_t srr1) -{ - switch (P7_SRR1_MC_IFETCH(srr1)) { - case P7_SRR1_MC_IFETCH_SLB_PARITY: - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY; - break; - case P7_SRR1_MC_IFETCH_SLB_MULTIHIT: - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; - break; - case P7_SRR1_MC_IFETCH_TLB_MULTIHIT: - mce_err->error_type = MCE_ERROR_TYPE_TLB; - mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; - break; - case P7_SRR1_MC_IFETCH_UE: - case P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL: - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH; - break; - case P7_SRR1_MC_IFETCH_UE_TLB_RELOAD: - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = - MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH; - break; - } -} + if (found) + return handled; -static void mce_get_ierror_p7(struct mce_error_info *mce_err, uint64_t srr1) -{ - mce_get_common_ierror(mce_err, srr1); - if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) { - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; - } -} + mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN; + mce_err->severity = MCE_SEV_ERROR_SYNC; + mce_err->initiator = MCE_INITIATOR_CPU; -static void mce_get_derror_p7(struct mce_error_info *mce_err, uint64_t dsisr) -{ - if (dsisr & P7_DSISR_MC_UE) { - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE; - } else if (dsisr & P7_DSISR_MC_UE_TABLEWALK) { - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = - MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE; - } else if (dsisr & P7_DSISR_MC_ERAT_MULTIHIT) { - mce_err->error_type = MCE_ERROR_TYPE_ERAT; - mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; - } else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT) { - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; - } else if (dsisr & P7_DSISR_MC_SLB_PARITY_MFSLB) { - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY; - } else if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) { - mce_err->error_type = MCE_ERROR_TYPE_TLB; - mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; - } else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT_PARITY) { - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE; - } + return 0; } static long mce_handle_ue_error(struct pt_regs *regs) @@ -320,292 +526,42 @@ static long mce_handle_ue_error(struct pt_regs *regs) return handled; } -long __machine_check_early_realmode_p7(struct pt_regs *regs) +static long mce_handle_error(struct pt_regs *regs, + const struct mce_derror_table dtable[], + const struct mce_ierror_table itable[]) { - uint64_t srr1, nip, addr; - long handled = 1; - struct mce_error_info mce_error_info = { 0 }; - - mce_error_info.severity = MCE_SEV_ERROR_SYNC; - mce_error_info.initiator = MCE_INITIATOR_CPU; - - srr1 = regs->msr; - nip = regs->nip; + struct mce_error_info mce_err = { 0 }; + uint64_t addr; + uint64_t srr1 = regs->msr; + long handled; - /* - * Handle memory errors depending whether this was a load/store or - * ifetch exception. Also, populate the mce error_type and - * type-specific error_type from either SRR1 or DSISR, depending - * whether this was a load/store or ifetch exception - */ - if (P7_SRR1_MC_LOADSTORE(srr1)) { - handled = mce_handle_derror_p7(regs->dsisr); - mce_get_derror_p7(&mce_error_info, regs->dsisr); - addr = regs->dar; - } else { - handled = mce_handle_ierror_p7(srr1); - mce_get_ierror_p7(&mce_error_info, srr1); - addr = regs->nip; - } + if (SRR1_MC_LOADSTORE(srr1)) + handled = mce_handle_derror(regs, dtable, &mce_err, &addr); + else + handled = mce_handle_ierror(regs, itable, &mce_err, &addr); - /* Handle UE error. */ - if (mce_error_info.error_type == MCE_ERROR_TYPE_UE) + if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE) handled = mce_handle_ue_error(regs); - save_mce_event(regs, handled, &mce_error_info, nip, addr); - return handled; -} - -static void mce_get_ierror_p8(struct mce_error_info *mce_err, uint64_t srr1) -{ - mce_get_common_ierror(mce_err, srr1); - if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) { - mce_err->error_type = MCE_ERROR_TYPE_ERAT; - mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; - } -} - -static void mce_get_derror_p8(struct mce_error_info *mce_err, uint64_t dsisr) -{ - mce_get_derror_p7(mce_err, dsisr); - if (dsisr & P8_DSISR_MC_ERAT_MULTIHIT_SEC) { - mce_err->error_type = MCE_ERROR_TYPE_ERAT; - mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; - } -} - -static long mce_handle_ierror_p8(uint64_t srr1) -{ - long handled = 0; - - handled = mce_handle_common_ierror(srr1); - -#ifdef CONFIG_PPC_STD_MMU_64 - if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) { - flush_and_reload_slb(); - handled = 1; - } -#endif - return handled; -} - -static long mce_handle_derror_p8(uint64_t dsisr) -{ - return mce_handle_derror(dsisr, P8_DSISR_MC_SLB_ERRORS); -} - -long __machine_check_early_realmode_p8(struct pt_regs *regs) -{ - uint64_t srr1, nip, addr; - long handled = 1; - struct mce_error_info mce_error_info = { 0 }; - - mce_error_info.severity = MCE_SEV_ERROR_SYNC; - mce_error_info.initiator = MCE_INITIATOR_CPU; - - srr1 = regs->msr; - nip = regs->nip; - - if (P7_SRR1_MC_LOADSTORE(srr1)) { - handled = mce_handle_derror_p8(regs->dsisr); - mce_get_derror_p8(&mce_error_info, regs->dsisr); - addr = regs->dar; - } else { - handled = mce_handle_ierror_p8(srr1); - mce_get_ierror_p8(&mce_error_info, srr1); - addr = regs->nip; - } - - /* Handle UE error. */ - if (mce_error_info.error_type == MCE_ERROR_TYPE_UE) - handled = mce_handle_ue_error(regs); + save_mce_event(regs, handled, &mce_err, regs->nip, addr); - save_mce_event(regs, handled, &mce_error_info, nip, addr); return handled; } -static int mce_handle_derror_p9(struct pt_regs *regs) -{ - uint64_t dsisr = regs->dsisr; - - return mce_handle_flush_derrors(dsisr, - P9_DSISR_MC_SLB_PARITY_MFSLB | - P9_DSISR_MC_SLB_MULTIHIT_MFSLB, - - P9_DSISR_MC_TLB_MULTIHIT_MFTLB, - - P9_DSISR_MC_ERAT_MULTIHIT); -} - -static int mce_handle_ierror_p9(struct pt_regs *regs) -{ - uint64_t srr1 = regs->msr; - - switch (P9_SRR1_MC_IFETCH(srr1)) { - case P9_SRR1_MC_IFETCH_SLB_PARITY: - case P9_SRR1_MC_IFETCH_SLB_MULTIHIT: - return mce_flush(MCE_FLUSH_SLB); - case P9_SRR1_MC_IFETCH_TLB_MULTIHIT: - return mce_flush(MCE_FLUSH_TLB); - case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT: - return mce_flush(MCE_FLUSH_ERAT); - default: - return 0; - } -} - -static void mce_get_derror_p9(struct pt_regs *regs, - struct mce_error_info *mce_err, uint64_t *addr) +long __machine_check_early_realmode_p7(struct pt_regs *regs) { - uint64_t dsisr = regs->dsisr; - - mce_err->severity = MCE_SEV_ERROR_SYNC; - mce_err->initiator = MCE_INITIATOR_CPU; + /* P7 DD1 leaves top bits of DSISR undefined */ + regs->dsisr &= 0x0000ffff; - if (dsisr & P9_DSISR_MC_USER_TLBIE) - *addr = regs->nip; - else - *addr = regs->dar; - - if (dsisr & P9_DSISR_MC_UE) { - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE; - } else if (dsisr & P9_DSISR_MC_UE_TABLEWALK) { - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE; - } else if (dsisr & P9_DSISR_MC_LINK_LOAD_TIMEOUT) { - mce_err->error_type = MCE_ERROR_TYPE_LINK; - mce_err->u.link_error_type = MCE_LINK_ERROR_LOAD_TIMEOUT; - } else if (dsisr & P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT) { - mce_err->error_type = MCE_ERROR_TYPE_LINK; - mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT; - } else if (dsisr & P9_DSISR_MC_ERAT_MULTIHIT) { - mce_err->error_type = MCE_ERROR_TYPE_ERAT; - mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; - } else if (dsisr & P9_DSISR_MC_TLB_MULTIHIT_MFTLB) { - mce_err->error_type = MCE_ERROR_TYPE_TLB; - mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; - } else if (dsisr & P9_DSISR_MC_USER_TLBIE) { - mce_err->error_type = MCE_ERROR_TYPE_USER; - mce_err->u.user_error_type = MCE_USER_ERROR_TLBIE; - } else if (dsisr & P9_DSISR_MC_SLB_PARITY_MFSLB) { - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY; - } else if (dsisr & P9_DSISR_MC_SLB_MULTIHIT_MFSLB) { - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; - } else if (dsisr & P9_DSISR_MC_RA_LOAD) { - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD; - } else if (dsisr & P9_DSISR_MC_RA_TABLEWALK) { - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE; - } else if (dsisr & P9_DSISR_MC_RA_TABLEWALK_FOREIGN) { - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN; - } else if (dsisr & P9_DSISR_MC_RA_FOREIGN) { - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD_STORE_FOREIGN; - } + return mce_handle_error(regs, mce_p7_derror_table, mce_p7_ierror_table); } -static void mce_get_ierror_p9(struct pt_regs *regs, - struct mce_error_info *mce_err, uint64_t *addr) +long __machine_check_early_realmode_p8(struct pt_regs *regs) { - uint64_t srr1 = regs->msr; - - switch (P9_SRR1_MC_IFETCH(srr1)) { - case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE: - case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT: - mce_err->severity = MCE_SEV_FATAL; - break; - default: - mce_err->severity = MCE_SEV_ERROR_SYNC; - break; - } - - mce_err->initiator = MCE_INITIATOR_CPU; - - *addr = regs->nip; - - switch (P9_SRR1_MC_IFETCH(srr1)) { - case P9_SRR1_MC_IFETCH_UE: - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH; - break; - case P9_SRR1_MC_IFETCH_SLB_PARITY: - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY; - break; - case P9_SRR1_MC_IFETCH_SLB_MULTIHIT: - mce_err->error_type = MCE_ERROR_TYPE_SLB; - mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT; - break; - case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT: - mce_err->error_type = MCE_ERROR_TYPE_ERAT; - mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT; - break; - case P9_SRR1_MC_IFETCH_TLB_MULTIHIT: - mce_err->error_type = MCE_ERROR_TYPE_TLB; - mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT; - break; - case P9_SRR1_MC_IFETCH_UE_TLB_RELOAD: - mce_err->error_type = MCE_ERROR_TYPE_UE; - mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH; - break; - case P9_SRR1_MC_IFETCH_LINK_TIMEOUT: - mce_err->error_type = MCE_ERROR_TYPE_LINK; - mce_err->u.link_error_type = MCE_LINK_ERROR_IFETCH_TIMEOUT; - break; - case P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT: - mce_err->error_type = MCE_ERROR_TYPE_LINK; - mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT; - break; - case P9_SRR1_MC_IFETCH_RA: - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_IFETCH; - break; - case P9_SRR1_MC_IFETCH_RA_TABLEWALK: - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH; - break; - case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE: - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_STORE; - break; - case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT: - mce_err->error_type = MCE_ERROR_TYPE_LINK; - mce_err->u.link_error_type = MCE_LINK_ERROR_STORE_TIMEOUT; - break; - case P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN: - mce_err->error_type = MCE_ERROR_TYPE_RA; - mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN; - break; - default: - break; - } + return mce_handle_error(regs, mce_p8_derror_table, mce_p8_ierror_table); } long __machine_check_early_realmode_p9(struct pt_regs *regs) { - uint64_t nip, addr; - long handled; - struct mce_error_info mce_error_info = { 0 }; - - nip = regs->nip; - - if (P9_SRR1_MC_LOADSTORE(regs->msr)) { - handled = mce_handle_derror_p9(regs); - mce_get_derror_p9(regs, &mce_error_info, &addr); - } else { - handled = mce_handle_ierror_p9(regs); - mce_get_ierror_p9(regs, &mce_error_info, &addr); - } - - /* Handle UE error. */ - if (mce_error_info.error_type == MCE_ERROR_TYPE_UE) - handled = mce_handle_ue_error(regs); - - save_mce_event(regs, handled, &mce_error_info, nip, addr); - return handled; + return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table); } diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index dfc479df9634..8d63627e067f 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -245,3 +245,24 @@ void __init free_unused_pacas(void) free_lppacas(); } + +void copy_mm_to_paca(struct mm_struct *mm) +{ +#ifdef CONFIG_PPC_BOOK3S + mm_context_t *context = &mm->context; + + get_paca()->mm_ctx_id = context->id; +#ifdef CONFIG_PPC_MM_SLICES + VM_BUG_ON(!mm->context.addr_limit); + get_paca()->addr_limit = mm->context.addr_limit; + get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize; + memcpy(&get_paca()->mm_ctx_high_slices_psize, + &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm)); +#else /* CONFIG_PPC_MM_SLICES */ + get_paca()->mm_ctx_user_psize = context->user_psize; + get_paca()->mm_ctx_sllp = context->sllp; +#endif +#else /* CONFIG_PPC_BOOK3S */ + return; +#endif +} diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index f5d399e46193..d2f0afeae5a0 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -55,7 +55,6 @@ #include <asm/kexec.h> #include <asm/opal.h> #include <asm/fadump.h> -#include <asm/debug.h> #include <asm/epapr_hcalls.h> #include <asm/firmware.h> diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 1c1b44ec7642..dd8a04f3053a 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -815,7 +815,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .virt_base = cpu_to_be32(0xffffffff), .virt_size = cpu_to_be32(0xffffffff), .load_base = cpu_to_be32(0xffffffff), - .min_rma = cpu_to_be32(256), /* 256MB min RMA */ + .min_rma = cpu_to_be32(512), /* 512MB min RMA */ .min_load = cpu_to_be32(0xffffffff), /* full client load */ .min_rma_percent = 0, /* min RMA percentage of total RAM */ .max_pft_size = 48, /* max log_2(hash table size) */ diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 4697da895133..5c10b5925ac2 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -31,11 +31,11 @@ #include <linux/unistd.h> #include <linux/serial.h> #include <linux/serial_8250.h> -#include <linux/debugfs.h> #include <linux/percpu.h> #include <linux/memblock.h> #include <linux/of_platform.h> #include <linux/hugetlb.h> +#include <asm/debugfs.h> #include <asm/io.h> #include <asm/paca.h> #include <asm/prom.h> @@ -920,6 +920,15 @@ void __init setup_arch(char **cmdline_p) init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; init_mm.brk = klimit; + +#ifdef CONFIG_PPC_MM_SLICES +#ifdef CONFIG_PPC64 + init_mm.context.addr_limit = TASK_SIZE_128TB; +#else +#error "context.addr_limit not initialized." +#endif +#endif + #ifdef CONFIG_PPC_64K_PAGES init_mm.context.pte_frag = NULL; #endif diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 9cfaa8b69b5f..729e990a019d 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -230,8 +230,8 @@ static void cpu_ready_for_interrupts(void) * If we are not in hypervisor mode the job is done once for * the whole partition in configure_exceptions(). */ - if (early_cpu_has_feature(CPU_FTR_HVMODE) && - early_cpu_has_feature(CPU_FTR_ARCH_207S)) { + if (cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_ARCH_207S)) { unsigned long lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3); } diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 66711958493c..d534ed901538 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -59,7 +59,14 @@ EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - save_context_stack(trace, tsk->thread.ksp, tsk, 0); + unsigned long sp; + + if (tsk == current) + sp = current_stack_pointer(); + else + sp = tsk->thread.ksp; + + save_context_stack(trace, sp, tsk, 0); } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c index 6ae9bd5086a4..0050b2d2ff7a 100644 --- a/arch/powerpc/kernel/swsusp.c +++ b/arch/powerpc/kernel/swsusp.c @@ -10,6 +10,7 @@ */ #include <linux/sched.h> +#include <linux/suspend.h> #include <asm/current.h> #include <asm/mmu_context.h> #include <asm/switch_to.h> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index ff365f9de27a..354946236c61 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -35,13 +35,13 @@ #include <linux/backlight.h> #include <linux/bug.h> #include <linux/kdebug.h> -#include <linux/debugfs.h> #include <linux/ratelimit.h> #include <linux/context_tracking.h> #include <asm/emulated_ops.h> #include <asm/pgtable.h> #include <linux/uaccess.h> +#include <asm/debugfs.h> #include <asm/io.h> #include <asm/machdep.h> #include <asm/rtas.h> diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index a587e8f4fd26..74b0153780e3 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -229,6 +229,7 @@ void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) { + unsigned long vsid_bits = VSID_BITS_65_256M; struct kvmppc_sid_map *map; struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); u16 sid_map_mask; @@ -257,7 +258,12 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) kvmppc_mmu_pte_flush(vcpu, 0, 0); kvmppc_mmu_flush_segments(vcpu); } - map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++, 256M); + + if (mmu_has_feature(MMU_FTR_68_BIT_VA)) + vsid_bits = VSID_BITS_256M; + + map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++, + VSID_MULTIPLIER_256M, vsid_bits); map->guest_vsid = gvsid; map->valid = true; @@ -390,7 +396,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu) struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int err; - err = __init_new_context(); + err = hash__alloc_context_id(); if (err < 0) return -1; vcpu3s->context_id[0] = err; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 3a1a463a039a..ffde4507ddfd 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -16,7 +16,6 @@ #include <asm/kvm_ppc.h> #include <asm/hvcall.h> #include <asm/xics.h> -#include <asm/debug.h> #include <asm/synch.h> #include <asm/cputhreads.h> #include <asm/pgtable.h> diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index ef4fd528c193..459b72cb617a 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -19,10 +19,9 @@ #include <asm/kvm_ppc.h> #include <asm/hvcall.h> #include <asm/xics.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/time.h> -#include <linux/debugfs.h> #include <linux/seq_file.h> #include "book3s_xics.h" diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 51def8a515be..3a7d580fdc59 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -120,8 +120,6 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address, siginfo_t info; unsigned int lsb = 0; - up_read(¤t->mm->mmap_sem); - if (!user_mode(regs)) return MM_FAULT_ERR(SIGBUS); @@ -154,13 +152,6 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) * continue the pagefault. */ if (fatal_signal_pending(current)) { - /* - * If we have retry set, the mmap semaphore will have - * alrady been released in __lock_page_or_retry(). Else - * we release it now. - */ - if (!(fault & VM_FAULT_RETRY)) - up_read(¤t->mm->mmap_sem); /* Coming from kernel, we need to deal with uaccess fixups */ if (user_mode(regs)) return MM_FAULT_RETURN; @@ -173,8 +164,6 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault) /* Out of memory */ if (fault & VM_FAULT_OOM) { - up_read(¤t->mm->mmap_sem); - /* * We ran out of memory, or some other thing happened to us that * made us unable to handle the page fault gracefully. @@ -298,7 +287,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address, * can result in fault, which will cause a deadlock when called with * mmap_sem held */ - if (user_mode(regs)) + if (!is_exec && user_mode(regs)) store_update_sp = store_updates_sp(regs); if (user_mode(regs)) @@ -458,9 +447,30 @@ good_area: * the fault. */ fault = handle_mm_fault(vma, address, flags); + + /* + * Handle the retry right now, the mmap_sem has been released in that + * case. + */ + if (unlikely(fault & VM_FAULT_RETRY)) { + /* We retry only once */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + /* + * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. + */ + flags &= ~FAULT_FLAG_ALLOW_RETRY; + flags |= FAULT_FLAG_TRIED; + if (!fatal_signal_pending(current)) + goto retry; + } + /* We will enter mm_fault_error() below */ + } else + up_read(¤t->mm->mmap_sem); + if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) { if (fault & VM_FAULT_SIGSEGV) - goto bad_area; + goto bad_area_nosemaphore; rc = mm_fault_error(regs, address, fault); if (rc >= MM_FAULT_RETURN) goto bail; @@ -469,41 +479,29 @@ good_area: } /* - * Major/minor page fault accounting is only done on the - * initial attempt. If we go through a retry, it is extremely - * likely that the page will be found in page cache at that point. + * Major/minor page fault accounting. */ - if (flags & FAULT_FLAG_ALLOW_RETRY) { - if (fault & VM_FAULT_MAJOR) { - current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, - regs, address); + if (fault & VM_FAULT_MAJOR) { + current->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, + regs, address); #ifdef CONFIG_PPC_SMLPAR - if (firmware_has_feature(FW_FEATURE_CMO)) { - u32 page_ins; - - preempt_disable(); - page_ins = be32_to_cpu(get_lppaca()->page_ins); - page_ins += 1 << PAGE_FACTOR; - get_lppaca()->page_ins = cpu_to_be32(page_ins); - preempt_enable(); - } -#endif /* CONFIG_PPC_SMLPAR */ - } else { - current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, - regs, address); - } - if (fault & VM_FAULT_RETRY) { - /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk - * of starvation. */ - flags &= ~FAULT_FLAG_ALLOW_RETRY; - flags |= FAULT_FLAG_TRIED; - goto retry; + if (firmware_has_feature(FW_FEATURE_CMO)) { + u32 page_ins; + + preempt_disable(); + page_ins = be32_to_cpu(get_lppaca()->page_ins); + page_ins += 1 << PAGE_FACTOR; + get_lppaca()->page_ins = cpu_to_be32(page_ins); + preempt_enable(); } +#endif /* CONFIG_PPC_SMLPAR */ + } else { + current->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, + regs, address); } - up_read(&mm->mmap_sem); goto bail; bad_area: diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S index 09cc50c8dace..6f962e5cb5e1 100644 --- a/arch/powerpc/mm/hash_low_32.S +++ b/arch/powerpc/mm/hash_low_32.S @@ -31,10 +31,8 @@ #ifdef CONFIG_SMP .section .bss .align 2 - .globl mmu_hash_lock mmu_hash_lock: .space 4 -EXPORT_SYMBOL(mmu_hash_lock) #endif /* CONFIG_SMP */ /* diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index c554768b1fa2..f2095ce9d4b0 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -35,9 +35,8 @@ #include <linux/memblock.h> #include <linux/context_tracking.h> #include <linux/libfdt.h> -#include <linux/debugfs.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/processor.h> #include <asm/pgtable.h> #include <asm/mmu.h> @@ -927,11 +926,6 @@ static void __init htab_initialize(void) } #endif /* CONFIG_DEBUG_PAGEALLOC */ - /* On U3 based machines, we need to reserve the DART area and - * _NOT_ map it to avoid cache paradoxes as it's remapped non - * cacheable later on - */ - /* create bolted the linear mapping in the hash table */ for_each_memblock(memory, reg) { base = (unsigned long)__va(reg->base); @@ -981,6 +975,19 @@ void __init hash__early_init_devtree(void) void __init hash__early_init_mmu(void) { + /* + * We have code in __hash_page_64K() and elsewhere, which assumes it can + * do the following: + * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX); + * + * Where the slot number is between 0-15, and values of 8-15 indicate + * the secondary bucket. For that code to work H_PAGE_F_SECOND and + * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and + * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here + * with a BUILD_BUG_ON(). + */ + BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3))); + htab_init_page_sizes(); /* @@ -1120,7 +1127,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr) copro_flush_all_slbs(mm); if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) { - copy_mm_to_paca(&mm->context); + copy_mm_to_paca(mm); slb_flush_and_rebolt(); } } @@ -1192,7 +1199,7 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm, { if (user_region) { if (psize != get_paca_psize(ea)) { - copy_mm_to_paca(&mm->context); + copy_mm_to_paca(mm); slb_flush_and_rebolt(); } } else if (get_paca()->vmalloc_sllp != @@ -1855,5 +1862,4 @@ static int __init hash64_debugfs(void) return 0; } machine_device_initcall(pseries, hash64_debugfs); - #endif /* CONFIG_DEBUG_FS */ diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c index 83a8be791e06..bfe4e8526b2d 100644 --- a/arch/powerpc/mm/hugetlbpage-book3e.c +++ b/arch/powerpc/mm/hugetlbpage-book3e.c @@ -148,16 +148,9 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, mm = vma->vm_mm; -#ifdef CONFIG_PPC_MM_SLICES - psize = get_slice_psize(mm, ea); - tsize = mmu_get_tsize(psize); - shift = mmu_psize_defs[psize].shift; -#else psize = vma_mmu_pagesize(vma); shift = __ilog2(psize); tsize = shift - 10; -#endif - /* * We can't be interrupted while we're setting up the MAS * regusters or after we've confirmed that no tlb exists. diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c index 35254a678456..0aa9cade422f 100644 --- a/arch/powerpc/mm/hugetlbpage-radix.c +++ b/arch/powerpc/mm/hugetlbpage-radix.c @@ -50,9 +50,12 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, struct hstate *h = hstate_file(file); struct vm_unmapped_area_info info; + if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE)) + mm->context.addr_limit = TASK_SIZE; + if (len & ~huge_page_mask(h)) return -EINVAL; - if (len > TASK_SIZE) + if (len > mm->context.addr_limit) return -ENOMEM; if (flags & MAP_FIXED) { @@ -64,7 +67,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) { addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (mm->context.addr_limit - len >= addr && (!vma || addr + len <= vma->vm_start)) return addr; } @@ -78,5 +81,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, info.high_limit = current->mm->mmap_base; info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; + + if (addr > DEFAULT_MAP_WINDOW) + info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW; + return vm_unmapped_area(&info); } diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 8c3389cbcd12..a4f33de4008e 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -753,6 +753,24 @@ static int __init add_huge_page_size(unsigned long long size) if ((mmu_psize = shift_to_mmu_psize(shift)) < 0) return -EINVAL; +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * We need to make sure that for different page sizes reported by + * firmware we only add hugetlb support for page sizes that can be + * supported by linux page table layout. + * For now we have + * Radix: 2M + * Hash: 16M and 16G + */ + if (radix_enabled()) { + if (mmu_psize != MMU_PAGE_2M) + return -EINVAL; + } else { + if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G) + return -EINVAL; + } +#endif + BUG_ON(mmu_psize_defs[mmu_psize].shift != shift); /* Return if huge page size has already been setup */ diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 9be992083d2a..8f6f2a173e47 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -71,10 +71,6 @@ #if H_PGTABLE_RANGE > USER_VSID_RANGE #warning Limited user VSID range means pagetable space is wasted #endif - -#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE) -#warning TASK_SIZE is smaller than it needs to be. -#endif #endif /* CONFIG_PPC_STD_MMU_64 */ phys_addr_t memstart_addr = ~0; diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c index a5d9ef59debe..b2111baa0da6 100644 --- a/arch/powerpc/mm/mmap.c +++ b/arch/powerpc/mm/mmap.c @@ -79,7 +79,7 @@ static inline unsigned long mmap_base(unsigned long rnd) else if (gap > MAX_GAP) gap = MAX_GAP; - return PAGE_ALIGN(TASK_SIZE - gap - rnd); + return PAGE_ALIGN(DEFAULT_MAP_WINDOW - gap - rnd); } #ifdef CONFIG_PPC_RADIX_MMU @@ -97,7 +97,10 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_area_struct *vma; struct vm_unmapped_area_info info; - if (len > TASK_SIZE - mmap_min_addr) + if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE)) + mm->context.addr_limit = TASK_SIZE; + + if (len > mm->context.addr_limit - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) @@ -106,7 +109,7 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + if (mm->context.addr_limit - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vma->vm_start)) return addr; } @@ -114,8 +117,13 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr, info.flags = 0; info.length = len; info.low_limit = mm->mmap_base; - info.high_limit = TASK_SIZE; info.align_mask = 0; + + if (unlikely(addr > DEFAULT_MAP_WINDOW)) + info.high_limit = mm->context.addr_limit; + else + info.high_limit = DEFAULT_MAP_WINDOW; + return vm_unmapped_area(&info); } @@ -131,8 +139,11 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr = addr0; struct vm_unmapped_area_info info; + if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE)) + mm->context.addr_limit = TASK_SIZE; + /* requested length too big for entire address space */ - if (len > TASK_SIZE - mmap_min_addr) + if (len > mm->context.addr_limit - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) @@ -142,7 +153,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + if (mm->context.addr_limit - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vma->vm_start)) return addr; } @@ -152,7 +163,14 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, info.low_limit = max(PAGE_SIZE, mmap_min_addr); info.high_limit = mm->mmap_base; info.align_mask = 0; + + if (addr > DEFAULT_MAP_WINDOW) + info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW; + addr = vm_unmapped_area(&info); + if (!(addr & ~PAGE_MASK)) + return addr; + VM_BUG_ON(addr != -ENOMEM); /* * A failed mmap() very likely causes application failure, @@ -160,15 +178,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp, * can happen with large stack limits and large mmap() * allocations. */ - if (addr & ~PAGE_MASK) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; - addr = vm_unmapped_area(&info); - } - - return addr; + return radix__arch_get_unmapped_area(filp, addr0, len, pgoff, flags); } static void radix__arch_pick_mmap_layout(struct mm_struct *mm, diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 73bf6e14c3aa..c6dca2ae78ef 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -30,17 +30,16 @@ static DEFINE_SPINLOCK(mmu_context_lock); static DEFINE_IDA(mmu_context_ida); -int __init_new_context(void) +static int alloc_context_id(int min_id, int max_id) { - int index; - int err; + int index, err; again: if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL)) return -ENOMEM; spin_lock(&mmu_context_lock); - err = ida_get_new_above(&mmu_context_ida, 1, &index); + err = ida_get_new_above(&mmu_context_ida, min_id, &index); spin_unlock(&mmu_context_lock); if (err == -EAGAIN) @@ -48,7 +47,7 @@ again: else if (err) return err; - if (index > MAX_USER_CONTEXT) { + if (index > max_id) { spin_lock(&mmu_context_lock); ida_remove(&mmu_context_ida, index); spin_unlock(&mmu_context_lock); @@ -57,48 +56,105 @@ again: return index; } -EXPORT_SYMBOL_GPL(__init_new_context); -static int radix__init_new_context(struct mm_struct *mm, int index) + +void hash__reserve_context_id(int id) +{ + int rc, result = 0; + + do { + if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL)) + break; + + spin_lock(&mmu_context_lock); + rc = ida_get_new_above(&mmu_context_ida, id, &result); + spin_unlock(&mmu_context_lock); + } while (rc == -EAGAIN); + + WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result); +} + +int hash__alloc_context_id(void) +{ + unsigned long max; + + if (mmu_has_feature(MMU_FTR_68_BIT_VA)) + max = MAX_USER_CONTEXT; + else + max = MAX_USER_CONTEXT_65BIT_VA; + + return alloc_context_id(MIN_USER_CONTEXT, max); +} +EXPORT_SYMBOL_GPL(hash__alloc_context_id); + +static int hash__init_new_context(struct mm_struct *mm) +{ + int index; + + index = hash__alloc_context_id(); + if (index < 0) + return index; + + /* + * We do switch_slb() early in fork, even before we setup the + * mm->context.addr_limit. Default to max task size so that we copy the + * default values to paca which will help us to handle slb miss early. + */ + mm->context.addr_limit = TASK_SIZE_128TB; + + /* + * The old code would re-promote on fork, we don't do that when using + * slices as it could cause problem promoting slices that have been + * forced down to 4K. + * + * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check + * explicitly against context.id == 0. This ensures that we properly + * initialize context slice details for newly allocated mm's (which will + * have id == 0) and don't alter context slice inherited via fork (which + * will have id != 0). + * + * We should not be calling init_new_context() on init_mm. Hence a + * check against 0 is OK. + */ + if (mm->context.id == 0) + slice_set_user_psize(mm, mmu_virtual_psize); + + subpage_prot_init_new_context(mm); + + return index; +} + +static int radix__init_new_context(struct mm_struct *mm) { unsigned long rts_field; + int index; + + index = alloc_context_id(1, PRTB_ENTRIES - 1); + if (index < 0) + return index; /* * set the process table entry, */ rts_field = radix__get_tree_size(); process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); - return 0; + + mm->context.npu_context = NULL; + + return index; } int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { int index; - index = __init_new_context(); + if (radix_enabled()) + index = radix__init_new_context(mm); + else + index = hash__init_new_context(mm); + if (index < 0) return index; - if (radix_enabled()) { - radix__init_new_context(mm, index); - } else { - - /* The old code would re-promote on fork, we don't do that - * when using slices as it could cause problem promoting slices - * that have been forced down to 4K - * - * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check - * explicitly against context.id == 0. This ensures that we - * properly initialize context slice details for newly allocated - * mm's (which will have id == 0) and don't alter context slice - * inherited via fork (which will have id != 0). - * - * We should not be calling init_new_context() on init_mm. Hence a - * check against 0 is ok. - */ - if (mm->context.id == 0) - slice_set_user_psize(mm, mmu_virtual_psize); - subpage_prot_init_new_context(mm); - } mm->context.id = index; #ifdef CONFIG_PPC_ICSWX mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL); diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index c491f2c8f2b9..4554d6527682 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -333,11 +333,6 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm) mm->context.id = MMU_NO_CONTEXT; mm->context.active = 0; - -#ifdef CONFIG_PPC_MM_SLICES - slice_set_user_psize(mm, mmu_virtual_psize); -#endif - return 0; } diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 9befaee237d6..371792e4418f 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -875,13 +875,6 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) void *nd; int tnid; - if (spanned_pages) - pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n", - nid, start_pfn << PAGE_SHIFT, - (end_pfn << PAGE_SHIFT) - 1); - else - pr_info("Initmem setup node %d\n", nid); - nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid); nd = __va(nd_pa); diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 5e01b2ece1d0..98ae810b8c21 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -229,7 +229,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm) asm volatile("slbie %0" : : "r" (slbie_data)); get_paca()->slb_cache_ptr = 0; - copy_mm_to_paca(&mm->context); + copy_mm_to_paca(mm); /* * preload some userspace segments into the SLB. diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index a85e06ea6c20..1519617aab36 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -23,6 +23,48 @@ #include <asm/pgtable.h> #include <asm/firmware.h> +/* + * This macro generates asm code to compute the VSID scramble + * function. Used in slb_allocate() and do_stab_bolted. The function + * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS + * + * rt = register containing the proto-VSID and into which the + * VSID will be stored + * rx = scratch register (clobbered) + * rf = flags + * + * - rt and rx must be different registers + * - The answer will end up in the low VSID_BITS bits of rt. The higher + * bits may contain other garbage, so you may need to mask the + * result. + */ +#define ASM_VSID_SCRAMBLE(rt, rx, rf, size) \ + lis rx,VSID_MULTIPLIER_##size@h; \ + ori rx,rx,VSID_MULTIPLIER_##size@l; \ + mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \ +/* \ + * powermac get slb fault before feature fixup, so make 65 bit part \ + * the default part of feature fixup \ + */ \ +BEGIN_MMU_FTR_SECTION \ + srdi rx,rt,VSID_BITS_65_##size; \ + clrldi rt,rt,(64-VSID_BITS_65_##size); \ + add rt,rt,rx; \ + addi rx,rt,1; \ + srdi rx,rx,VSID_BITS_65_##size; \ + add rt,rt,rx; \ + rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_65_##size)); \ +MMU_FTR_SECTION_ELSE \ + srdi rx,rt,VSID_BITS_##size; \ + clrldi rt,rt,(64-VSID_BITS_##size); \ + add rt,rt,rx; /* add high and low bits */ \ + addi rx,rt,1; \ + srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \ + add rt,rt,rx; \ + rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_##size)); \ +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA) + + /* void slb_allocate_realmode(unsigned long ea); * * Create an SLB entry for the given EA (user or kernel). @@ -45,13 +87,6 @@ _GLOBAL(slb_allocate_realmode) /* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */ blt cr7,0f /* user or kernel? */ - /* kernel address: proto-VSID = ESID */ - /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but - * this code will generate the protoVSID 0xfffffffff for the - * top segment. That's ok, the scramble below will translate - * it to VSID 0, which is reserved as a bad VSID - one which - * will never have any pages in it. */ - /* Check if hitting the linear mapping or some other kernel space */ bne cr7,1f @@ -63,12 +98,10 @@ _GLOBAL(slb_allocate_realmode) slb_miss_kernel_load_linear: li r11,0 /* - * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 + * context = (ea >> 60) - (0xc - 1) * r9 = region id. */ - addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha - addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l - + subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET BEGIN_FTR_SECTION b .Lslb_finish_load @@ -77,9 +110,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) 1: #ifdef CONFIG_SPARSEMEM_VMEMMAP - /* Check virtual memmap region. To be patches at kernel boot */ cmpldi cr0,r9,0xf bne 1f +/* Check virtual memmap region. To be patched at kernel boot */ .globl slb_miss_kernel_load_vmemmap slb_miss_kernel_load_vmemmap: li r11,0 @@ -102,11 +135,10 @@ slb_miss_kernel_load_io: li r11,0 6: /* - * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1 + * context = (ea >> 60) - (0xc - 1) * r9 = region id. */ - addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha - addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l + subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET BEGIN_FTR_SECTION b .Lslb_finish_load @@ -117,7 +149,13 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT) * For userspace addresses, make sure this is region 0. */ cmpdi r9, 0 - bne 8f + bne- 8f + /* + * user space make sure we are within the allowed limit + */ + ld r11,PACA_ADDR_LIMIT(r13) + cmpld r3,r11 + bge- 8f /* when using slices, we extract the psize off the slice bitmaps * and then we need to get the sllp encoding off the mmu_psize_defs @@ -189,13 +227,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) */ .Lslb_finish_load: rldimi r10,r9,ESID_BITS,0 - ASM_VSID_SCRAMBLE(r10,r9,256M) - /* - * bits above VSID_BITS_256M need to be ignored from r10 - * also combine VSID and flags - */ - rldimi r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M)) - + ASM_VSID_SCRAMBLE(r10,r9,r11,256M) /* r3 = EA, r11 = VSID data */ /* * Find a slot, round robin. Previously we tried to find a @@ -259,12 +291,12 @@ slb_compare_rr_to_size: .Lslb_finish_load_1T: srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */ rldimi r10,r9,ESID_BITS_1T,0 - ASM_VSID_SCRAMBLE(r10,r9,1T) + ASM_VSID_SCRAMBLE(r10,r9,r11,1T) /* * bits above VSID_BITS_1T need to be ignored from r10 * also combine VSID and flags */ - rldimi r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T)) + li r10,MMU_SEGSIZE_1T rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */ diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index 2b27458902ee..251b6bae7023 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -36,38 +36,29 @@ #include <asm/copro.h> #include <asm/hugetlb.h> -/* some sanity checks */ -#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE -#error H_PGTABLE_RANGE exceeds slice_mask high_slices size -#endif - static DEFINE_SPINLOCK(slice_convert_lock); - +/* + * One bit per slice. We have lower slices which cover 256MB segments + * upto 4G range. That gets us 16 low slices. For the rest we track slices + * in 1TB size. + */ +struct slice_mask { + u64 low_slices; + DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH); +}; #ifdef DEBUG int _slice_debug = 1; static void slice_print_mask(const char *label, struct slice_mask mask) { - char *p, buf[16 + 3 + 64 + 1]; - int i; - if (!_slice_debug) return; - p = buf; - for (i = 0; i < SLICE_NUM_LOW; i++) - *(p++) = (mask.low_slices & (1 << i)) ? '1' : '0'; - *(p++) = ' '; - *(p++) = '-'; - *(p++) = ' '; - for (i = 0; i < SLICE_NUM_HIGH; i++) - *(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0'; - *(p++) = 0; - - printk(KERN_DEBUG "%s:%s\n", label, buf); + pr_devel("%s low_slice: %*pbl\n", label, (int)SLICE_NUM_LOW, &mask.low_slices); + pr_devel("%s high_slice: %*pbl\n", label, (int)SLICE_NUM_HIGH, mask.high_slices); } -#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0) +#define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0) #else @@ -76,25 +67,28 @@ static void slice_print_mask(const char *label, struct slice_mask mask) {} #endif -static struct slice_mask slice_range_to_mask(unsigned long start, - unsigned long len) +static void slice_range_to_mask(unsigned long start, unsigned long len, + struct slice_mask *ret) { unsigned long end = start + len - 1; - struct slice_mask ret = { 0, 0 }; + + ret->low_slices = 0; + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); if (start < SLICE_LOW_TOP) { - unsigned long mend = min(end, SLICE_LOW_TOP); - unsigned long mstart = min(start, SLICE_LOW_TOP); + unsigned long mend = min(end, (SLICE_LOW_TOP - 1)); - ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) - - (1u << GET_LOW_SLICE_INDEX(mstart)); + ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1)) + - (1u << GET_LOW_SLICE_INDEX(start)); } - if ((start + len) > SLICE_LOW_TOP) - ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1)) - - (1ul << GET_HIGH_SLICE_INDEX(start)); + if ((start + len) > SLICE_LOW_TOP) { + unsigned long start_index = GET_HIGH_SLICE_INDEX(start); + unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT)); + unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index; - return ret; + bitmap_set(ret->high_slices, start_index, count); + } } static int slice_area_is_free(struct mm_struct *mm, unsigned long addr, @@ -128,53 +122,60 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice) return !slice_area_is_free(mm, start, end - start); } -static struct slice_mask slice_mask_for_free(struct mm_struct *mm) +static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret) { - struct slice_mask ret = { 0, 0 }; unsigned long i; + ret->low_slices = 0; + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); + for (i = 0; i < SLICE_NUM_LOW; i++) if (!slice_low_has_vma(mm, i)) - ret.low_slices |= 1u << i; + ret->low_slices |= 1u << i; if (mm->task_size <= SLICE_LOW_TOP) - return ret; + return; - for (i = 0; i < SLICE_NUM_HIGH; i++) + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) if (!slice_high_has_vma(mm, i)) - ret.high_slices |= 1ul << i; - - return ret; + __set_bit(i, ret->high_slices); } -static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize) +static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret) { unsigned char *hpsizes; int index, mask_index; - struct slice_mask ret = { 0, 0 }; unsigned long i; u64 lpsizes; + ret->low_slices = 0; + bitmap_zero(ret->high_slices, SLICE_NUM_HIGH); + lpsizes = mm->context.low_slices_psize; for (i = 0; i < SLICE_NUM_LOW; i++) if (((lpsizes >> (i * 4)) & 0xf) == psize) - ret.low_slices |= 1u << i; + ret->low_slices |= 1u << i; hpsizes = mm->context.high_slices_psize; - for (i = 0; i < SLICE_NUM_HIGH; i++) { + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) { mask_index = i & 0x1; index = i >> 1; if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize) - ret.high_slices |= 1ul << i; + __set_bit(i, ret->high_slices); } - - return ret; } -static int slice_check_fit(struct slice_mask mask, struct slice_mask available) +static int slice_check_fit(struct mm_struct *mm, + struct slice_mask mask, struct slice_mask available) { + DECLARE_BITMAP(result, SLICE_NUM_HIGH); + unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.addr_limit); + + bitmap_and(result, mask.high_slices, + available.high_slices, slice_count); + return (mask.low_slices & available.low_slices) == mask.low_slices && - (mask.high_slices & available.high_slices) == mask.high_slices; + bitmap_equal(result, mask.high_slices, slice_count); } static void slice_flush_segments(void *parm) @@ -185,7 +186,7 @@ static void slice_flush_segments(void *parm) if (mm != current->active_mm) return; - copy_mm_to_paca(¤t->active_mm->context); + copy_mm_to_paca(current->active_mm); local_irq_save(flags); slb_flush_and_rebolt(); @@ -218,18 +219,18 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz mm->context.low_slices_psize = lpsizes; hpsizes = mm->context.high_slices_psize; - for (i = 0; i < SLICE_NUM_HIGH; i++) { + for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) { mask_index = i & 0x1; index = i >> 1; - if (mask.high_slices & (1ul << i)) + if (test_bit(i, mask.high_slices)) hpsizes[index] = (hpsizes[index] & ~(0xf << (mask_index * 4))) | (((unsigned long)psize) << (mask_index * 4)); } slice_dbg(" lsps=%lx, hsps=%lx\n", - mm->context.low_slices_psize, - mm->context.high_slices_psize); + (unsigned long)mm->context.low_slices_psize, + (unsigned long)mm->context.high_slices_psize); spin_unlock_irqrestore(&slice_convert_lock, flags); @@ -257,14 +258,14 @@ static bool slice_scan_available(unsigned long addr, slice = GET_HIGH_SLICE_INDEX(addr); *boundary_addr = (slice + end) ? ((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP; - return !!(available.high_slices & (1ul << slice)); + return !!test_bit(slice, available.high_slices); } } static unsigned long slice_find_area_bottomup(struct mm_struct *mm, unsigned long len, struct slice_mask available, - int psize) + int psize, unsigned long high_limit) { int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); unsigned long addr, found, next_end; @@ -276,7 +277,10 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm, info.align_offset = 0; addr = TASK_UNMAPPED_BASE; - while (addr < TASK_SIZE) { + /* + * Check till the allow max value for this mmap request + */ + while (addr < high_limit) { info.low_limit = addr; if (!slice_scan_available(addr, available, 1, &addr)) continue; @@ -288,8 +292,8 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm, * Check if we need to reduce the range, or if we can * extend it to cover the next available slice. */ - if (addr >= TASK_SIZE) - addr = TASK_SIZE; + if (addr >= mm->context.addr_limit) + addr = mm->context.addr_limit; else if (slice_scan_available(addr, available, 1, &next_end)) { addr = next_end; goto next_slice; @@ -307,7 +311,7 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm, static unsigned long slice_find_area_topdown(struct mm_struct *mm, unsigned long len, struct slice_mask available, - int psize) + int psize, unsigned long high_limit) { int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); unsigned long addr, found, prev; @@ -319,6 +323,15 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, info.align_offset = 0; addr = mm->mmap_base; + /* + * If we are trying to allocate above DEFAULT_MAP_WINDOW + * Add the different to the mmap_base. + * Only for that request for which high_limit is above + * DEFAULT_MAP_WINDOW we should apply this. + */ + if (high_limit > DEFAULT_MAP_WINDOW) + addr += mm->context.addr_limit - DEFAULT_MAP_WINDOW; + while (addr > PAGE_SIZE) { info.high_limit = addr; if (!slice_scan_available(addr - 1, available, 0, &addr)) @@ -350,29 +363,38 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm, * can happen with large stack limits and large mmap() * allocations. */ - return slice_find_area_bottomup(mm, len, available, psize); + return slice_find_area_bottomup(mm, len, available, psize, high_limit); } static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len, struct slice_mask mask, int psize, - int topdown) + int topdown, unsigned long high_limit) { if (topdown) - return slice_find_area_topdown(mm, len, mask, psize); + return slice_find_area_topdown(mm, len, mask, psize, high_limit); else - return slice_find_area_bottomup(mm, len, mask, psize); + return slice_find_area_bottomup(mm, len, mask, psize, high_limit); } -#define or_mask(dst, src) do { \ - (dst).low_slices |= (src).low_slices; \ - (dst).high_slices |= (src).high_slices; \ -} while (0) +static inline void slice_or_mask(struct slice_mask *dst, struct slice_mask *src) +{ + DECLARE_BITMAP(result, SLICE_NUM_HIGH); + + dst->low_slices |= src->low_slices; + bitmap_or(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH); + bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH); +} -#define andnot_mask(dst, src) do { \ - (dst).low_slices &= ~(src).low_slices; \ - (dst).high_slices &= ~(src).high_slices; \ -} while (0) +static inline void slice_andnot_mask(struct slice_mask *dst, struct slice_mask *src) +{ + DECLARE_BITMAP(result, SLICE_NUM_HIGH); + + dst->low_slices &= ~src->low_slices; + + bitmap_andnot(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH); + bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH); +} #ifdef CONFIG_PPC_64K_PAGES #define MMU_PAGE_BASE MMU_PAGE_64K @@ -384,14 +406,42 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, unsigned long flags, unsigned int psize, int topdown) { - struct slice_mask mask = {0, 0}; + struct slice_mask mask; struct slice_mask good_mask; - struct slice_mask potential_mask = {0,0} /* silence stupid warning */; - struct slice_mask compat_mask = {0, 0}; + struct slice_mask potential_mask; + struct slice_mask compat_mask; int fixed = (flags & MAP_FIXED); int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); struct mm_struct *mm = current->mm; unsigned long newaddr; + unsigned long high_limit; + + /* + * Check if we need to expland slice area. + */ + if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE)) { + mm->context.addr_limit = TASK_SIZE; + on_each_cpu(slice_flush_segments, mm, 1); + } + /* + * This mmap request can allocate upt to 512TB + */ + if (addr > DEFAULT_MAP_WINDOW) + high_limit = mm->context.addr_limit; + else + high_limit = DEFAULT_MAP_WINDOW; + /* + * init different masks + */ + mask.low_slices = 0; + bitmap_zero(mask.high_slices, SLICE_NUM_HIGH); + + /* silence stupid warning */; + potential_mask.low_slices = 0; + bitmap_zero(potential_mask.high_slices, SLICE_NUM_HIGH); + + compat_mask.low_slices = 0; + bitmap_zero(compat_mask.high_slices, SLICE_NUM_HIGH); /* Sanity checks */ BUG_ON(mm->task_size == 0); @@ -423,7 +473,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* First make up a "good" mask of slices that have the right size * already */ - good_mask = slice_mask_for_size(mm, psize); + slice_mask_for_size(mm, psize, &good_mask); slice_print_mask(" good_mask", good_mask); /* @@ -448,22 +498,22 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, #ifdef CONFIG_PPC_64K_PAGES /* If we support combo pages, we can allow 64k pages in 4k slices */ if (psize == MMU_PAGE_64K) { - compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); + slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask); if (fixed) - or_mask(good_mask, compat_mask); + slice_or_mask(&good_mask, &compat_mask); } #endif /* First check hint if it's valid or if we have MAP_FIXED */ if (addr != 0 || fixed) { /* Build a mask for the requested range */ - mask = slice_range_to_mask(addr, len); + slice_range_to_mask(addr, len, &mask); slice_print_mask(" mask", mask); /* Check if we fit in the good mask. If we do, we just return, * nothing else to do */ - if (slice_check_fit(mask, good_mask)) { + if (slice_check_fit(mm, mask, good_mask)) { slice_dbg(" fits good !\n"); return addr; } @@ -471,7 +521,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* Now let's see if we can find something in the existing * slices for that size */ - newaddr = slice_find_area(mm, len, good_mask, psize, topdown); + newaddr = slice_find_area(mm, len, good_mask, + psize, topdown, high_limit); if (newaddr != -ENOMEM) { /* Found within the good mask, we don't have to setup, * we thus return directly @@ -484,11 +535,11 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* We don't fit in the good mask, check what other slices are * empty and thus can be converted */ - potential_mask = slice_mask_for_free(mm); - or_mask(potential_mask, good_mask); + slice_mask_for_free(mm, &potential_mask); + slice_or_mask(&potential_mask, &good_mask); slice_print_mask(" potential", potential_mask); - if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) { + if ((addr != 0 || fixed) && slice_check_fit(mm, mask, potential_mask)) { slice_dbg(" fits potential !\n"); goto convert; } @@ -503,7 +554,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, * anywhere in the good area. */ if (addr) { - addr = slice_find_area(mm, len, good_mask, psize, topdown); + addr = slice_find_area(mm, len, good_mask, + psize, topdown, high_limit); if (addr != -ENOMEM) { slice_dbg(" found area at 0x%lx\n", addr); return addr; @@ -513,28 +565,29 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, /* Now let's see if we can find something in the existing slices * for that size plus free slices */ - addr = slice_find_area(mm, len, potential_mask, psize, topdown); + addr = slice_find_area(mm, len, potential_mask, + psize, topdown, high_limit); #ifdef CONFIG_PPC_64K_PAGES if (addr == -ENOMEM && psize == MMU_PAGE_64K) { /* retry the search with 4k-page slices included */ - or_mask(potential_mask, compat_mask); - addr = slice_find_area(mm, len, potential_mask, psize, - topdown); + slice_or_mask(&potential_mask, &compat_mask); + addr = slice_find_area(mm, len, potential_mask, + psize, topdown, high_limit); } #endif if (addr == -ENOMEM) return -ENOMEM; - mask = slice_range_to_mask(addr, len); + slice_range_to_mask(addr, len, &mask); slice_dbg(" found potential area at 0x%lx\n", addr); slice_print_mask(" mask", mask); convert: - andnot_mask(mask, good_mask); - andnot_mask(mask, compat_mask); - if (mask.low_slices || mask.high_slices) { + slice_andnot_mask(&mask, &good_mask); + slice_andnot_mask(&mask, &compat_mask); + if (mask.low_slices || !bitmap_empty(mask.high_slices, SLICE_NUM_HIGH)) { slice_convert(mm, mask, psize); if (psize > MMU_PAGE_BASE) on_each_cpu(slice_flush_segments, mm, 1); @@ -649,8 +702,8 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) slice_dbg(" lsps=%lx, hsps=%lx\n", - mm->context.low_slices_psize, - mm->context.high_slices_psize); + (unsigned long)mm->context.low_slices_psize, + (unsigned long)mm->context.high_slices_psize); bail: spin_unlock_irqrestore(&slice_convert_lock, flags); @@ -659,9 +712,11 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize) void slice_set_range_psize(struct mm_struct *mm, unsigned long start, unsigned long len, unsigned int psize) { - struct slice_mask mask = slice_range_to_mask(start, len); + struct slice_mask mask; VM_BUG_ON(radix_enabled()); + + slice_range_to_mask(start, len, &mask); slice_convert(mm, mask, psize); } @@ -694,14 +749,14 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, if (radix_enabled()) return 0; - mask = slice_range_to_mask(addr, len); - available = slice_mask_for_size(mm, psize); + slice_range_to_mask(addr, len, &mask); + slice_mask_for_size(mm, psize, &available); #ifdef CONFIG_PPC_64K_PAGES /* We need to account for 4k slices too */ if (psize == MMU_PAGE_64K) { struct slice_mask compat_mask; - compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K); - or_mask(available, compat_mask); + slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask); + slice_or_mask(&available, &compat_mask); } #endif @@ -711,6 +766,6 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr, slice_print_mask(" mask", mask); slice_print_mask(" available", available); #endif - return !slice_check_fit(mask, available); + return !slice_check_fit(mm, mask, available); } #endif diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index 94210940112f..a409f78d206b 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -197,7 +197,8 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) /* Check parameters */ if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) || - addr >= TASK_SIZE || len >= TASK_SIZE || addr + len > TASK_SIZE) + addr >= mm->context.addr_limit || len >= mm->context.addr_limit || + addr + len > mm->context.addr_limit) return -EINVAL; if (is_hugepage_only_range(mm, addr, len)) diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 952713d6cf04..b68b5219cf45 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -34,10 +34,8 @@ static inline void __tlbiel_pid(unsigned long pid, int set, prs = 1; /* process scoped */ r = 1; /* raidx format */ - asm volatile("ptesync": : :"memory"); asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - asm volatile("ptesync": : :"memory"); } /* @@ -47,9 +45,11 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric) { int set; + asm volatile("ptesync": : :"memory"); for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) { __tlbiel_pid(pid, set, ric); } + asm volatile("ptesync": : :"memory"); asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } @@ -129,6 +129,12 @@ void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) { unsigned long pid; struct mm_struct *mm = tlb->mm; + /* + * If we are doing a full mm flush, we will do a tlb flush + * with RIC_FLUSH_ALL later. + */ + if (tlb->fullmm) + return; preempt_disable(); @@ -195,6 +201,12 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) unsigned long pid; struct mm_struct *mm = tlb->mm; + /* + * If we are doing a full mm flush, we will do a tlb flush + * with RIC_FLUSH_ALL later. + */ + if (tlb->fullmm) + return; preempt_disable(); pid = mm->context.id; @@ -437,7 +449,7 @@ void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm, return; } - if (old_pte & _PAGE_LARGE) + if (old_pte & R_PAGE_LARGE) radix__flush_tlb_page_psize(mm, address, MMU_PAGE_2M); else radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize); diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c index ba28fcb98597..bfc4a0869609 100644 --- a/arch/powerpc/mm/tlb_nohash.c +++ b/arch/powerpc/mm/tlb_nohash.c @@ -770,7 +770,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base, * avoid going over total available memory just in case... */ #ifdef CONFIG_PPC_FSL_BOOK3E - if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { + if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) { unsigned long linear_sz; unsigned int num_cams; diff --git a/arch/powerpc/platforms/44x/sam440ep.c b/arch/powerpc/platforms/44x/sam440ep.c index 688ffeab0699..55fed5e4de14 100644 --- a/arch/powerpc/platforms/44x/sam440ep.c +++ b/arch/powerpc/platforms/44x/sam440ep.c @@ -70,7 +70,7 @@ static struct i2c_board_info sam440ep_rtc_info = { .irq = -1, }; -static int sam440ep_setup_rtc(void) +static int __init sam440ep_setup_rtc(void) { return i2c_register_board_info(0, &sam440ep_rtc_info, 1); } diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 9b25cded03e9..ef4c4b8fc547 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -359,7 +359,7 @@ config PPC_BOOK3E_MMU config PPC_MM_SLICES bool - default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES) + default y if PPC_STD_MMU_64 default n config PPC_HAVE_PMU_SUPPORT diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index 8b55c5f19d4c..8d3ae2cc52bf 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -15,9 +15,9 @@ #include <linux/msi.h> #include <linux/export.h> #include <linux/of_platform.h> -#include <linux/debugfs.h> #include <linux/slab.h> +#include <asm/debugfs.h> #include <asm/dcr.h> #include <asm/machdep.h> #include <asm/prom.h> diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig index 9689a6272995..2489805e79f1 100644 --- a/arch/powerpc/platforms/powernv/Kconfig +++ b/arch/powerpc/platforms/powernv/Kconfig @@ -20,6 +20,7 @@ config PPC_POWERNV select CPU_FREQ_GOV_ONDEMAND select CPU_FREQ_GOV_CONSERVATIVE select PPC_DOORBELL + select MMU_NOTIFIER default y config OPAL_PRD diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 4ee837e6391a..b369e39aa392 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -122,9 +122,12 @@ static void pnv_alloc_idle_core_states(void) for (i = 0; i < nr_cores; i++) { int first_cpu = i * threads_per_core; int node = cpu_to_node(first_cpu); + size_t paca_ptr_array_size; core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); *core_idle_state = PNV_CORE_IDLE_THREAD_BITS; + paca_ptr_array_size = (threads_per_core * + sizeof(struct paca_struct *)); for (j = 0; j < threads_per_core; j++) { int cpu = first_cpu + j; @@ -132,6 +135,11 @@ static void pnv_alloc_idle_core_states(void) paca[cpu].core_idle_state_ptr = core_idle_state; paca[cpu].thread_idle_state = PNV_THREAD_RUNNING; paca[cpu].thread_mask = 1 << j; + if (!cpu_has_feature(CPU_FTR_POWER9_DD1)) + continue; + paca[cpu].thread_sibling_pacas = + kmalloc_node(paca_ptr_array_size, + GFP_KERNEL, node); } } @@ -147,7 +155,6 @@ u32 pnv_get_supported_cpuidle_states(void) } EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states); - static void pnv_fastsleep_workaround_apply(void *info) { @@ -241,8 +248,9 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600, * The default stop state that will be used by ppc_md.power_save * function on platforms that support stop instruction. */ -u64 pnv_default_stop_val; -u64 pnv_default_stop_mask; +static u64 pnv_default_stop_val; +static u64 pnv_default_stop_mask; +static bool default_stop_found; /* * Used for ppc_md.power_save which needs a function with no parameters @@ -262,8 +270,42 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE; * psscr value and mask of the deepest stop idle state. * Used when a cpu is offlined. */ -u64 pnv_deepest_stop_psscr_val; -u64 pnv_deepest_stop_psscr_mask; +static u64 pnv_deepest_stop_psscr_val; +static u64 pnv_deepest_stop_psscr_mask; +static bool deepest_stop_found; + +/* + * pnv_cpu_offline: A function that puts the CPU into the deepest + * available platform idle state on a CPU-Offline. + */ +unsigned long pnv_cpu_offline(unsigned int cpu) +{ + unsigned long srr1; + + u32 idle_states = pnv_get_supported_cpuidle_states(); + + if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) { + srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, + pnv_deepest_stop_psscr_mask); + } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { + srr1 = power7_winkle(); + } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || + (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { + srr1 = power7_sleep(); + } else if (idle_states & OPAL_PM_NAP_ENABLED) { + srr1 = power7_nap(1); + } else { + /* This is the fallback method. We emulate snooze */ + while (!generic_check_cpu_restart(cpu)) { + HMT_low(); + HMT_very_low(); + } + srr1 = 0; + HMT_medium(); + } + + return srr1; +} /* * Power ISA 3.0 idle initialization. @@ -352,7 +394,6 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags, u32 *residency_ns = NULL; u64 max_residency_ns = 0; int rc = 0, i; - bool default_stop_found = false, deepest_stop_found = false; psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL); psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL); @@ -432,21 +473,24 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags, } } - if (!default_stop_found) { - pnv_default_stop_val = PSSCR_HV_DEFAULT_VAL; - pnv_default_stop_mask = PSSCR_HV_DEFAULT_MASK; - pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n", + if (unlikely(!default_stop_found)) { + pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n"); + } else { + ppc_md.power_save = power9_idle; + pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n", pnv_default_stop_val, pnv_default_stop_mask); } - if (!deepest_stop_found) { - pnv_deepest_stop_psscr_val = PSSCR_HV_DEFAULT_VAL; - pnv_deepest_stop_psscr_mask = PSSCR_HV_DEFAULT_MASK; - pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n", + if (unlikely(!deepest_stop_found)) { + pr_warn("cpuidle-powernv: No suitable stop state for CPU-Hotplug. Offlined CPUs will busy wait"); + } else { + pr_info("cpuidle-powernv: Deepest stop: psscr = 0x%016llx,mask=0x%016llx\n", pnv_deepest_stop_psscr_val, pnv_deepest_stop_psscr_mask); } + pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n", + pnv_first_deep_stop_state); out: kfree(psscr_val); kfree(psscr_mask); @@ -524,10 +568,30 @@ static int __init pnv_init_idle_states(void) pnv_alloc_idle_core_states(); + /* + * For each CPU, record its PACA address in each of it's + * sibling thread's PACA at the slot corresponding to this + * CPU's index in the core. + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + int cpu; + + pr_info("powernv: idle: Saving PACA pointers of all CPUs in their thread sibling PACA\n"); + for_each_possible_cpu(cpu) { + int base_cpu = cpu_first_thread_sibling(cpu); + int idx = cpu_thread_in_core(cpu); + int i; + + for (i = 0; i < threads_per_core; i++) { + int j = base_cpu + i; + + paca[j].thread_sibling_pacas[idx] = &paca[cpu]; + } + } + } + if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) ppc_md.power_save = power7_idle; - else if (supported_cpuidle_states & OPAL_PM_STOP_INST_FAST) - ppc_md.power_save = power9_idle; out: return 0; diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c index 1c383f38031d..4c88c3e6ec9e 100644 --- a/arch/powerpc/platforms/powernv/npu-dma.c +++ b/arch/powerpc/platforms/powernv/npu-dma.c @@ -9,11 +9,20 @@ * License as published by the Free Software Foundation. */ +#include <linux/slab.h> +#include <linux/mmu_notifier.h> +#include <linux/mmu_context.h> +#include <linux/of.h> #include <linux/export.h> #include <linux/pci.h> #include <linux/memblock.h> #include <linux/iommu.h> +#include <asm/tlb.h> +#include <asm/powernv.h> +#include <asm/reg.h> +#include <asm/opal.h> +#include <asm/io.h> #include <asm/iommu.h> #include <asm/pnv-pci.h> #include <asm/msi_bitmap.h> @@ -22,6 +31,8 @@ #include "powernv.h" #include "pci.h" +#define npu_to_phb(x) container_of(x, struct pnv_phb, npu) + /* * Other types of TCE cache invalidation are not functional in the * hardware. @@ -37,6 +48,12 @@ struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev) struct device_node *dn; struct pci_dev *gpdev; + if (WARN_ON(!npdev)) + return NULL; + + if (WARN_ON(!npdev->dev.of_node)) + return NULL; + /* Get assoicated PCI device */ dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0); if (!dn) @@ -55,6 +72,12 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index) struct device_node *dn; struct pci_dev *npdev; + if (WARN_ON(!gpdev)) + return NULL; + + if (WARN_ON(!gpdev->dev.of_node)) + return NULL; + /* Get assoicated PCI device */ dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index); if (!dn) @@ -359,3 +382,442 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) return gpe; } + +/* Maximum number of nvlinks per npu */ +#define NV_MAX_LINKS 6 + +/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */ +static int max_npu2_index; + +struct npu_context { + struct mm_struct *mm; + struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS]; + struct mmu_notifier mn; + struct kref kref; + + /* Callback to stop translation requests on a given GPU */ + struct npu_context *(*release_cb)(struct npu_context *, void *); + + /* + * Private pointer passed to the above callback for usage by + * device drivers. + */ + void *priv; +}; + +/* + * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC + * if none are available. + */ +static int get_mmio_atsd_reg(struct npu *npu) +{ + int i; + + for (i = 0; i < npu->mmio_atsd_count; i++) { + if (!test_and_set_bit(i, &npu->mmio_atsd_usage)) + return i; + } + + return -ENOSPC; +} + +static void put_mmio_atsd_reg(struct npu *npu, int reg) +{ + clear_bit(reg, &npu->mmio_atsd_usage); +} + +/* MMIO ATSD register offsets */ +#define XTS_ATSD_AVA 1 +#define XTS_ATSD_STAT 2 + +static int mmio_launch_invalidate(struct npu *npu, unsigned long launch, + unsigned long va) +{ + int mmio_atsd_reg; + + do { + mmio_atsd_reg = get_mmio_atsd_reg(npu); + cpu_relax(); + } while (mmio_atsd_reg < 0); + + __raw_writeq(cpu_to_be64(va), + npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA); + eieio(); + __raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]); + + return mmio_atsd_reg; +} + +static int mmio_invalidate_pid(struct npu *npu, unsigned long pid) +{ + unsigned long launch; + + /* IS set to invalidate matching PID */ + launch = PPC_BIT(12); + + /* PRS set to process-scoped */ + launch |= PPC_BIT(13); + + /* AP */ + launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); + + /* PID */ + launch |= pid << PPC_BITLSHIFT(38); + + /* Invalidating the entire process doesn't use a va */ + return mmio_launch_invalidate(npu, launch, 0); +} + +static int mmio_invalidate_va(struct npu *npu, unsigned long va, + unsigned long pid) +{ + unsigned long launch; + + /* IS set to invalidate target VA */ + launch = 0; + + /* PRS set to process scoped */ + launch |= PPC_BIT(13); + + /* AP */ + launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); + + /* PID */ + launch |= pid << PPC_BITLSHIFT(38); + + return mmio_launch_invalidate(npu, launch, va); +} + +#define mn_to_npu_context(x) container_of(x, struct npu_context, mn) + +/* + * Invalidate either a single address or an entire PID depending on + * the value of va. + */ +static void mmio_invalidate(struct npu_context *npu_context, int va, + unsigned long address) +{ + int i, j, reg; + struct npu *npu; + struct pnv_phb *nphb; + struct pci_dev *npdev; + struct { + struct npu *npu; + int reg; + } mmio_atsd_reg[NV_MAX_NPUS]; + unsigned long pid = npu_context->mm->context.id; + + /* + * Loop over all the NPUs this process is active on and launch + * an invalidate. + */ + for (i = 0; i <= max_npu2_index; i++) { + mmio_atsd_reg[i].reg = -1; + for (j = 0; j < NV_MAX_LINKS; j++) { + npdev = npu_context->npdev[i][j]; + if (!npdev) + continue; + + nphb = pci_bus_to_host(npdev->bus)->private_data; + npu = &nphb->npu; + mmio_atsd_reg[i].npu = npu; + + if (va) + mmio_atsd_reg[i].reg = + mmio_invalidate_va(npu, address, pid); + else + mmio_atsd_reg[i].reg = + mmio_invalidate_pid(npu, pid); + + /* + * The NPU hardware forwards the shootdown to all GPUs + * so we only have to launch one shootdown per NPU. + */ + break; + } + } + + /* + * Unfortunately the nest mmu does not support flushing specific + * addresses so we have to flush the whole mm. + */ + flush_tlb_mm(npu_context->mm); + + /* Wait for all invalidations to complete */ + for (i = 0; i <= max_npu2_index; i++) { + if (mmio_atsd_reg[i].reg < 0) + continue; + + /* Wait for completion */ + npu = mmio_atsd_reg[i].npu; + reg = mmio_atsd_reg[i].reg; + while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT)) + cpu_relax(); + put_mmio_atsd_reg(npu, reg); + } +} + +static void pnv_npu2_mn_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + + /* Call into device driver to stop requests to the NMMU */ + if (npu_context->release_cb) + npu_context->release_cb(npu_context, npu_context->priv); + + /* + * There should be no more translation requests for this PID, but we + * need to ensure any entries for it are removed from the TLB. + */ + mmio_invalidate(npu_context, 0, 0); +} + +static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address, + pte_t pte) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + + mmio_invalidate(npu_context, 1, address); +} + +static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + + mmio_invalidate(npu_context, 1, address); +} + +static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + unsigned long address; + + for (address = start; address <= end; address += PAGE_SIZE) + mmio_invalidate(npu_context, 1, address); +} + +static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { + .release = pnv_npu2_mn_release, + .change_pte = pnv_npu2_mn_change_pte, + .invalidate_page = pnv_npu2_mn_invalidate_page, + .invalidate_range = pnv_npu2_mn_invalidate_range, +}; + +/* + * Call into OPAL to setup the nmmu context for the current task in + * the NPU. This must be called to setup the context tables before the + * GPU issues ATRs. pdev should be a pointed to PCIe GPU device. + * + * A release callback should be registered to allow a device driver to + * be notified that it should not launch any new translation requests + * as the final TLB invalidate is about to occur. + * + * Returns an error if there no contexts are currently available or a + * npu_context which should be passed to pnv_npu2_handle_fault(). + * + * mmap_sem must be held in write mode. + */ +struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, + unsigned long flags, + struct npu_context *(*cb)(struct npu_context *, void *), + void *priv) +{ + int rc; + u32 nvlink_index; + struct device_node *nvlink_dn; + struct mm_struct *mm = current->mm; + struct pnv_phb *nphb; + struct npu *npu; + struct npu_context *npu_context; + + /* + * At present we don't support GPUs connected to multiple NPUs and I'm + * not sure the hardware does either. + */ + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return ERR_PTR(-ENODEV); + + if (!npdev) + /* No nvlink associated with this GPU device */ + return ERR_PTR(-ENODEV); + + if (!mm) { + /* kernel thread contexts are not supported */ + return ERR_PTR(-EINVAL); + } + + nphb = pci_bus_to_host(npdev->bus)->private_data; + npu = &nphb->npu; + + /* + * Setup the NPU context table for a particular GPU. These need to be + * per-GPU as we need the tables to filter ATSDs when there are no + * active contexts on a particular GPU. + */ + rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags, + PCI_DEVID(gpdev->bus->number, gpdev->devfn)); + if (rc < 0) + return ERR_PTR(-ENOSPC); + + /* + * We store the npu pci device so we can more easily get at the + * associated npus. + */ + npu_context = mm->context.npu_context; + if (!npu_context) { + npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL); + if (!npu_context) + return ERR_PTR(-ENOMEM); + + mm->context.npu_context = npu_context; + npu_context->mm = mm; + npu_context->mn.ops = &nv_nmmu_notifier_ops; + __mmu_notifier_register(&npu_context->mn, mm); + kref_init(&npu_context->kref); + } else { + kref_get(&npu_context->kref); + } + + npu_context->release_cb = cb; + npu_context->priv = priv; + nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); + if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", + &nvlink_index))) + return ERR_PTR(-ENODEV); + npu_context->npdev[npu->index][nvlink_index] = npdev; + + return npu_context; +} +EXPORT_SYMBOL(pnv_npu2_init_context); + +static void pnv_npu2_release_context(struct kref *kref) +{ + struct npu_context *npu_context = + container_of(kref, struct npu_context, kref); + + npu_context->mm->context.npu_context = NULL; + mmu_notifier_unregister(&npu_context->mn, + npu_context->mm); + + kfree(npu_context); +} + +void pnv_npu2_destroy_context(struct npu_context *npu_context, + struct pci_dev *gpdev) +{ + struct pnv_phb *nphb, *phb; + struct npu *npu; + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); + struct device_node *nvlink_dn; + u32 nvlink_index; + + if (WARN_ON(!npdev)) + return; + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return; + + nphb = pci_bus_to_host(npdev->bus)->private_data; + npu = &nphb->npu; + phb = pci_bus_to_host(gpdev->bus)->private_data; + nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); + if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", + &nvlink_index))) + return; + npu_context->npdev[npu->index][nvlink_index] = NULL; + opal_npu_destroy_context(phb->opal_id, npu_context->mm->context.id, + PCI_DEVID(gpdev->bus->number, gpdev->devfn)); + kref_put(&npu_context->kref, pnv_npu2_release_context); +} +EXPORT_SYMBOL(pnv_npu2_destroy_context); + +/* + * Assumes mmap_sem is held for the contexts associated mm. + */ +int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, + unsigned long *flags, unsigned long *status, int count) +{ + u64 rc = 0, result = 0; + int i, is_write; + struct page *page[1]; + + /* mmap_sem should be held so the struct_mm must be present */ + struct mm_struct *mm = context->mm; + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return -ENODEV; + + WARN_ON(!rwsem_is_locked(&mm->mmap_sem)); + + for (i = 0; i < count; i++) { + is_write = flags[i] & NPU2_WRITE; + rc = get_user_pages_remote(NULL, mm, ea[i], 1, + is_write ? FOLL_WRITE : 0, + page, NULL, NULL); + + /* + * To support virtualised environments we will have to do an + * access to the page to ensure it gets faulted into the + * hypervisor. For the moment virtualisation is not supported in + * other areas so leave the access out. + */ + if (rc != 1) { + status[i] = rc; + result = -EFAULT; + continue; + } + + status[i] = 0; + put_page(page[0]); + } + + return result; +} +EXPORT_SYMBOL(pnv_npu2_handle_fault); + +int pnv_npu2_init(struct pnv_phb *phb) +{ + unsigned int i; + u64 mmio_atsd; + struct device_node *dn; + struct pci_dev *gpdev; + static int npu_index; + uint64_t rc = 0; + + for_each_child_of_node(phb->hose->dn, dn) { + gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn)); + if (gpdev) { + rc = opal_npu_map_lpar(phb->opal_id, + PCI_DEVID(gpdev->bus->number, gpdev->devfn), + 0, 0); + if (rc) + dev_err(&gpdev->dev, + "Error %lld mapping device to LPAR\n", + rc); + } + } + + for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd", + i, &mmio_atsd); i++) + phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32); + + pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i); + phb->npu.mmio_atsd_count = i; + phb->npu.mmio_atsd_usage = 0; + npu_index++; + if (WARN_ON(npu_index >= NV_MAX_NPUS)) + return -ENOSPC; + max_npu2_index = npu_index; + phb->npu.index = npu_index; + + return 0; +} diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index a91d7876fae2..6c7ad1d8b32e 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -12,7 +12,6 @@ #include <linux/kernel.h> #include <linux/of.h> #include <linux/bug.h> -#include <linux/debugfs.h> #include <linux/io.h> #include <linux/slab.h> @@ -21,7 +20,7 @@ #include <asm/opal.h> #include <asm/prom.h> #include <linux/uaccess.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/isa-bridge.h> static int opal_lpc_chip_id = -1; diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c index 308efd170c27..aa267f120033 100644 --- a/arch/powerpc/platforms/powernv/opal-sensor.c +++ b/arch/powerpc/platforms/powernv/opal-sensor.c @@ -64,6 +64,10 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data) *sensor_data = be32_to_cpu(data); break; + case OPAL_WRONG_STATE: + ret = -EIO; + break; + default: ret = opal_error_code(ret); break; diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 085605a73168..f620572f891f 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -50,21 +50,13 @@ END_FTR_SECTION(0, 1); \ #define OPAL_BRANCH(LABEL) #endif -/* TODO: - * - * - Trace irqs in/off (needs saving/restoring all args, argh...) - * - Get r11 feed up by Dave so I can have better register usage +/* + * DO_OPAL_CALL assumes: + * r0 = opal call token + * r12 = msr + * LR has been saved */ - -#define OPAL_CALL(name, token) \ - _GLOBAL_TOC(name); \ - mfmsr r12; \ - mflr r0; \ - andi. r11,r12,MSR_IR|MSR_DR; \ - std r0,PPC_LR_STKOFF(r1); \ - li r0,token; \ - beq opal_real_call; \ - OPAL_BRANCH(opal_tracepoint_entry) \ +#define DO_OPAL_CALL() \ mfcr r11; \ stw r11,8(r1); \ li r11,0; \ @@ -83,6 +75,18 @@ END_FTR_SECTION(0, 1); \ mtspr SPRN_HSRR0,r12; \ hrfid +#define OPAL_CALL(name, token) \ + _GLOBAL_TOC(name); \ + mfmsr r12; \ + mflr r0; \ + andi. r11,r12,MSR_IR|MSR_DR; \ + std r0,PPC_LR_STKOFF(r1); \ + li r0,token; \ + beq opal_real_call; \ + OPAL_BRANCH(opal_tracepoint_entry) \ + DO_OPAL_CALL() + + opal_return: /* * Fixup endian on OPAL return... we should be able to simplify @@ -148,26 +152,13 @@ opal_tracepoint_entry: ld r8,STK_REG(R29)(r1) ld r9,STK_REG(R30)(r1) ld r10,STK_REG(R31)(r1) + + /* setup LR so we return via tracepoint_return */ LOAD_REG_ADDR(r11,opal_tracepoint_return) - mfcr r12 std r11,16(r1) - stw r12,8(r1) - li r11,0 + mfmsr r12 - ori r11,r11,MSR_EE - std r12,PACASAVEDMSR(r13) - andc r12,r12,r11 - mtmsrd r12,1 - LOAD_REG_ADDR(r11,opal_return) - mtlr r11 - li r11,MSR_DR|MSR_IR|MSR_LE - andc r12,r12,r11 - mtspr SPRN_HSRR1,r12 - LOAD_REG_ADDR(r11,opal) - ld r12,8(r11) - ld r2,0(r11) - mtspr SPRN_HSRR0,r12 - hrfid + DO_OPAL_CALL() opal_tracepoint_return: std r3,STK_REG(R31)(r1) @@ -316,3 +307,6 @@ OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); +OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); +OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index d0ac535cf5d7..28651fb25417 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -73,25 +73,32 @@ static int opal_xscom_err_xlate(int64_t rc) static u64 opal_scom_unmangle(u64 addr) { + u64 tmp; + /* - * XSCOM indirect addresses have the top bit set. Additionally - * the rest of the top 3 nibbles is always 0. + * XSCOM addresses use the top nibble to set indirect mode and + * its form. Bits 4-11 are always 0. * * Because the debugfs interface uses signed offsets and shifts * the address left by 3, we basically cannot use the top 4 bits * of the 64-bit address, and thus cannot use the indirect bit. * - * To deal with that, we support the indirect bit being in bit - * 4 (IBM notation) instead of bit 0 in this API, we do the - * conversion here. To leave room for further xscom address - * expansion, we only clear out the top byte + * To deal with that, we support the indirect bits being in + * bits 4-7 (IBM notation) instead of bit 0-3 in this API, we + * do the conversion here. * - * For in-kernel use, we also support the real indirect bit, so - * we test for any of the top 5 bits + * For in-kernel use, we don't need to do this mangling. In + * kernel won't have bits 4-7 set. * + * So: + * debugfs will always set 0-3 = 0 and clear 4-7 + * kernel will always clear 0-3 = 0 and set 4-7 */ - if (addr & (0x1full << 59)) - addr = (addr & ~(0xffull << 56)) | (1ull << 63); + tmp = addr; + tmp &= 0x0f00000000000000; + addr &= 0xf0ffffffffffffff; + addr |= tmp << 4; + return addr; } diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index e0f856bfbfe8..76e153fc1f93 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -435,7 +435,7 @@ int opal_machine_check(struct pt_regs *regs) evt.version); return 0; } - machine_check_print_event_info(&evt); + machine_check_print_event_info(&evt, user_mode(regs)); if (opal_recover_mce(regs, &evt)) return 1; @@ -595,6 +595,79 @@ static void opal_export_symmap(void) pr_warn("Error %d creating OPAL symbols file\n", rc); } +static ssize_t export_attr_read(struct file *fp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t off, size_t count) +{ + return memory_read_from_buffer(buf, count, &off, bin_attr->private, + bin_attr->size); +} + +/* + * opal_export_attrs: creates a sysfs node for each property listed in + * the device-tree under /ibm,opal/firmware/exports/ + * All new sysfs nodes are created under /opal/exports/. + * This allows for reserved memory regions (e.g. HDAT) to be read. + * The new sysfs nodes are only readable by root. + */ +static void opal_export_attrs(void) +{ + struct bin_attribute *attr; + struct device_node *np; + struct property *prop; + struct kobject *kobj; + u64 vals[2]; + int rc; + + np = of_find_node_by_path("/ibm,opal/firmware/exports"); + if (!np) + return; + + /* Create new 'exports' directory - /sys/firmware/opal/exports */ + kobj = kobject_create_and_add("exports", opal_kobj); + if (!kobj) { + pr_warn("kobject_create_and_add() of exports failed\n"); + return; + } + + for_each_property_of_node(np, prop) { + if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle")) + continue; + + if (of_property_read_u64_array(np, prop->name, &vals[0], 2)) + continue; + + attr = kmalloc(sizeof(*attr), GFP_KERNEL); + + if (attr == NULL) { + pr_warn("Failed kmalloc for bin_attribute!"); + continue; + } + + attr->attr.name = kstrdup(prop->name, GFP_KERNEL); + attr->attr.mode = 0400; + attr->read = export_attr_read; + attr->private = __va(vals[0]); + attr->size = vals[1]; + + if (attr->attr.name == NULL) { + pr_warn("Failed kstrdup for bin_attribute attr.name"); + kfree(attr); + continue; + } + + rc = sysfs_create_bin_file(kobj, attr); + if (rc) { + pr_warn("Error %d creating OPAL sysfs exports/%s file\n", + rc, prop->name); + kfree(attr->attr.name); + kfree(attr); + } + } + + of_node_put(np); +} + static void __init opal_dump_region_init(void) { void *addr; @@ -733,6 +806,9 @@ static int __init opal_init(void) opal_msglog_sysfs_init(); } + /* Export all properties */ + opal_export_attrs(); + /* Initialize platform devices: IPMI backend, PRD & flash interface */ opal_pdev_init("ibm,opal-ipmi"); opal_pdev_init("ibm,opal-flash"); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index e36738291c32..7eebc76721ea 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -14,7 +14,6 @@ #include <linux/kernel.h> #include <linux/pci.h> #include <linux/crash_dump.h> -#include <linux/debugfs.h> #include <linux/delay.h> #include <linux/string.h> #include <linux/init.h> @@ -38,7 +37,7 @@ #include <asm/iommu.h> #include <asm/tce.h> #include <asm/xics.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/firmware.h> #include <asm/pnv-pci.h> #include <asm/mmzone.h> @@ -1262,6 +1261,8 @@ static void pnv_pci_ioda_setup_PEs(void) /* PE#0 is needed for error reporting */ pnv_ioda_reserve_pe(phb, 0); pnv_ioda_setup_npu_PEs(hose->bus); + if (phb->model == PNV_PHB_MODEL_NPU2) + pnv_npu2_init(phb); } } } @@ -2735,9 +2736,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, if (rc) return; - if (pe->flags & PNV_IODA_PE_DEV) - iommu_add_device(&pe->pdev->dev); - else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) + if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) pnv_ioda_setup_bus_dma(pe, pe->pbus, true); } diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index eb835e977e33..a43f22dc069e 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -758,7 +758,7 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages) unsigned long pnv_tce_get(struct iommu_table *tbl, long index) { - return *(pnv_tce(tbl, index - tbl->it_offset)); + return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset))); } struct iommu_table *pnv_pci_table_alloc(int nid) diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index e1d3e5526b54..4eab713136d1 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -7,6 +7,9 @@ struct pci_dn; +/* Maximum possible number of ATSD MMIO registers per NPU */ +#define NV_NMMU_ATSD_REGS 8 + enum pnv_phb_type { PNV_PHB_IODA1 = 0, PNV_PHB_IODA2 = 1, @@ -174,6 +177,16 @@ struct pnv_phb { struct OpalIoP7IOCErrorData hub_diag; } diag; + /* Nvlink2 data */ + struct npu { + int index; + __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS]; + unsigned int mmio_atsd_count; + + /* Bitmask for MMIO register usage */ + unsigned long mmio_atsd_usage; + } npu; + #ifdef CONFIG_CXL_BASE struct cxl_afu *cxl_afu; #endif @@ -236,7 +249,7 @@ extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num); extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe); extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe); - +extern int pnv_npu2_init(struct pnv_phb *phb); /* cxl functions */ extern bool pnv_cxl_enable_device_hook(struct pci_dev *dev); diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index 613052232475..6dbc0a1da1f6 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -18,8 +18,6 @@ static inline void pnv_pci_shutdown(void) { } #endif extern u32 pnv_get_supported_cpuidle_states(void); -extern u64 pnv_deepest_stop_psscr_val; -extern u64 pnv_deepest_stop_psscr_mask; extern void pnv_lpc_init(void); diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index adceac978d18..2dc7e5fb86c3 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -98,6 +98,10 @@ static void pnv_show_cpuinfo(struct seq_file *m) else seq_printf(m, "firmware\t: BML\n"); of_node_put(root); + if (radix_enabled()) + seq_printf(m, "MMU\t\t: Radix\n"); + else + seq_printf(m, "MMU\t\t: Hash\n"); } static void pnv_prepare_going_down(void) diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index f57195588c6c..39296bf7009e 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -36,6 +36,7 @@ #include <asm/dbell.h> #include <asm/kvm_ppc.h> #include <asm/ppc-opcode.h> +#include <asm/cpuidle.h> #include "powernv.h" @@ -146,7 +147,6 @@ static void pnv_smp_cpu_kill_self(void) { unsigned int cpu; unsigned long srr1, wmask; - u32 idle_states; /* Standard hot unplug procedure */ local_irq_disable(); @@ -161,8 +161,6 @@ static void pnv_smp_cpu_kill_self(void) if (cpu_has_feature(CPU_FTR_ARCH_207S)) wmask = SRR1_WAKEMASK_P8; - idle_states = pnv_get_supported_cpuidle_states(); - /* We don't want to take decrementer interrupts while we are offline, * so clear LPCR:PECE1. We keep PECE2 (and LPCR_PECE_HVEE on P9) * enabled as to let IPIs in. @@ -190,19 +188,7 @@ static void pnv_smp_cpu_kill_self(void) kvmppc_set_host_ipi(cpu, 0); ppc64_runlatch_off(); - - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, - pnv_deepest_stop_psscr_mask); - } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { - srr1 = power7_winkle(); - } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || - (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - srr1 = power7_sleep(); - } else { - srr1 = power7_nap(1); - } - + srr1 = pnv_cpu_offline(cpu); ppc64_runlatch_on(); /* diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 6b04e3f0f982..18014cdeb590 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -21,13 +21,12 @@ */ #include <linux/slab.h> -#include <linux/debugfs.h> #include <linux/spinlock.h> #include <asm/smp.h> #include <linux/uaccess.h> #include <asm/firmware.h> #include <asm/lppaca.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/plpar_wrappers.h> #include <asm/machdep.h> diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c index f02ec3ab428c..957ae347b0b3 100644 --- a/arch/powerpc/platforms/pseries/hvCall_inst.c +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -29,6 +29,16 @@ #include <asm/trace.h> #include <asm/machdep.h> +/* For hcall instrumentation. One structure per-hcall, per-CPU */ +struct hcall_stats { + unsigned long num_calls; /* number of calls (on this CPU) */ + unsigned long tb_total; /* total wall time (mftb) of calls. */ + unsigned long purr_total; /* total cpu time (PURR) of calls. */ + unsigned long tb_start; + unsigned long purr_start; +}; +#define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1) + DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats); /* diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 8b1fe895daa3..6541d0b03e4c 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -958,3 +958,64 @@ int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data) return rc; } + +static unsigned long vsid_unscramble(unsigned long vsid, int ssize) +{ + unsigned long protovsid; + unsigned long va_bits = VA_BITS; + unsigned long modinv, vsid_modulus; + unsigned long max_mod_inv, tmp_modinv; + + if (!mmu_has_feature(MMU_FTR_68_BIT_VA)) + va_bits = 65; + + if (ssize == MMU_SEGSIZE_256M) { + modinv = VSID_MULINV_256M; + vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1); + } else { + modinv = VSID_MULINV_1T; + vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1); + } + + /* + * vsid outside our range. + */ + if (vsid >= vsid_modulus) + return 0; + + /* + * If modinv is the modular multiplicate inverse of (x % vsid_modulus) + * and vsid = (protovsid * x) % vsid_modulus, then we say: + * protovsid = (vsid * modinv) % vsid_modulus + */ + + /* Check if (vsid * modinv) overflow (63 bits) */ + max_mod_inv = 0x7fffffffffffffffull / vsid; + if (modinv < max_mod_inv) + return (vsid * modinv) % vsid_modulus; + + tmp_modinv = modinv/max_mod_inv; + modinv %= max_mod_inv; + + protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus; + protovsid = (protovsid + vsid * modinv) % vsid_modulus; + + return protovsid; +} + +static int __init reserve_vrma_context_id(void) +{ + unsigned long protovsid; + + /* + * Reserve context ids which map to reserved virtual addresses. For now + * we only reserve the context id which maps to the VRMA VSID. We ignore + * the addresses in "ibm,adjunct-virtual-addresses" because we don't + * enable adjunct support via the "ibm,client-architecture-support" + * interface. + */ + protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T); + hash__reserve_context_id(protovsid >> ESID_BITS_1T); + return 0; +} +machine_device_initcall(pseries, reserve_vrma_context_id); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index b4d362ed03a1..b5d86426e97b 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -87,6 +87,10 @@ static void pSeries_show_cpuinfo(struct seq_file *m) model = of_get_property(root, "model", NULL); seq_printf(m, "machine\t\t: CHRP %s\n", model); of_node_put(root); + if (radix_enabled()) + seq_printf(m, "MMU\t\t: Radix\n"); + else + seq_printf(m, "MMU\t\t: Hash\n"); } /* Initialize firmware assisted non-maskable interrupts if diff --git a/arch/powerpc/sysdev/scom.c b/arch/powerpc/sysdev/scom.c index d0e9f178a324..76ea32c1b664 100644 --- a/arch/powerpc/sysdev/scom.c +++ b/arch/powerpc/sysdev/scom.c @@ -19,10 +19,9 @@ */ #include <linux/kernel.h> -#include <linux/debugfs.h> #include <linux/slab.h> #include <linux/export.h> -#include <asm/debug.h> +#include <asm/debugfs.h> #include <asm/prom.h> #include <asm/scom.h> #include <linux/uaccess.h> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 67435b9bf98d..f77a104abf9f 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -29,6 +29,7 @@ #include <linux/nmi.h> #include <linux/ctype.h> +#include <asm/debugfs.h> #include <asm/ptrace.h> #include <asm/smp.h> #include <asm/string.h> @@ -77,6 +78,7 @@ static int xmon_gate; #endif /* CONFIG_SMP */ static unsigned long in_xmon __read_mostly = 0; +static int xmon_on = IS_ENABLED(CONFIG_XMON_DEFAULT); static unsigned long adrs; static int size = 1; @@ -185,8 +187,6 @@ static void dump_tlb_44x(void); static void dump_tlb_book3e(void); #endif -static int xmon_no_auto_backtrace; - #ifdef CONFIG_PPC64 #define REG "%.16lx" #else @@ -891,10 +891,7 @@ cmds(struct pt_regs *excp) last_cmd = NULL; xmon_regs = excp; - if (!xmon_no_auto_backtrace) { - xmon_no_auto_backtrace = 1; - xmon_show_stack(excp->gpr[1], excp->link, excp->nip); - } + xmon_show_stack(excp->gpr[1], excp->link, excp->nip); for(;;) { #ifdef CONFIG_SMP @@ -3392,6 +3389,8 @@ static void sysrq_handle_xmon(int key) /* ensure xmon is enabled */ xmon_init(1); debugger(get_irq_regs()); + if (!xmon_on) + xmon_init(0); } static struct sysrq_key_op sysrq_xmon_op = { @@ -3405,10 +3404,37 @@ static int __init setup_xmon_sysrq(void) register_sysrq_key('x', &sysrq_xmon_op); return 0; } -__initcall(setup_xmon_sysrq); +device_initcall(setup_xmon_sysrq); #endif /* CONFIG_MAGIC_SYSRQ */ -static int __initdata xmon_early, xmon_off; +#ifdef CONFIG_DEBUG_FS +static int xmon_dbgfs_set(void *data, u64 val) +{ + xmon_on = !!val; + xmon_init(xmon_on); + + return 0; +} + +static int xmon_dbgfs_get(void *data, u64 *val) +{ + *val = xmon_on; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(xmon_dbgfs_ops, xmon_dbgfs_get, + xmon_dbgfs_set, "%llu\n"); + +static int __init setup_xmon_dbgfs(void) +{ + debugfs_create_file("xmon", 0600, powerpc_debugfs_root, NULL, + &xmon_dbgfs_ops); + return 0; +} +device_initcall(setup_xmon_dbgfs); +#endif /* CONFIG_DEBUG_FS */ + +static int xmon_early __initdata; static int __init early_parse_xmon(char *p) { @@ -3416,12 +3442,12 @@ static int __init early_parse_xmon(char *p) /* just "xmon" is equivalent to "xmon=early" */ xmon_init(1); xmon_early = 1; - } else if (strncmp(p, "on", 2) == 0) + xmon_on = 1; + } else if (strncmp(p, "on", 2) == 0) { xmon_init(1); - else if (strncmp(p, "off", 3) == 0) - xmon_off = 1; - else if (strncmp(p, "nobt", 4) == 0) - xmon_no_auto_backtrace = 1; + xmon_on = 1; + } else if (strncmp(p, "off", 3) == 0) + xmon_on = 0; else return 1; @@ -3431,10 +3457,8 @@ early_param("xmon", early_parse_xmon); void __init xmon_setup(void) { -#ifdef CONFIG_XMON_DEFAULT - if (!xmon_off) + if (xmon_on) xmon_init(1); -#endif if (xmon_early) debugger(NULL); } |