From 8119cefd9a29b71997e62b762932d23499ba4896 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Wed, 14 Jul 2021 18:17:58 +0530 Subject: powerpc/kexec: blacklist functions called in real mode for kprobe As kprobe does not handle events happening in real mode, blacklist the functions that only get called in real mode or in kexec sequence with MMU turned off. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/162626687834.155313.4692863392927831843.stgit@hbathini-workstation.ibm.com --- arch/powerpc/kernel/head_64.S | 2 ++ arch/powerpc/kexec/core_64.c | 6 ++++-- arch/powerpc/mm/book3s64/hash_native.c | 2 +- arch/powerpc/mm/book3s64/pgtable.c | 4 ++-- arch/powerpc/mm/book3s64/radix_pgtable.c | 3 ++- arch/powerpc/platforms/ps3/htab.c | 3 ++- arch/powerpc/platforms/ps3/mm.c | 8 ++++++-- arch/powerpc/platforms/pseries/lpar.c | 9 ++++++--- 8 files changed, 25 insertions(+), 12 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 79930b0bc781..f17ae2083733 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -712,6 +712,8 @@ _GLOBAL(copy_and_flush) isync blr +_ASM_NOKPROBE_SYMBOL(copy_and_flush); /* Called in real mode */ + .align 8 copy_to_here: diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 8a449b2d8715..84618d3c8013 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -72,7 +72,8 @@ int default_machine_kexec_prepare(struct kimage *image) return 0; } -static void copy_segments(unsigned long ind) +/* Called during kexec sequence with MMU off */ +static notrace void copy_segments(unsigned long ind) { unsigned long entry; unsigned long *ptr; @@ -105,7 +106,8 @@ static void copy_segments(unsigned long ind) } } -void kexec_copy_flush(struct kimage *image) +/* Called during kexec sequence with MMU off */ +notrace void kexec_copy_flush(struct kimage *image) { long i, nr_segments = image->nr_segments; struct kexec_segment ranges[KEXEC_SEGMENT_MAX]; diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 52e170bd95ae..d8279bfe68ea 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -787,7 +787,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, * TODO: add batching support when enabled. remember, no dynamic memory here, * although there is the control page available... */ -static void native_hpte_clear(void) +static notrace void native_hpte_clear(void) { unsigned long vpn = 0; unsigned long slot, slots; diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 9ffa65074cb0..300099de553b 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -172,8 +172,8 @@ pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/* For use by kexec */ -void mmu_cleanup_all(void) +/* For use by kexec, called with MMU off */ +notrace void mmu_cleanup_all(void) { if (radix_enabled()) radix__mmu_cleanup_all(); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index e50ddf129c15..ae20add7954a 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -679,7 +679,8 @@ void radix__early_init_mmu_secondary(void) mtspr(SPRN_UAMOR, 0); } -void radix__mmu_cleanup_all(void) +/* Called during kexec sequence with MMU off */ +notrace void radix__mmu_cleanup_all(void) { unsigned long lpcr; diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c index 7ddc7ec6a7c0..ef710a715903 100644 --- a/arch/powerpc/platforms/ps3/htab.c +++ b/arch/powerpc/platforms/ps3/htab.c @@ -169,7 +169,8 @@ static void ps3_hpte_invalidate(unsigned long slot, unsigned long vpn, spin_unlock_irqrestore(&ps3_htab_lock, flags); } -static void ps3_hpte_clear(void) +/* Called during kexec sequence with MMU off */ +static notrace void ps3_hpte_clear(void) { unsigned long hpte_count = (1UL << ppc64_pft_size) >> 4; u64 i; diff --git a/arch/powerpc/platforms/ps3/mm.c b/arch/powerpc/platforms/ps3/mm.c index a81eac35d900..9c44f335c0b9 100644 --- a/arch/powerpc/platforms/ps3/mm.c +++ b/arch/powerpc/platforms/ps3/mm.c @@ -195,9 +195,11 @@ fail: /** * ps3_mm_vas_destroy - + * + * called during kexec sequence with MMU off. */ -void ps3_mm_vas_destroy(void) +notrace void ps3_mm_vas_destroy(void) { int result; @@ -1243,9 +1245,11 @@ void __init ps3_mm_init(void) /** * ps3_mm_shutdown - final cleanup of address space + * + * called during kexec sequence with MMU off. */ -void ps3_mm_shutdown(void) +notrace void ps3_mm_shutdown(void) { ps3_mm_region_destroy(&map.r1); } diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index dab356e3ff87..869ef638698a 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -801,7 +801,8 @@ static long pSeries_lpar_hpte_remove(unsigned long hpte_group) return -1; } -static void manual_hpte_clear_all(void) +/* Called during kexec sequence with MMU off */ +static notrace void manual_hpte_clear_all(void) { unsigned long size_bytes = 1UL << ppc64_pft_size; unsigned long hpte_count = size_bytes >> 4; @@ -834,7 +835,8 @@ static void manual_hpte_clear_all(void) } } -static int hcall_hpte_clear_all(void) +/* Called during kexec sequence with MMU off */ +static notrace int hcall_hpte_clear_all(void) { int rc; @@ -845,7 +847,8 @@ static int hcall_hpte_clear_all(void) return rc; } -static void pseries_hpte_clear_all(void) +/* Called during kexec sequence with MMU off */ +static notrace void pseries_hpte_clear_all(void) { int rc; -- cgit v1.2.3-59-g8ed1b From a6cae77f1bc89368a4e2822afcddc45c3062d499 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Thu, 29 Jul 2021 20:01:03 +0200 Subject: powerpc/stacktrace: Include linux/delay.h commit 7c6986ade69e ("powerpc/stacktrace: Fix spurious "stale" traces in raise_backtrace_ipi()") introduces udelay() call without including the linux/delay.h header. This may happen to work on master but the header that declares the functionshould be included nonetheless. Fixes: 7c6986ade69e ("powerpc/stacktrace: Fix spurious "stale" traces in raise_backtrace_ipi()") Signed-off-by: Michal Suchanek Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210729180103.15578-1-msuchanek@suse.de --- arch/powerpc/kernel/stacktrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 2b0d04a1b7d2..9e4a4a7af380 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -8,6 +8,7 @@ * Copyright 2018 Nick Piggin, Michael Ellerman, IBM Corp. */ +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 156ca4e650bfb9a4259b427069caa11b5a4df3d4 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 29 Jul 2021 23:19:35 +0900 Subject: powerpc: remove unused zInstall target from arch/powerpc/boot/Makefile Commit c913e5f95e54 ("powerpc/boot: Don't install zImage.* from make install") added the zInstall target to arch/powerpc/boot/Makefile, but you cannot use it since the corresponding hook is missing in arch/powerpc/Makefile. It has never worked since its addition. Nobody has complained about it for 7 years, which means this code was unneeded. With this removal, the install.sh will be passed in with 4 parameters. Simplify the shell script. Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210729141937.445051-1-masahiroy@kernel.org --- arch/powerpc/boot/Makefile | 6 +----- arch/powerpc/boot/install.sh | 13 ------------- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index e312ea802aa6..a702f9d1ec0d 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -448,11 +448,7 @@ $(obj)/zImage.initrd: $(addprefix $(obj)/, $(initrd-y)) install: $(CONFIGURE) $(addprefix $(obj)/, $(image-y)) sh -x $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)" -# Install the vmlinux and other built boot targets. -zInstall: $(CONFIGURE) $(addprefix $(obj)/, $(image-y)) - sh -x $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)" $^ - -PHONY += install zInstall +PHONY += install # anything not in $(targets) clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* treeImage.* \ diff --git a/arch/powerpc/boot/install.sh b/arch/powerpc/boot/install.sh index b6a256bc96ee..658c93ca7437 100644 --- a/arch/powerpc/boot/install.sh +++ b/arch/powerpc/boot/install.sh @@ -15,7 +15,6 @@ # $2 - kernel image file # $3 - kernel map file # $4 - default install path (blank if root directory) -# $5 and more - kernel boot files; zImage*, uImage, cuImage.*, etc. # # Bail with error code if anything goes wrong @@ -41,15 +40,3 @@ fi cat $2 > $4/$image_name cp $3 $4/System.map - -# Copy all the bootable image files -path=$4 -shift 4 -while [ $# -ne 0 ]; do - image_name=`basename $1` - if [ -f $path/$image_name ]; then - mv $path/$image_name $path/$image_name.old - fi - cat $1 > $path/$image_name - shift -done; -- cgit v1.2.3-59-g8ed1b From 9bef456b20581e630ef9a13555ca04fed65a859d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 29 Jul 2021 23:19:36 +0900 Subject: powerpc: make the install target not depend on any build artifact The install target should not depend on any build artifact. The reason is explained in commit 19514fc665ff ("arm, kbuild: make "make install" not depend on vmlinux"). Change the PowerPC installation code in a similar way. Signed-off-by: Masahiro Yamada Reviewed-by: Nick Desaulniers Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210729141937.445051-2-masahiroy@kernel.org --- arch/powerpc/boot/Makefile | 2 +- arch/powerpc/boot/install.sh | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index a702f9d1ec0d..0d165bd98b61 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -445,7 +445,7 @@ $(obj)/zImage.initrd: $(addprefix $(obj)/, $(initrd-y)) $(Q)rm -f $@; ln $< $@ # Only install the vmlinux -install: $(CONFIGURE) $(addprefix $(obj)/, $(image-y)) +install: sh -x $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)" PHONY += install diff --git a/arch/powerpc/boot/install.sh b/arch/powerpc/boot/install.sh index 658c93ca7437..14473150ddb4 100644 --- a/arch/powerpc/boot/install.sh +++ b/arch/powerpc/boot/install.sh @@ -20,6 +20,20 @@ # Bail with error code if anything goes wrong set -e +verify () { + if [ ! -f "$1" ]; then + echo "" 1>&2 + echo " *** Missing file: $1" 1>&2 + echo ' *** You need to run "make" before "make install".' 1>&2 + echo "" 1>&2 + exit 1 + fi +} + +# Make sure the files actually exist +verify "$2" +verify "$3" + # User may have a custom install script if [ -x ~/bin/${INSTALLKERNEL} ]; then exec ~/bin/${INSTALLKERNEL} "$@"; fi -- cgit v1.2.3-59-g8ed1b From 86ff0bce2e9665c8b074930fe6caed615da070c1 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 29 Jul 2021 23:19:37 +0900 Subject: powerpc: move the install rule to arch/powerpc/Makefile Currently, the install target in arch/powerpc/Makefile descends into arch/powerpc/boot/Makefile to invoke the shell script, but there is no good reason to do so. arch/powerpc/Makefile can run the shell script directly. Signed-off-by: Masahiro Yamada Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210729141937.445051-3-masahiroy@kernel.org --- arch/powerpc/Makefile | 3 ++- arch/powerpc/boot/Makefile | 6 ------ 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 6505d66f1193..9aaf1abbc641 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -407,7 +407,8 @@ endef PHONY += install install: - $(Q)$(MAKE) $(build)=$(boot) install + sh -x $(srctree)/$(boot)/install.sh "$(KERNELRELEASE)" vmlinux \ + System.map "$(INSTALL_PATH)" archclean: $(Q)$(MAKE) $(clean)=$(boot) diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 0d165bd98b61..10c0fb306f15 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -444,12 +444,6 @@ $(obj)/zImage: $(addprefix $(obj)/, $(image-y)) $(obj)/zImage.initrd: $(addprefix $(obj)/, $(initrd-y)) $(Q)rm -f $@; ln $< $@ -# Only install the vmlinux -install: - sh -x $(srctree)/$(src)/install.sh "$(KERNELRELEASE)" vmlinux System.map "$(INSTALL_PATH)" - -PHONY += install - # anything not in $(targets) clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* treeImage.* \ zImage zImage.initrd zImage.chrp zImage.coff zImage.holly \ -- cgit v1.2.3-59-g8ed1b From a4bec516b9c0823d7e2bb8c8928c98b535cf9adf Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Wed, 28 Jul 2021 23:26:05 +0530 Subject: powerpc/cacheinfo: Lookup cache by dt node and thread-group id Currently the cacheinfo code on powerpc indexes the "cache" objects (modelling the L1/L2/L3 caches) where the key is device-tree node corresponding to that cache. On some of the POWER server platforms thread-groups within the core share different sets of caches (Eg: On SMT8 POWER9 systems, threads 0,2,4,6 of a core share L1 cache and threads 1,3,5,7 of the same core share another L1 cache). On such platforms, there is a single device-tree node corresponding to that cache and the cache-configuration within the threads of the core is indicated via "ibm,thread-groups" device-tree property. Since the current code is not aware of the "ibm,thread-groups" property, on the aforementoined systems, cacheinfo code still treats all the threads in the core to be sharing the cache because of the single device-tree node (In the earlier example, the cacheinfo code would says CPUs 0-7 share L1 cache). In this patch, we make the powerpc cacheinfo code aware of the "ibm,thread-groups" property. We indexe the "cache" objects by the key-pair (device-tree node, thread-group id). For any CPUX, for a given level of cache, the thread-group id is defined to be the first CPU in the "ibm,thread-groups" cache-group containing CPUX. For levels of cache which are not represented in "ibm,thread-groups" property, the thread-group id is -1. [parth: Remove "static" keyword for the definition of "thread_group_l1_cache_map" and "thread_group_l2_cache_map" to get rid of the compile error.] Signed-off-by: Gautham R. Shenoy Signed-off-by: Parth Shah Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210728175607.591679-2-parth@linux.ibm.com --- arch/powerpc/include/asm/smp.h | 3 ++ arch/powerpc/kernel/cacheinfo.c | 80 +++++++++++++++++++++++++++++------------ arch/powerpc/kernel/smp.c | 4 +-- 3 files changed, 63 insertions(+), 24 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 03b3d010cbab..1259040cc3a4 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -33,6 +33,9 @@ extern bool coregroup_enabled; extern int cpu_to_chip_id(int cpu); extern int *chip_id_lookup_table; +DECLARE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); +DECLARE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); + #ifdef CONFIG_SMP struct smp_ops_t { diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index 6f903e9aa20b..5a6925d87424 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -120,6 +120,7 @@ struct cache { struct cpumask shared_cpu_map; /* online CPUs using this cache */ int type; /* split cache disambiguation */ int level; /* level not explicit in device tree */ + int group_id; /* id of the group of threads that share this cache */ struct list_head list; /* global list of cache objects */ struct cache *next_local; /* next cache of >= level */ }; @@ -142,22 +143,24 @@ static const char *cache_type_string(const struct cache *cache) } static void cache_init(struct cache *cache, int type, int level, - struct device_node *ofnode) + struct device_node *ofnode, int group_id) { cache->type = type; cache->level = level; cache->ofnode = of_node_get(ofnode); + cache->group_id = group_id; INIT_LIST_HEAD(&cache->list); list_add(&cache->list, &cache_list); } -static struct cache *new_cache(int type, int level, struct device_node *ofnode) +static struct cache *new_cache(int type, int level, + struct device_node *ofnode, int group_id) { struct cache *cache; cache = kzalloc(sizeof(*cache), GFP_KERNEL); if (cache) - cache_init(cache, type, level, ofnode); + cache_init(cache, type, level, ofnode, group_id); return cache; } @@ -309,20 +312,24 @@ static struct cache *cache_find_first_sibling(struct cache *cache) return cache; list_for_each_entry(iter, &cache_list, list) - if (iter->ofnode == cache->ofnode && iter->next_local == cache) + if (iter->ofnode == cache->ofnode && + iter->group_id == cache->group_id && + iter->next_local == cache) return iter; return cache; } -/* return the first cache on a local list matching node */ -static struct cache *cache_lookup_by_node(const struct device_node *node) +/* return the first cache on a local list matching node and thread-group id */ +static struct cache *cache_lookup_by_node_group(const struct device_node *node, + int group_id) { struct cache *cache = NULL; struct cache *iter; list_for_each_entry(iter, &cache_list, list) { - if (iter->ofnode != node) + if (iter->ofnode != node || + iter->group_id != group_id) continue; cache = cache_find_first_sibling(iter); break; @@ -352,14 +359,15 @@ static int cache_is_unified_d(const struct device_node *np) CACHE_TYPE_UNIFIED_D : CACHE_TYPE_UNIFIED; } -static struct cache *cache_do_one_devnode_unified(struct device_node *node, int level) +static struct cache *cache_do_one_devnode_unified(struct device_node *node, int group_id, + int level) { pr_debug("creating L%d ucache for %pOFP\n", level, node); - return new_cache(cache_is_unified_d(node), level, node); + return new_cache(cache_is_unified_d(node), level, node, group_id); } -static struct cache *cache_do_one_devnode_split(struct device_node *node, +static struct cache *cache_do_one_devnode_split(struct device_node *node, int group_id, int level) { struct cache *dcache, *icache; @@ -367,8 +375,8 @@ static struct cache *cache_do_one_devnode_split(struct device_node *node, pr_debug("creating L%d dcache and icache for %pOFP\n", level, node); - dcache = new_cache(CACHE_TYPE_DATA, level, node); - icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node); + dcache = new_cache(CACHE_TYPE_DATA, level, node, group_id); + icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node, group_id); if (!dcache || !icache) goto err; @@ -382,31 +390,32 @@ err: return NULL; } -static struct cache *cache_do_one_devnode(struct device_node *node, int level) +static struct cache *cache_do_one_devnode(struct device_node *node, int group_id, int level) { struct cache *cache; if (cache_node_is_unified(node)) - cache = cache_do_one_devnode_unified(node, level); + cache = cache_do_one_devnode_unified(node, group_id, level); else - cache = cache_do_one_devnode_split(node, level); + cache = cache_do_one_devnode_split(node, group_id, level); return cache; } static struct cache *cache_lookup_or_instantiate(struct device_node *node, + int group_id, int level) { struct cache *cache; - cache = cache_lookup_by_node(node); + cache = cache_lookup_by_node_group(node, group_id); WARN_ONCE(cache && cache->level != level, "cache level mismatch on lookup (got %d, expected %d)\n", cache->level, level); if (!cache) - cache = cache_do_one_devnode(node, level); + cache = cache_do_one_devnode(node, group_id, level); return cache; } @@ -443,7 +452,27 @@ static void do_subsidiary_caches_debugcheck(struct cache *cache) of_node_get_device_type(cache->ofnode)); } -static void do_subsidiary_caches(struct cache *cache) +/* + * If sub-groups of threads in a core containing @cpu_id share the + * L@level-cache (information obtained via "ibm,thread-groups" + * device-tree property), then we identify the group by the first + * thread-sibling in the group. We define this to be the group-id. + * + * In the absence of any thread-group information for L@level-cache, + * this function returns -1. + */ +static int get_group_id(unsigned int cpu_id, int level) +{ + if (has_big_cores && level == 1) + return cpumask_first(per_cpu(thread_group_l1_cache_map, + cpu_id)); + else if (thread_group_shares_l2 && level == 2) + return cpumask_first(per_cpu(thread_group_l2_cache_map, + cpu_id)); + return -1; +} + +static void do_subsidiary_caches(struct cache *cache, unsigned int cpu_id) { struct device_node *subcache_node; int level = cache->level; @@ -452,9 +481,11 @@ static void do_subsidiary_caches(struct cache *cache) while ((subcache_node = of_find_next_cache_node(cache->ofnode))) { struct cache *subcache; + int group_id; level++; - subcache = cache_lookup_or_instantiate(subcache_node, level); + group_id = get_group_id(cpu_id, level); + subcache = cache_lookup_or_instantiate(subcache_node, group_id, level); of_node_put(subcache_node); if (!subcache) break; @@ -468,6 +499,7 @@ static struct cache *cache_chain_instantiate(unsigned int cpu_id) { struct device_node *cpu_node; struct cache *cpu_cache = NULL; + int group_id; pr_debug("creating cache object(s) for CPU %i\n", cpu_id); @@ -476,11 +508,13 @@ static struct cache *cache_chain_instantiate(unsigned int cpu_id) if (!cpu_node) goto out; - cpu_cache = cache_lookup_or_instantiate(cpu_node, 1); + group_id = get_group_id(cpu_id, 1); + + cpu_cache = cache_lookup_or_instantiate(cpu_node, group_id, 1); if (!cpu_cache) goto out; - do_subsidiary_caches(cpu_cache); + do_subsidiary_caches(cpu_cache, cpu_id); cache_cpu_set(cpu_cache, cpu_id); out: @@ -848,13 +882,15 @@ static struct cache *cache_lookup_by_cpu(unsigned int cpu_id) { struct device_node *cpu_node; struct cache *cache; + int group_id; cpu_node = of_get_cpu_node(cpu_id, NULL); WARN_ONCE(!cpu_node, "no OF node found for CPU %i\n", cpu_id); if (!cpu_node) return NULL; - cache = cache_lookup_by_node(cpu_node); + group_id = get_group_id(cpu_id, 1); + cache = cache_lookup_by_node_group(cpu_node, group_id); of_node_put(cpu_node); return cache; diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 447b78a87c8f..a7fcac44a8e2 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -122,14 +122,14 @@ static struct thread_groups_list tgl[NR_CPUS] __initdata; * On big-cores system, thread_group_l1_cache_map for each CPU corresponds to * the set its siblings that share the L1-cache. */ -static DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); +DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); /* * On some big-cores system, thread_group_l2_cache_map for each CPU * corresponds to the set its siblings within the core that share the * L2-cache. */ -static DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); +DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); /* SMP operations for this machine */ struct smp_ops_t *smp_ops; -- cgit v1.2.3-59-g8ed1b From 69aa8e078545bc14d84a8b4b3cb914ac8f9f280e Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Wed, 28 Jul 2021 23:26:06 +0530 Subject: powerpc/cacheinfo: Remove the redundant get_shared_cpu_map() The helper function get_shared_cpu_map() was added in 'commit 500fe5f550ec ("powerpc/cacheinfo: Report the correct shared_cpu_map on big-cores")' and subsequently expanded upon in 'commit 0be47634db0b ("powerpc/cacheinfo: Print correct cache-sibling map/list for L2 cache")' in order to help report the correct groups of threads sharing these caches on big-core systems where groups of threads within a core can share different sets of caches. Now that powerpc/cacheinfo is aware of "ibm,thread-groups" property, cache->shared_cpu_map contains the correct set of thread-siblings sharing the cache. Hence we no longer need the functions get_shared_cpu_map(). This patch removes this function. We also remove the helper function index_dir_to_cpu() which was only called by get_shared_cpu_map(). With these functions removed, we can still see the correct cache-sibling map/list for L1 and L2 caches on systems with L1 and L2 caches distributed among groups of threads in a core. With this patch, on a SMT8 POWER10 system where the L1 and L2 caches are split between the two groups of threads in a core, for CPUs 8,9, the L1-Data, L1-Instruction, L2, L3 cache CPU sibling list is as follows: $ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list /sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,10,12,14 /sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,10,12,14 /sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,10,12,14 /sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-15 /sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9,11,13,15 /sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9,11,13,15 /sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9,11,13,15 /sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-15 $ ppc64_cpu --smt=4 $ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list /sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8,10 /sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8,10 /sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8,10 /sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-11 /sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9,11 /sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9,11 /sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9,11 /sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-11 $ ppc64_cpu --smt=2 $ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list /sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8 /sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8 /sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8 /sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8-9 /sys/devices/system/cpu/cpu9/cache/index0/shared_cpu_list:9 /sys/devices/system/cpu/cpu9/cache/index1/shared_cpu_list:9 /sys/devices/system/cpu/cpu9/cache/index2/shared_cpu_list:9 /sys/devices/system/cpu/cpu9/cache/index3/shared_cpu_list:8-9 $ ppc64_cpu --smt=1 $ grep . /sys/devices/system/cpu/cpu[89]/cache/index[0123]/shared_cpu_list /sys/devices/system/cpu/cpu8/cache/index0/shared_cpu_list:8 /sys/devices/system/cpu/cpu8/cache/index1/shared_cpu_list:8 /sys/devices/system/cpu/cpu8/cache/index2/shared_cpu_list:8 /sys/devices/system/cpu/cpu8/cache/index3/shared_cpu_list:8 Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210728175607.591679-3-parth@linux.ibm.com --- arch/powerpc/kernel/cacheinfo.c | 41 +---------------------------------------- 1 file changed, 1 insertion(+), 40 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index 5a6925d87424..20d91693eac1 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -675,45 +675,6 @@ static ssize_t level_show(struct kobject *k, struct kobj_attribute *attr, char * static struct kobj_attribute cache_level_attr = __ATTR(level, 0444, level_show, NULL); -static unsigned int index_dir_to_cpu(struct cache_index_dir *index) -{ - struct kobject *index_dir_kobj = &index->kobj; - struct kobject *cache_dir_kobj = index_dir_kobj->parent; - struct kobject *cpu_dev_kobj = cache_dir_kobj->parent; - struct device *dev = kobj_to_dev(cpu_dev_kobj); - - return dev->id; -} - -/* - * On big-core systems, each core has two groups of CPUs each of which - * has its own L1-cache. The thread-siblings which share l1-cache with - * @cpu can be obtained via cpu_smallcore_mask(). - * - * On some big-core systems, the L2 cache is shared only between some - * groups of siblings. This is already parsed and encoded in - * cpu_l2_cache_mask(). - * - * TODO: cache_lookup_or_instantiate() needs to be made aware of the - * "ibm,thread-groups" property so that cache->shared_cpu_map - * reflects the correct siblings on platforms that have this - * device-tree property. This helper function is only a stop-gap - * solution so that we report the correct siblings to the - * userspace via sysfs. - */ -static const struct cpumask *get_shared_cpu_map(struct cache_index_dir *index, struct cache *cache) -{ - if (has_big_cores) { - int cpu = index_dir_to_cpu(index); - if (cache->level == 1) - return cpu_smallcore_mask(cpu); - if (cache->level == 2 && thread_group_shares_l2) - return cpu_l2_cache_mask(cpu); - } - - return &cache->shared_cpu_map; -} - static ssize_t show_shared_cpumap(struct kobject *k, struct kobj_attribute *attr, char *buf, bool list) { @@ -724,7 +685,7 @@ show_shared_cpumap(struct kobject *k, struct kobj_attribute *attr, char *buf, bo index = kobj_to_cache_index_dir(k); cache = index->cache; - mask = get_shared_cpu_map(index, cache); + mask = &cache->shared_cpu_map; return cpumap_print_to_pagebuf(list, buf, mask); } -- cgit v1.2.3-59-g8ed1b From e9ef81e1079b0c4c374fba0f9affa7129c7c913b Mon Sep 17 00:00:00 2001 From: Parth Shah Date: Wed, 28 Jul 2021 23:26:07 +0530 Subject: powerpc/smp: Use existing L2 cache_map cpumask to find L3 cache siblings On POWER10 systems, the "ibm,thread-groups" property "2" indicates the cpus in thread-group share both L2 and L3 caches. Hence, use cache_property = 2 itself to find both the L2 and L3 cache siblings. Hence, create a new thread_group_l3_cache_map to keep list of L3 siblings, but fill the mask using same property "2" array. Signed-off-by: Parth Shah Reviewed-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210728175607.591679-4-parth@linux.ibm.com --- arch/powerpc/include/asm/smp.h | 3 ++ arch/powerpc/kernel/cacheinfo.c | 3 ++ arch/powerpc/kernel/smp.c | 66 ++++++++++++++++++++++++++++------------- 3 files changed, 51 insertions(+), 21 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index 1259040cc3a4..7ef1cd8168a0 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -35,6 +35,7 @@ extern int *chip_id_lookup_table; DECLARE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); DECLARE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); +DECLARE_PER_CPU(cpumask_var_t, thread_group_l3_cache_map); #ifdef CONFIG_SMP @@ -144,6 +145,7 @@ extern int cpu_to_core_id(int cpu); extern bool has_big_cores; extern bool thread_group_shares_l2; +extern bool thread_group_shares_l3; #define cpu_smt_mask cpu_smt_mask #ifdef CONFIG_SCHED_SMT @@ -198,6 +200,7 @@ extern void __cpu_die(unsigned int cpu); #define hard_smp_processor_id() get_hard_smp_processor_id(0) #define smp_setup_cpu_maps() #define thread_group_shares_l2 0 +#define thread_group_shares_l3 0 static inline void inhibit_secondary_onlining(void) {} static inline void uninhibit_secondary_onlining(void) {} static inline const struct cpumask *cpu_sibling_mask(int cpu) diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index 20d91693eac1..cf1be75b7833 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -469,6 +469,9 @@ static int get_group_id(unsigned int cpu_id, int level) else if (thread_group_shares_l2 && level == 2) return cpumask_first(per_cpu(thread_group_l2_cache_map, cpu_id)); + else if (thread_group_shares_l3 && level == 3) + return cpumask_first(per_cpu(thread_group_l3_cache_map, + cpu_id)); return -1; } diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index a7fcac44a8e2..f2abd88e0c25 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -78,6 +78,7 @@ struct task_struct *secondary_current; bool has_big_cores; bool coregroup_enabled; bool thread_group_shares_l2; +bool thread_group_shares_l3; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); @@ -101,7 +102,7 @@ enum { #define MAX_THREAD_LIST_SIZE 8 #define THREAD_GROUP_SHARE_L1 1 -#define THREAD_GROUP_SHARE_L2 2 +#define THREAD_GROUP_SHARE_L2_L3 2 struct thread_groups { unsigned int property; unsigned int nr_groups; @@ -131,6 +132,12 @@ DEFINE_PER_CPU(cpumask_var_t, thread_group_l1_cache_map); */ DEFINE_PER_CPU(cpumask_var_t, thread_group_l2_cache_map); +/* + * On P10, thread_group_l3_cache_map for each CPU is equal to the + * thread_group_l2_cache_map + */ +DEFINE_PER_CPU(cpumask_var_t, thread_group_l3_cache_map); + /* SMP operations for this machine */ struct smp_ops_t *smp_ops; @@ -889,19 +896,41 @@ out: return tg; } +static int update_mask_from_threadgroup(cpumask_var_t *mask, struct thread_groups *tg, int cpu, int cpu_group_start) +{ + int first_thread = cpu_first_thread_sibling(cpu); + int i; + + zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu)); + + for (i = first_thread; i < first_thread + threads_per_core; i++) { + int i_group_start = get_cpu_thread_group_start(i, tg); + + if (unlikely(i_group_start == -1)) { + WARN_ON_ONCE(1); + return -ENODATA; + } + + if (i_group_start == cpu_group_start) + cpumask_set_cpu(i, *mask); + } + + return 0; +} + static int __init init_thread_group_cache_map(int cpu, int cache_property) { - int first_thread = cpu_first_thread_sibling(cpu); - int i, cpu_group_start = -1, err = 0; + int cpu_group_start = -1, err = 0; struct thread_groups *tg = NULL; cpumask_var_t *mask = NULL; if (cache_property != THREAD_GROUP_SHARE_L1 && - cache_property != THREAD_GROUP_SHARE_L2) + cache_property != THREAD_GROUP_SHARE_L2_L3) return -EINVAL; tg = get_thread_groups(cpu, cache_property, &err); + if (!tg) return err; @@ -912,25 +941,18 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property) return -ENODATA; } - if (cache_property == THREAD_GROUP_SHARE_L1) + if (cache_property == THREAD_GROUP_SHARE_L1) { mask = &per_cpu(thread_group_l1_cache_map, cpu); - else if (cache_property == THREAD_GROUP_SHARE_L2) + update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start); + } + else if (cache_property == THREAD_GROUP_SHARE_L2_L3) { mask = &per_cpu(thread_group_l2_cache_map, cpu); - - zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cpu)); - - for (i = first_thread; i < first_thread + threads_per_core; i++) { - int i_group_start = get_cpu_thread_group_start(i, tg); - - if (unlikely(i_group_start == -1)) { - WARN_ON_ONCE(1); - return -ENODATA; - } - - if (i_group_start == cpu_group_start) - cpumask_set_cpu(i, *mask); + update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start); + mask = &per_cpu(thread_group_l3_cache_map, cpu); + update_mask_from_threadgroup(mask, tg, cpu, cpu_group_start); } + return 0; } @@ -1020,14 +1042,16 @@ static int __init init_big_cores(void) has_big_cores = true; for_each_possible_cpu(cpu) { - int err = init_thread_group_cache_map(cpu, THREAD_GROUP_SHARE_L2); + int err = init_thread_group_cache_map(cpu, THREAD_GROUP_SHARE_L2_L3); if (err) return err; } thread_group_shares_l2 = true; - pr_debug("L2 cache only shared by the threads in the small core\n"); + thread_group_shares_l3 = true; + pr_debug("L2/L3 cache only shared by the threads in the small core\n"); + return 0; } -- cgit v1.2.3-59-g8ed1b From cf9c615cde49fb5d2480549c8c955a0a387798d3 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 21 Jul 2021 00:15:04 +1000 Subject: powerpc/64s/perf: Always use SIAR for kernel interrupts If an interrupt is taken in kernel mode, always use SIAR for it rather than looking at regs_sipr. This prevents samples piling up around interrupt enable (hard enable or interrupt replay via soft enable) in PMUs / modes where the PR sample indication is not in synch with SIAR. This results in better sampling of interrupt entry and exit in particular. Signed-off-by: Nicholas Piggin Tested-by: Athira Rajeev Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210720141504.420110-1-npiggin@gmail.com --- arch/powerpc/perf/core-book3s.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index bb0ee716de91..91203ed9d0ff 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -340,6 +340,13 @@ static inline void perf_read_regs(struct pt_regs *regs) * If the PMU doesn't update the SIAR for non marked events use * pt_regs. * + * If regs is a kernel interrupt, always use SIAR. Some PMUs have an + * issue with regs_sipr not being in synch with SIAR in interrupt entry + * and return sequences, which can result in regs_sipr being true for + * kernel interrupts and SIAR, which has the effect of causing samples + * to pile up at mtmsrd MSR[EE] 0->1 or pending irq replay around + * interrupt entry/exit. + * * If the PMU has HV/PR flags then check to see if they * place the exception in userspace. If so, use pt_regs. In * continuous sampling mode the SIAR and the PMU exception are @@ -356,6 +363,8 @@ static inline void perf_read_regs(struct pt_regs *regs) use_siar = 1; else if ((ppmu->flags & PPMU_NO_CONT_SAMPLING)) use_siar = 0; + else if (!user_mode(regs)) + use_siar = 1; else if (!(ppmu->flags & PPMU_NO_SIPR) && regs_sipr(regs)) use_siar = 0; else -- cgit v1.2.3-59-g8ed1b From 09ca497528dac12cbbceab8197011c875a96d053 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sun, 27 Jun 2021 17:09:18 +0000 Subject: powerpc: Remove in_kernel_text() Last user of in_kernel_text() stopped using in with commit 549e8152de80 ("powerpc: Make the 64-bit kernel as a position-independent executable"). Generic function is_kernel_text() does the same. So remote it. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/2a3a5b6f8cc0ef4e854d7b764f66aa8d2ee270d2.1624813698.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/sections.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 324d7b298ec3..6e4af4492a14 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -38,14 +38,6 @@ extern char start_virt_trampolines[]; extern char end_virt_trampolines[]; #endif -static inline int in_kernel_text(unsigned long addr) -{ - if (addr >= (unsigned long)_stext && addr < (unsigned long)__init_end) - return 1; - - return 0; -} - static inline unsigned long kernel_toc_addr(void) { /* Defined by the linker, see vmlinux.lds.S */ -- cgit v1.2.3-59-g8ed1b From c8a6d91005343dea0d53be0ff0620c66934dcd44 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 5 Jul 2021 12:00:50 +0000 Subject: powerpc/non-smp: Unconditionaly call smp_mb() on switch_mm Commit 3ccfebedd8cf ("powerpc, membarrier: Skip memory barrier in switch_mm()") added some logic to skip the smp_mb() in switch_mm_irqs_off() before the call to switch_mmu_context(). However, on non SMP smp_mb() is just a compiler barrier and doing it unconditionaly is simpler than the logic used to check whether the barrier is needed or not. After the patch: 00000000 : ... c: 7c 04 18 40 cmplw r4,r3 10: 81 24 00 24 lwz r9,36(r4) 14: 91 25 04 c8 stw r9,1224(r5) 18: 4d 82 00 20 beqlr 1c: 48 00 00 00 b 1c 1c: R_PPC_REL24 switch_mmu_context Before the patch: 00000000 : ... c: 7c 04 18 40 cmplw r4,r3 10: 81 24 00 24 lwz r9,36(r4) 14: 91 25 04 c8 stw r9,1224(r5) 18: 4d 82 00 20 beqlr 1c: 81 24 00 28 lwz r9,40(r4) 20: 71 29 00 0a andi. r9,r9,10 24: 40 82 00 34 bne 58 28: 48 00 00 00 b 28 28: R_PPC_REL24 switch_mmu_context ... 58: 2c 03 00 00 cmpwi r3,0 5c: 41 82 ff cc beq 28 60: 48 00 00 00 b 60 60: R_PPC_REL24 switch_mmu_context Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e9d501da0c59f60ca767b1b3ea4603fce6d02b9e.1625486440.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/membarrier.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h index 6e20bb5c74ea..de7f79157918 100644 --- a/arch/powerpc/include/asm/membarrier.h +++ b/arch/powerpc/include/asm/membarrier.h @@ -12,7 +12,8 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev, * when switching from userspace to kernel is not needed after * store to rq->curr. */ - if (likely(!(atomic_read(&next->membarrier_state) & + if (IS_ENABLED(CONFIG_SMP) && + likely(!(atomic_read(&next->membarrier_state) & (MEMBARRIER_STATE_PRIVATE_EXPEDITED | MEMBARRIER_STATE_GLOBAL_EXPEDITED)) || !prev)) return; -- cgit v1.2.3-59-g8ed1b From 9c7248bb8de31f51c693bfa6a6ea53b1c07e0fa8 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Tue, 11 May 2021 09:31:36 +0200 Subject: powerpc/numa: Consider the max NUMA node for migratable LPAR When a LPAR is migratable, we should consider the maximum possible NUMA node instead of the number of NUMA nodes from the actual system. The DT property 'ibm,current-associativity-domains' defines the maximum number of nodes the LPAR can see when running on that box. But if the LPAR is being migrated on another box, it may see up to the nodes defined by 'ibm,max-associativity-domains'. So if a LPAR is migratable, that value should be used. Unfortunately, there is no easy way to know if an LPAR is migratable or not. The hypervisor exports the property 'ibm,migratable-partition' in the case it set to migrate partition, but that would not mean that the current partition is migratable. Without this patch, when a LPAR is started on a 2 node box and then migrated to a 3 node box, the hypervisor may spread the LPAR's CPUs on the 3rd node. In that case if a CPU from that 3rd node is added to the LPAR, it will be wrongly assigned to the node because the kernel has been set to use up to 2 nodes (the configuration of the departure node). With this patch applies, the CPU is correctly added to the 3rd node. Fixes: f9f130ff2ec9 ("powerpc/numa: Detect support for coregroup") Signed-off-by: Laurent Dufour Reviewed-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210511073136.17795-1-ldufour@linux.ibm.com --- arch/powerpc/mm/numa.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f2bf98bdcea2..094a1076fd1f 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -893,7 +893,7 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) static void __init find_possible_nodes(void) { struct device_node *rtas; - const __be32 *domains; + const __be32 *domains = NULL; int prop_length, max_nodes; u32 i; @@ -909,9 +909,14 @@ static void __init find_possible_nodes(void) * it doesn't exist, then fallback on ibm,max-associativity-domains. * Current denotes what the platform can support compared to max * which denotes what the Hypervisor can support. + * + * If the LPAR is migratable, new nodes might be activated after a LPM, + * so we should consider the max number in that case. */ - domains = of_get_property(rtas, "ibm,current-associativity-domains", - &prop_length); + if (!of_get_property(of_root, "ibm,migratable-partition", NULL)) + domains = of_get_property(rtas, + "ibm,current-associativity-domains", + &prop_length); if (!domains) { domains = of_get_property(rtas, "ibm,max-associativity-domains", &prop_length); @@ -920,6 +925,8 @@ static void __init find_possible_nodes(void) } max_nodes = of_read_number(&domains[min_common_depth], 1); + pr_info("Partition configured for %d NUMA nodes.\n", max_nodes); + for (i = 0; i < max_nodes; i++) { if (!node_possible(i)) node_set(i, node_possible_map); -- cgit v1.2.3-59-g8ed1b From d144f4d5a8a804133d20ff311d7be70bcdbfaac2 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Mon, 17 May 2021 11:06:06 +0200 Subject: pseries/drmem: update LMBs after LPM After a LPM, the device tree node ibm,dynamic-reconfiguration-memory may be updated by the hypervisor in the case the NUMA topology of the LPAR's memory is updated. This is handled by the kernel, but the memory's node is not updated because there is no way to move a memory block between nodes from the Linux kernel point of view. If later a memory block is added or removed, drmem_update_dt() is called and it is overwriting the DT node ibm,dynamic-reconfiguration-memory to match the added or removed LMB. But the LMB's associativity node has not been updated after the DT node update and thus the node is overwritten by the Linux's topology instead of the hypervisor one. Introduce a hook called when the ibm,dynamic-reconfiguration-memory node is updated to force an update of the LMB's associativity. However, ignore the call to that hook when the update has been triggered by drmem_update_dt(). Because, in that case, the LMB tree has been used to set the DT property and thus it doesn't need to be updated back. Since drmem_update_dt() is called under the protection of the device_hotplug_lock and the hook is called in the same context, use a simple boolean variable to detect that call. Signed-off-by: Laurent Dufour Reviewed-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210517090606.56930-1-ldufour@linux.ibm.com --- arch/powerpc/include/asm/drmem.h | 1 + arch/powerpc/mm/drmem.c | 46 +++++++++++++++++++++++++ arch/powerpc/platforms/pseries/hotplug-memory.c | 4 +++ 3 files changed, 51 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/drmem.h b/arch/powerpc/include/asm/drmem.h index bf2402fed3e0..4265d5e95c2c 100644 --- a/arch/powerpc/include/asm/drmem.h +++ b/arch/powerpc/include/asm/drmem.h @@ -111,6 +111,7 @@ int drmem_update_dt(void); int __init walk_drmem_lmbs_early(unsigned long node, void *data, int (*func)(struct drmem_lmb *, const __be32 **, void *)); +void drmem_update_lmbs(struct property *prop); #endif static inline void invalidate_lmb_associativity_index(struct drmem_lmb *lmb) diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 9af3832c9d8d..22197b18d85e 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -18,6 +18,7 @@ static int n_root_addr_cells, n_root_size_cells; static struct drmem_lmb_info __drmem_info; struct drmem_lmb_info *drmem_info = &__drmem_info; +static bool in_drmem_update; u64 drmem_lmb_memory_max(void) { @@ -178,6 +179,11 @@ int drmem_update_dt(void) if (!memory) return -1; + /* + * Set in_drmem_update to prevent the notifier callback to process the + * DT property back since the change is coming from the LMB tree. + */ + in_drmem_update = true; prop = of_find_property(memory, "ibm,dynamic-memory", NULL); if (prop) { rc = drmem_update_dt_v1(memory, prop); @@ -186,6 +192,7 @@ int drmem_update_dt(void) if (prop) rc = drmem_update_dt_v2(memory, prop); } + in_drmem_update = false; of_node_put(memory); return rc; @@ -307,6 +314,45 @@ int __init walk_drmem_lmbs_early(unsigned long node, void *data, return ret; } +/* + * Update the LMB associativity index. + */ +static int update_lmb(struct drmem_lmb *updated_lmb, + __maybe_unused const __be32 **usm, + __maybe_unused void *data) +{ + struct drmem_lmb *lmb; + + for_each_drmem_lmb(lmb) { + if (lmb->drc_index != updated_lmb->drc_index) + continue; + + lmb->aa_index = updated_lmb->aa_index; + break; + } + return 0; +} + +/* + * Update the LMB associativity index. + * + * This needs to be called when the hypervisor is updating the + * dynamic-reconfiguration-memory node property. + */ +void drmem_update_lmbs(struct property *prop) +{ + /* + * Don't update the LMBs if triggered by the update done in + * drmem_update_dt(), the LMB values have been used to the update the DT + * property in that case. + */ + if (in_drmem_update) + return; + if (!strcmp(prop->name, "ibm,dynamic-memory")) + __walk_drmem_v1_lmbs(prop->value, NULL, NULL, update_lmb); + else if (!strcmp(prop->name, "ibm,dynamic-memory-v2")) + __walk_drmem_v2_lmbs(prop->value, NULL, NULL, update_lmb); +} #endif static int init_drmem_lmb_size(struct device_node *dn) diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 377d852f5a9a..0beb3ca2b549 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -979,6 +979,10 @@ static int pseries_memory_notifier(struct notifier_block *nb, case OF_RECONFIG_DETACH_NODE: err = pseries_remove_mem_node(rd->dn); break; + case OF_RECONFIG_UPDATE_PROPERTY: + if (!strcmp(rd->dn->name, + "ibm,dynamic-reconfiguration-memory")) + drmem_update_lmbs(rd->prop); } return notifier_from_errno(err); } -- cgit v1.2.3-59-g8ed1b From bd1dd4c5f5286df0148b5b316f37c583b8f55fa1 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 29 Apr 2021 19:49:08 +0200 Subject: powerpc/pseries: Prevent free CPU ids being reused on another node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a CPU is hot added, the CPU ids are taken from the available mask from the lower possible set. If that set of values was previously used for a CPU attached to a different node, it appears to an application as if these CPUs have migrated from one node to another node which is not expected. To prevent this, it is needed to record the CPU ids used for each node and to not reuse them on another node. However, to prevent CPU hot plug to fail, in the case the CPU ids is starved on a node, the capability to reuse other nodes’ free CPU ids is kept. A warning is displayed in such a case to warn the user. A new CPU bit mask (node_recorded_ids_map) is introduced for each possible node. It is populated with the CPU onlined at boot time, and then when a CPU is hot plugged to a node. The bits in that mask remain when the CPU is hot unplugged, to remind this CPU ids have been used for this node. If no id set was found, a retry is made without removing the ids used on the other nodes to try reusing them. This is the way ids have been allocated prior to this patch. The effect of this patch can be seen by removing and adding CPUs using the Qemu monitor. In the following case, the first CPU from the node 2 is removed, then the first one from the node 1 is removed too. Later, the first CPU of the node 2 is added back. Without that patch, the kernel will number these CPUs using the first CPU ids available which are the ones freed when removing the second CPU of the node 0. This leads to the CPU ids 16-23 to move from the node 1 to the node 2. With the patch applied, the CPU ids 32-39 are used since they are the lowest free ones which have not been used on another node. At boot time: [root@vm40 ~]# numactl -H | grep cpus node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 node 1 cpus: 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 node 2 cpus: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 Vanilla kernel, after the CPU hot unplug/plug operations: [root@vm40 ~]# numactl -H | grep cpus node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 node 1 cpus: 24 25 26 27 28 29 30 31 node 2 cpus: 16 17 18 19 20 21 22 23 40 41 42 43 44 45 46 47 Patched kernel, after the CPU hot unplug/plug operations: [root@vm40 ~]# numactl -H | grep cpus node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 node 1 cpus: 24 25 26 27 28 29 30 31 node 2 cpus: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 Signed-off-by: Laurent Dufour Reviewed-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210429174908.16613-1-ldufour@linux.ibm.com --- arch/powerpc/platforms/pseries/hotplug-cpu.c | 171 +++++++++++++++++++++------ 1 file changed, 132 insertions(+), 39 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 7e970f81d8ff..e1f224320102 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -39,6 +39,12 @@ /* This version can't take the spinlock, because it never returns */ static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE; +/* + * Record the CPU ids used on each nodes. + * Protected by cpu_add_remove_lock. + */ +static cpumask_var_t node_recorded_ids_map[MAX_NUMNODES]; + static void rtas_stop_self(void) { static struct rtas_args args; @@ -139,72 +145,148 @@ static void pseries_cpu_die(unsigned int cpu) paca_ptrs[cpu]->cpu_start = 0; } +/** + * find_cpu_id_range - found a linear ranger of @nthreads free CPU ids. + * @nthreads : the number of threads (cpu ids) + * @assigned_node : the node it belongs to or NUMA_NO_NODE if free ids from any + * node can be peek. + * @cpu_mask: the returned CPU mask. + * + * Returns 0 on success. + */ +static int find_cpu_id_range(unsigned int nthreads, int assigned_node, + cpumask_var_t *cpu_mask) +{ + cpumask_var_t candidate_mask; + unsigned int cpu, node; + int rc = -ENOSPC; + + if (!zalloc_cpumask_var(&candidate_mask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_clear(*cpu_mask); + for (cpu = 0; cpu < nthreads; cpu++) + cpumask_set_cpu(cpu, *cpu_mask); + + BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask)); + + /* Get a bitmap of unoccupied slots. */ + cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask); + + if (assigned_node != NUMA_NO_NODE) { + /* + * Remove free ids previously assigned on the other nodes. We + * can walk only online nodes because once a node became online + * it is not turned offlined back. + */ + for_each_online_node(node) { + if (node == assigned_node) + continue; + cpumask_andnot(candidate_mask, candidate_mask, + node_recorded_ids_map[node]); + } + } + + if (cpumask_empty(candidate_mask)) + goto out; + + while (!cpumask_empty(*cpu_mask)) { + if (cpumask_subset(*cpu_mask, candidate_mask)) + /* Found a range where we can insert the new cpu(s) */ + break; + cpumask_shift_left(*cpu_mask, *cpu_mask, nthreads); + } + + if (!cpumask_empty(*cpu_mask)) + rc = 0; + +out: + free_cpumask_var(candidate_mask); + return rc; +} + /* * Update cpu_present_mask and paca(s) for a new cpu node. The wrinkle - * here is that a cpu device node may represent up to two logical cpus + * here is that a cpu device node may represent multiple logical cpus * in the SMT case. We must honor the assumption in other code that * the logical ids for sibling SMT threads x and y are adjacent, such * that x^1 == y and y^1 == x. */ static int pseries_add_processor(struct device_node *np) { - unsigned int cpu; - cpumask_var_t candidate_mask, tmp; - int err = -ENOSPC, len, nthreads, i; + int len, nthreads, node, cpu, assigned_node; + int rc = 0; + cpumask_var_t cpu_mask; const __be32 *intserv; intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len); if (!intserv) return 0; - zalloc_cpumask_var(&candidate_mask, GFP_KERNEL); - zalloc_cpumask_var(&tmp, GFP_KERNEL); - nthreads = len / sizeof(u32); - for (i = 0; i < nthreads; i++) - cpumask_set_cpu(i, tmp); - cpu_maps_update_begin(); + if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; - BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask)); + /* + * Fetch from the DT nodes read by dlpar_configure_connector() the NUMA + * node id the added CPU belongs to. + */ + node = of_node_to_nid(np); + if (node < 0 || !node_possible(node)) + node = first_online_node; - /* Get a bitmap of unoccupied slots. */ - cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask); - if (cpumask_empty(candidate_mask)) { - /* If we get here, it most likely means that NR_CPUS is - * less than the partition's max processors setting. + BUG_ON(node == NUMA_NO_NODE); + assigned_node = node; + + cpu_maps_update_begin(); + + rc = find_cpu_id_range(nthreads, node, &cpu_mask); + if (rc && nr_node_ids > 1) { + /* + * Try again, considering the free CPU ids from the other node. */ - printk(KERN_ERR "Cannot add cpu %pOF; this system configuration" - " supports %d logical cpus.\n", np, - num_possible_cpus()); - goto out_unlock; + node = NUMA_NO_NODE; + rc = find_cpu_id_range(nthreads, NUMA_NO_NODE, &cpu_mask); } - while (!cpumask_empty(tmp)) - if (cpumask_subset(tmp, candidate_mask)) - /* Found a range where we can insert the new cpu(s) */ - break; - else - cpumask_shift_left(tmp, tmp, nthreads); - - if (cpumask_empty(tmp)) { - printk(KERN_ERR "Unable to find space in cpu_present_mask for" - " processor %pOFn with %d thread(s)\n", np, - nthreads); - goto out_unlock; + if (rc) { + pr_err("Cannot add cpu %pOF; this system configuration" + " supports %d logical cpus.\n", np, num_possible_cpus()); + goto out; } - for_each_cpu(cpu, tmp) { + for_each_cpu(cpu, cpu_mask) { BUG_ON(cpu_present(cpu)); set_cpu_present(cpu, true); set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++)); } - err = 0; -out_unlock: + + /* Record the newly used CPU ids for the associate node. */ + cpumask_or(node_recorded_ids_map[assigned_node], + node_recorded_ids_map[assigned_node], cpu_mask); + + /* + * If node is set to NUMA_NO_NODE, CPU ids have be reused from + * another node, remove them from its mask. + */ + if (node == NUMA_NO_NODE) { + cpu = cpumask_first(cpu_mask); + pr_warn("Reusing free CPU ids %d-%d from another node\n", + cpu, cpu + nthreads - 1); + for_each_online_node(node) { + if (node == assigned_node) + continue; + cpumask_andnot(node_recorded_ids_map[node], + node_recorded_ids_map[node], + cpu_mask); + } + } + +out: cpu_maps_update_done(); - free_cpumask_var(candidate_mask); - free_cpumask_var(tmp); - return err; + free_cpumask_var(cpu_mask); + return rc; } /* @@ -908,6 +990,7 @@ static struct notifier_block pseries_smp_nb = { static int __init pseries_cpu_hotplug_init(void) { int qcss_tok; + unsigned int node; #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE ppc_md.cpu_probe = dlpar_cpu_probe; @@ -929,8 +1012,18 @@ static int __init pseries_cpu_hotplug_init(void) smp_ops->cpu_die = pseries_cpu_die; /* Processors can be added/removed only on LPAR */ - if (firmware_has_feature(FW_FEATURE_LPAR)) + if (firmware_has_feature(FW_FEATURE_LPAR)) { + for_each_node(node) { + alloc_bootmem_cpumask_var(&node_recorded_ids_map[node]); + + /* Record ids of CPU added at boot time */ + cpumask_or(node_recorded_ids_map[node], + node_recorded_ids_map[node], + node_to_cpumask_map[node]); + } + of_reconfig_notifier_register(&pseries_smp_nb); + } return 0; } -- cgit v1.2.3-59-g8ed1b From c00103abf76fd3916596afd07dd3fdeee0dca15d Mon Sep 17 00:00:00 2001 From: kernel test robot Date: Tue, 3 Aug 2021 16:59:55 +0200 Subject: powerpc/kexec: fix for_each_child.cocci warning for_each_node_by_type should have of_node_put() before return. Generated by: scripts/coccinelle/iterators/for_each_child.cocci Reported-by: kernel test robot Signed-off-by: kernel test robot Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/alpine.DEB.2.22.394.2108031654080.17639@hadrien --- arch/powerpc/kexec/core_64.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 84618d3c8013..89c069d664a5 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -64,8 +64,10 @@ int default_machine_kexec_prepare(struct kimage *image) begin = image->segment[i].mem; end = begin + image->segment[i].memsz; - if ((begin < high) && (end > low)) + if ((begin < high) && (end > low)) { + of_node_put(node); return -ETXTBSY; + } } } -- cgit v1.2.3-59-g8ed1b From 5ae36401ca4ea2737d779ce7c267444b16530001 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Aug 2021 16:15:46 +0200 Subject: powerpc: Replace deprecated CPU-hotplug functions. The functions get_online_cpus() and put_online_cpus() have been deprecated during the CPU hotplug rework. They map directly to cpus_read_lock() and cpus_read_unlock(). Replace deprecated CPU-hotplug functions with the official version. The behavior remains unchanged. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210803141621.780504-4-bigeasy@linutronix.de --- arch/powerpc/kernel/rtasd.c | 4 ++-- arch/powerpc/kvm/book3s_hv_builtin.c | 10 +++++----- arch/powerpc/platforms/powernv/idle.c | 4 ++-- arch/powerpc/platforms/powernv/opal-imc.c | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 8561dfb33f24..32ee17753eb4 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -429,7 +429,7 @@ static void rtas_event_scan(struct work_struct *w) do_event_scan(); - get_online_cpus(); + cpus_read_lock(); /* raw_ OK because just using CPU as starting point. */ cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask); @@ -451,7 +451,7 @@ static void rtas_event_scan(struct work_struct *w) schedule_delayed_work_on(cpu, &event_scan_work, __round_jiffies_relative(event_scan_delay, cpu)); - put_online_cpus(); + cpus_read_unlock(); } #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index be8ef1c5b1bf..fcf4760a3a0e 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -137,23 +137,23 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target, * exist in the system. We use a counter of VMs to track this. * * One of the operations we need to block is onlining of secondaries, so we - * protect hv_vm_count with get/put_online_cpus(). + * protect hv_vm_count with cpus_read_lock/unlock(). */ static atomic_t hv_vm_count; void kvm_hv_vm_activated(void) { - get_online_cpus(); + cpus_read_lock(); atomic_inc(&hv_vm_count); - put_online_cpus(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(kvm_hv_vm_activated); void kvm_hv_vm_deactivated(void) { - get_online_cpus(); + cpus_read_lock(); atomic_dec(&hv_vm_count); - put_online_cpus(); + cpus_read_unlock(); } EXPORT_SYMBOL_GPL(kvm_hv_vm_deactivated); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 528a7e0cf83a..aa27689b832d 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -199,12 +199,12 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev, */ power7_fastsleep_workaround_exit = false; - get_online_cpus(); + cpus_read_lock(); primary_thread_mask = cpu_online_cores_map(); on_each_cpu_mask(&primary_thread_mask, pnv_fastsleep_workaround_apply, &err, 1); - put_online_cpus(); + cpus_read_unlock(); if (err) { pr_err("fastsleep_workaround_applyonce change failed while running pnv_fastsleep_workaround_apply"); goto fail; diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index 7824cc364bc4..ba02a75c1410 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -186,7 +186,7 @@ static void disable_nest_pmu_counters(void) int nid, cpu; const struct cpumask *l_cpumask; - get_online_cpus(); + cpus_read_lock(); for_each_node_with_cpus(nid) { l_cpumask = cpumask_of_node(nid); cpu = cpumask_first_and(l_cpumask, cpu_online_mask); @@ -195,7 +195,7 @@ static void disable_nest_pmu_counters(void) opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, get_hard_smp_processor_id(cpu)); } - put_online_cpus(); + cpus_read_unlock(); } static void disable_core_pmu_counters(void) @@ -203,7 +203,7 @@ static void disable_core_pmu_counters(void) cpumask_t cores_map; int cpu, rc; - get_online_cpus(); + cpus_read_lock(); /* Disable the IMC Core functions */ cores_map = cpu_online_cores_map(); for_each_cpu(cpu, &cores_map) { @@ -213,7 +213,7 @@ static void disable_core_pmu_counters(void) pr_err("%s: Failed to stop Core (cpu = %d)\n", __FUNCTION__, cpu); } - put_online_cpus(); + cpus_read_unlock(); } int get_max_nest_dev(void) -- cgit v1.2.3-59-g8ed1b From 27fd1111051dc218e5b6cb2da5dbb3f342879ff1 Mon Sep 17 00:00:00 2001 From: Jordan Niethe Date: Wed, 4 Aug 2021 11:37:24 +1000 Subject: powerpc: Always inline radix_enabled() to fix build failure This is the same as commit acdad8fb4a15 ("powerpc: Force inlining of mmu_has_feature to fix build failure") but for radix_enabled(). The config in the linked bugzilla causes the following build failure: LD .tmp_vmlinux.kallsyms1 powerpc64-linux-ld: arch/powerpc/mm/pgtable.o: in function `.__ptep_set_access_flags': pgtable.c:(.text+0x17c): undefined reference to `.radix__ptep_set_access_flags' powerpc64-linux-ld: arch/powerpc/mm/pageattr.o: in function `.change_page_attr': pageattr.c:(.text+0xc0): undefined reference to `.radix__flush_tlb_kernel_range' etc. This is due to radix_enabled() not being inlined. See extract from building with -Winline: In file included from arch/powerpc/include/asm/lppaca.h:46, from arch/powerpc/include/asm/paca.h:17, from arch/powerpc/include/asm/current.h:13, from include/linux/thread_info.h:23, from include/asm-generic/preempt.h:5, from ./arch/powerpc/include/generated/asm/preempt.h:1, from include/linux/preempt.h:78, from include/linux/spinlock.h:51, from include/linux/mmzone.h:8, from include/linux/gfp.h:6, from arch/powerpc/mm/pgtable.c:21: arch/powerpc/include/asm/book3s/64/pgtable.h: In function '__ptep_set_access_flags': arch/powerpc/include/asm/mmu.h:327:20: error: inlining failed in call to 'radix_enabled': call is unlikely and code size would grow [-Werror=inline] The code relies on constant folding of MMU_FTRS_POSSIBLE at buildtime and elimination of non possible parts of code at compile time. For this to work radix_enabled() must be inlined so make it __always_inline. Reported-by: Erhard F. Suggested-by: Michael Ellerman Tested-by: Randy Dunlap Signed-off-by: Jordan Niethe [mpe: Trimmed error messages in change log] Signed-off-by: Michael Ellerman Link: https://bugzilla.kernel.org/show_bug.cgi?id=213803 Link: https://lore.kernel.org/r/20210804013724.514468-1-jniethe5@gmail.com --- arch/powerpc/include/asm/mmu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 27016b98ecb2..8abe8e42e045 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -324,7 +324,7 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr) } #endif /* !CONFIG_DEBUG_VM */ -static inline bool radix_enabled(void) +static __always_inline bool radix_enabled(void) { return mmu_has_feature(MMU_FTR_TYPE_RADIX); } -- cgit v1.2.3-59-g8ed1b From 9b49f979b3d560cb75ea9f1a596baf432d566798 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 5 Aug 2021 11:20:05 +1000 Subject: powerpc/configs: Disable legacy ptys on microwatt defconfig We shouldn't need legacy ptys, and disabling the option improves boot time by about 0.5 seconds. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210805112005.3cb1f412@kryten.localdomain --- arch/powerpc/configs/microwatt_defconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/configs/microwatt_defconfig b/arch/powerpc/configs/microwatt_defconfig index a08b739123da..ebc90aefbc0c 100644 --- a/arch/powerpc/configs/microwatt_defconfig +++ b/arch/powerpc/configs/microwatt_defconfig @@ -57,6 +57,7 @@ CONFIG_NETDEVICES=y # CONFIG_INPUT is not set # CONFIG_SERIO is not set # CONFIG_VT is not set +# CONFIG_LEGACY_PTYS is not set CONFIG_SERIAL_8250=y # CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set CONFIG_SERIAL_8250_CONSOLE=y -- cgit v1.2.3-59-g8ed1b From 2ac78e0c00184a9ba53d507be7549c69a3f566b6 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 5 Aug 2021 17:56:49 +1000 Subject: KVM: PPC: Use arch_get_random_seed_long instead of powernv variant The powernv_get_random_long() does not work in nested KVM (which is pseries) and produces a crash when accessing in_be64(rng->regs) in powernv_get_random_long(). This replaces powernv_get_random_long with the ppc_md machine hook wrapper. Signed-off-by: Alexey Kardashevskiy Reviewed-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210805075649.2086567-1-aik@ozlabs.ru --- arch/powerpc/kvm/book3s_hv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 085fb8ecbf68..9f957ceee58a 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1165,7 +1165,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) break; #endif case H_RANDOM: - if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4])) + if (!arch_get_random_seed_long(&vcpu->arch.regs.gpr[4])) ret = H_HARDWARE; break; case H_RPT_INVALIDATE: -- cgit v1.2.3-59-g8ed1b From 786e5b102a0007d81579822eac23cb5bfaa0b65f Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:19 +0200 Subject: powerpc/pseries/pci: Introduce __find_pe_total_msi() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It will help to size the PCI MSI domain. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-2-clg@kaod.org --- arch/powerpc/platforms/pseries/msi.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 637300330507..d2d090e04745 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -164,12 +164,12 @@ static int check_req_msix(struct pci_dev *pdev, int nvec) /* Quota calculation */ -static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) +static struct device_node *__find_pe_total_msi(struct device_node *node, int *total) { struct device_node *dn; const __be32 *p; - dn = of_node_get(pci_device_to_OF_node(dev)); + dn = of_node_get(node); while (dn) { p = of_get_property(dn, "ibm,pe-total-#msi", NULL); if (p) { @@ -185,6 +185,11 @@ static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) return NULL; } +static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) +{ + return __find_pe_total_msi(pci_device_to_OF_node(dev), total); +} + static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) { struct device_node *dn; -- cgit v1.2.3-59-g8ed1b From e81202007363bd694b711f307f02320b5f98edaa Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:20 +0200 Subject: powerpc/pseries/pci: Introduce rtas_prepare_msi_irqs() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This splits the routine setting the MSIs in two parts: allocation of MSIs for the PCI device at the FW level (RTAS) and the actual mapping and activation of the IRQs. rtas_prepare_msi_irqs() will serve as a handler for the PCI MSI domain. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-3-clg@kaod.org --- arch/powerpc/platforms/pseries/msi.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index d2d090e04745..4bf14f27e1aa 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -373,12 +373,11 @@ static void rtas_hack_32bit_msi_gen2(struct pci_dev *pdev) pci_write_config_dword(pdev, pdev->msi_cap + PCI_MSI_ADDRESS_HI, 0); } -static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type) +static int rtas_prepare_msi_irqs(struct pci_dev *pdev, int nvec_in, int type, + msi_alloc_info_t *arg) { struct pci_dn *pdn; - int hwirq, virq, i, quota, rc; - struct msi_desc *entry; - struct msi_msg msg; + int quota, rc; int nvec = nvec_in; int use_32bit_msi_hack = 0; @@ -456,6 +455,22 @@ again: return rc; } + return 0; +} + +static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type) +{ + struct pci_dn *pdn; + int hwirq, virq, i; + int rc; + struct msi_desc *entry; + struct msi_msg msg; + + rc = rtas_prepare_msi_irqs(pdev, nvec_in, type, NULL); + if (rc) + return rc; + + pdn = pci_get_pdn(pdev); i = 0; for_each_pci_msi_entry(entry, pdev) { hwirq = rtas_query_irq_number(pdn, i++); -- cgit v1.2.3-59-g8ed1b From 14be098c5387eb93b794f299f3c3e2ddf6038ec7 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:21 +0200 Subject: powerpc/xive: Add support for IRQ domain hierarchy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds handlers to allocate/free IRQs in a domain hierarchy. We could try to use xive_irq_domain_map() in xive_irq_domain_alloc() but we rely on xive_irq_alloc_data() to set the IRQ handler data and duplicating the code is simpler. xive_irq_free_data() needs to be called when IRQ are freed to clear the MMIO mappings and free the XIVE handler data, xive_irq_data structure. This is going to be a problem with MSI domains which we will address later. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-4-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 64 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index dbdbbc2f1dc5..420d96deb7b6 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1366,7 +1366,71 @@ static void xive_irq_domain_debug_show(struct seq_file *m, struct irq_domain *d, } #endif +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +static int xive_irq_domain_translate(struct irq_domain *d, + struct irq_fwspec *fwspec, + unsigned long *hwirq, + unsigned int *type) +{ + return xive_irq_domain_xlate(d, to_of_node(fwspec->fwnode), + fwspec->param, fwspec->param_count, + hwirq, type); +} + +static int xive_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_fwspec *fwspec = arg; + irq_hw_number_t hwirq; + unsigned int type = IRQ_TYPE_NONE; + int i, rc; + + rc = xive_irq_domain_translate(domain, fwspec, &hwirq, &type); + if (rc) + return rc; + + pr_debug("%s %d/%lx #%d\n", __func__, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) { + /* TODO: call xive_irq_domain_map() */ + + /* + * Mark interrupts as edge sensitive by default so that resend + * actually works. Will fix that up below if needed. + */ + irq_clear_status_flags(virq, IRQ_LEVEL); + + /* allocates and sets handler data */ + rc = xive_irq_alloc_data(virq + i, hwirq + i); + if (rc) + return rc; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &xive_irq_chip, domain->host_data); + irq_set_handler(virq + i, handle_fasteoi_irq); + } + + return 0; +} + +static void xive_irq_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + int i; + + pr_debug("%s %d #%d\n", __func__, virq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) + xive_irq_free_data(virq + i); +} +#endif + static const struct irq_domain_ops xive_irq_domain_ops = { +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + .alloc = xive_irq_domain_alloc, + .free = xive_irq_domain_free, + .translate = xive_irq_domain_translate, +#endif .match = xive_irq_domain_match, .map = xive_irq_domain_map, .unmap = xive_irq_domain_unmap, -- cgit v1.2.3-59-g8ed1b From 6c2ab2a5d634d4e30445ee5d52d5d1469bf74aa2 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:22 +0200 Subject: powerpc/xive: Ease debugging of xive_irq_set_affinity() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pr_debug() is easier to activate and it helps to know how the kernel configures the HW when tweaking the IRQ subsystem. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-5-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 420d96deb7b6..823f9fe3542a 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -713,7 +713,7 @@ static int xive_irq_set_affinity(struct irq_data *d, u32 target, old_target; int rc = 0; - pr_devel("xive_irq_set_affinity: irq %d\n", d->irq); + pr_debug("%s: irq %d/%x\n", __func__, d->irq, hw_irq); /* Is this valid ? */ if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) @@ -758,7 +758,7 @@ static int xive_irq_set_affinity(struct irq_data *d, return rc; } - pr_devel(" target: 0x%x\n", target); + pr_debug(" target: 0x%x\n", target); xd->target = target; /* Give up previous target */ -- cgit v1.2.3-59-g8ed1b From a5f3d2c17b07e69166b93209f34a5fb8271a6810 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:23 +0200 Subject: powerpc/pseries/pci: Add MSI domains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two IRQ domains are added on top of default machine IRQ domain. First, the top level "pSeries-PCI-MSI" domain deals with the MSI specificities. In this domain, the HW IRQ numbers are generated by the PCI MSI layer, they compose a unique ID for an MSI source with the PCI device identifier and the MSI vector number. These numbers can be quite large on a pSeries machine running under the IBM Hypervisor and /sys/kernel/irq/ and /proc/interrupts will require small fixes to show them correctly. Second domain is the in-the-middle "pSeries-MSI" domain which acts as a proxy between the PCI MSI subsystem and the machine IRQ subsystem. It usually allocate the MSI vector numbers but, on pSeries machines, this is done by the RTAS FW and RTAS returns IRQ numbers in the IRQ number space of the machine. This is why the in-the-middle "pSeries-MSI" domain has the same HW IRQ numbers as its parent domain. Only the XIVE (P9/P10) parent domain is supported for now. We still need to add support for IRQ domain hierarchy under XICS. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-6-clg@kaod.org --- arch/powerpc/include/asm/pci-bridge.h | 5 + arch/powerpc/kernel/pci-common.c | 6 + arch/powerpc/platforms/pseries/msi.c | 185 +++++++++++++++++++++++++++++++ arch/powerpc/platforms/pseries/pseries.h | 1 + arch/powerpc/platforms/pseries/setup.c | 2 + 5 files changed, 199 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index 74424c14515c..90f488fa4c17 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -126,6 +126,11 @@ struct pci_controller { #endif /* CONFIG_PPC64 */ void *private_data; + + /* IRQ domain hierarchy */ + struct irq_domain *dev_domain; + struct irq_domain *msi_domain; + struct fwnode_handle *fwnode; }; /* These are used for config access before all the PCI probing diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 001e90cd8948..c3573430919d 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -1060,11 +1061,16 @@ void pcibios_bus_add_device(struct pci_dev *dev) int pcibios_add_device(struct pci_dev *dev) { + struct irq_domain *d; + #ifdef CONFIG_PCI_IOV if (ppc_md.pcibios_fixup_sriov) ppc_md.pcibios_fixup_sriov(dev); #endif /* CONFIG_PCI_IOV */ + d = dev_get_msi_domain(&dev->bus->dev); + if (d) + dev_set_msi_domain(&dev->dev, d); return 0; } diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 4bf14f27e1aa..86c6809ebac2 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "pseries.h" @@ -518,6 +519,190 @@ static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type) return 0; } +static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct msi_desc *desc = first_pci_msi_entry(pdev); + int type = desc->msi_attrib.is_msix ? PCI_CAP_ID_MSIX : PCI_CAP_ID_MSI; + + return rtas_prepare_msi_irqs(pdev, nvec, type, arg); +} + +static struct msi_domain_ops pseries_pci_msi_domain_ops = { + .msi_prepare = pseries_msi_ops_prepare, +}; + +static void pseries_msi_shutdown(struct irq_data *d) +{ + d = d->parent_data; + if (d->chip->irq_shutdown) + d->chip->irq_shutdown(d); +} + +static void pseries_msi_mask(struct irq_data *d) +{ + pci_msi_mask_irq(d); + irq_chip_mask_parent(d); +} + +static void pseries_msi_unmask(struct irq_data *d) +{ + pci_msi_unmask_irq(d); + irq_chip_unmask_parent(d); +} + +static struct irq_chip pseries_pci_msi_irq_chip = { + .name = "pSeries-PCI-MSI", + .irq_shutdown = pseries_msi_shutdown, + .irq_mask = pseries_msi_mask, + .irq_unmask = pseries_msi_unmask, + .irq_eoi = irq_chip_eoi_parent, +}; + +static struct msi_domain_info pseries_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX), + .ops = &pseries_pci_msi_domain_ops, + .chip = &pseries_pci_msi_irq_chip, +}; + +static void pseries_msi_compose_msg(struct irq_data *data, struct msi_msg *msg) +{ + __pci_read_msi_msg(irq_data_get_msi_desc(data), msg); +} + +static struct irq_chip pseries_msi_irq_chip = { + .name = "pSeries-MSI", + .irq_shutdown = pseries_msi_shutdown, + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_compose_msi_msg = pseries_msi_compose_msg, +}; + +static int pseries_irq_parent_domain_alloc(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) +{ + struct irq_fwspec parent_fwspec; + int ret; + + parent_fwspec.fwnode = domain->parent->fwnode; + parent_fwspec.param_count = 2; + parent_fwspec.param[0] = hwirq; + parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec); + if (ret) + return ret; + + return 0; +} + +static int pseries_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct pci_controller *phb = domain->host_data; + msi_alloc_info_t *info = arg; + struct msi_desc *desc = info->desc; + struct pci_dev *pdev = msi_desc_to_pci_dev(desc); + int hwirq; + int i, ret; + + hwirq = rtas_query_irq_number(pci_get_pdn(pdev), desc->msi_attrib.entry_nr); + if (hwirq < 0) { + dev_err(&pdev->dev, "Failed to query HW IRQ: %d\n", hwirq); + return hwirq; + } + + dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__, + phb->dn, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) { + ret = pseries_irq_parent_domain_alloc(domain, virq + i, hwirq + i); + if (ret) + goto out; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &pseries_msi_irq_chip, domain->host_data); + } + + return 0; + +out: + /* TODO: handle RTAS cleanup in ->msi_finish() ? */ + irq_domain_free_irqs_parent(domain, virq, i - 1); + return ret; +} + +static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct pci_controller *phb = irq_data_get_irq_chip_data(d); + + pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs); + + irq_domain_free_irqs_parent(domain, virq, nr_irqs); +} + +static const struct irq_domain_ops pseries_irq_domain_ops = { + .alloc = pseries_irq_domain_alloc, + .free = pseries_irq_domain_free, +}; + +static int __pseries_msi_allocate_domains(struct pci_controller *phb, + unsigned int count) +{ + struct irq_domain *parent = irq_get_default_host(); + + phb->fwnode = irq_domain_alloc_named_id_fwnode("pSeries-MSI", + phb->global_number); + if (!phb->fwnode) + return -ENOMEM; + + phb->dev_domain = irq_domain_create_hierarchy(parent, 0, count, + phb->fwnode, + &pseries_irq_domain_ops, phb); + if (!phb->dev_domain) { + pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n", + phb->dn, phb->global_number); + irq_domain_free_fwnode(phb->fwnode); + return -ENOMEM; + } + + phb->msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(phb->dn), + &pseries_msi_domain_info, + phb->dev_domain); + if (!phb->msi_domain) { + pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n", + phb->dn, phb->global_number); + irq_domain_free_fwnode(phb->fwnode); + irq_domain_remove(phb->dev_domain); + return -ENOMEM; + } + + return 0; +} + +int pseries_msi_allocate_domains(struct pci_controller *phb) +{ + int count; + + /* Only supported by the XIVE driver */ + if (!xive_enabled()) + return -ENODEV; + + if (!__find_pe_total_msi(phb->dn, &count)) { + pr_err("PCI: failed to find MSIs for bridge %pOF (domain %d)\n", + phb->dn, phb->global_number); + return -ENOSPC; + } + + return __pseries_msi_allocate_domains(phb, count); +} + static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) { /* No LSI -> leave MSIs (if any) configured */ diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 1f051a786fb3..d9280262588b 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -85,6 +85,7 @@ struct pci_host_bridge; int pseries_root_bridge_prepare(struct pci_host_bridge *bridge); extern struct pci_controller_ops pseries_pci_controller_ops; +int pseries_msi_allocate_domains(struct pci_controller *phb); unsigned long pseries_memory_block_size(void); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 631a0d57b6cd..35724caf8a83 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -486,6 +486,8 @@ static void __init pSeries_discover_phbs(void) /* create pci_dn's for DT nodes under this PHB */ pci_devs_phb_init_dynamic(phb); + + pseries_msi_allocate_domains(phb); } of_node_put(root); -- cgit v1.2.3-59-g8ed1b From 5690bcae186084a8544b1819f0d89399268bd0cf Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:24 +0200 Subject: powerpc/xive: Drop unmask of MSIs at startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That was a workaround in the XIVE domain because of the lack of MSI domain. This is now handled. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-7-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 823f9fe3542a..356f584cc51b 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -616,16 +616,6 @@ static unsigned int xive_irq_startup(struct irq_data *d) pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n", d->irq, hw_irq, d); -#ifdef CONFIG_PCI_MSI - /* - * The generic MSI code returns with the interrupt disabled on the - * card, using the MSI mask bits. Firmware doesn't appear to unmask - * at that level, so we do it here by hand. - */ - if (irq_data_get_msi_desc(d)) - pci_msi_unmask_irq(d); -#endif - /* Pick a target */ target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d)); if (target == XIVE_INVALID_TARGET) { -- cgit v1.2.3-59-g8ed1b From 292145a6e598c1e6633b8f5f607706b46f552ab9 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:25 +0200 Subject: powerpc/xive: Remove irqd_is_started() check when setting the affinity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the early days of XIVE support, commit cffb717ceb8e ("powerpc/xive: Ensure active irqd when setting affinity") tried to fix an issue related to interrupt migration. If the root cause was related to CPU unplug, it should have been fixed and there is no reason to keep the irqd_is_started() check. This test is also breaking affinity setting of MSIs which can set before starting the associated IRQ. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-8-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 356f584cc51b..0631c97b3a14 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -709,10 +709,6 @@ static int xive_irq_set_affinity(struct irq_data *d, if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) return -EINVAL; - /* Don't do anything if the interrupt isn't started */ - if (!irqd_is_started(d)) - return IRQ_SET_MASK_OK; - /* * If existing target is already in the new mask, and is * online then do nothing. -- cgit v1.2.3-59-g8ed1b From 07817a578a7a79638537480b8847dc7a12f293c5 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:26 +0200 Subject: powerpc/pseries/pci: Add a domain_free_irqs() handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RTAS firmware can not disable one MSI at a time. It's all or nothing. We need a custom free IRQ handler for that. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-9-clg@kaod.org --- arch/powerpc/platforms/pseries/msi.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 86c6809ebac2..591cee9cbc9e 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -529,8 +529,24 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev return rtas_prepare_msi_irqs(pdev, nvec, type, arg); } +/* + * RTAS can not disable one MSI at a time. It's all or nothing. Do it + * at the end after all IRQs have been freed. + */ +static void pseries_msi_domain_free_irqs(struct irq_domain *domain, + struct device *dev) +{ + if (WARN_ON_ONCE(!dev_is_pci(dev))) + return; + + __msi_domain_free_irqs(domain, dev); + + rtas_disable_msi(to_pci_dev(dev)); +} + static struct msi_domain_ops pseries_pci_msi_domain_ops = { .msi_prepare = pseries_msi_ops_prepare, + .domain_free_irqs = pseries_msi_domain_free_irqs, }; static void pseries_msi_shutdown(struct irq_data *d) -- cgit v1.2.3-59-g8ed1b From 9a014f456881e947bf8cdd8c984a207097e6c096 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:27 +0200 Subject: powerpc/pseries/pci: Add a msi_free() handler to clear XIVE data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The MSI domain clears the IRQ with msi_domain_free(), which calls irq_domain_free_irqs_top(), which clears the handler data. This is a problem for the XIVE controller since we need to unmap MMIO pages and free a specific XIVE structure. The 'msi_free()' handler is called before irq_domain_free_irqs_top() when the handler data is still available. Use that to clear the XIVE controller data. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-10-clg@kaod.org --- arch/powerpc/include/asm/xive.h | 1 + arch/powerpc/platforms/pseries/msi.c | 16 +++++++++++++++- arch/powerpc/sysdev/xive/common.c | 5 ++++- 3 files changed, 20 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index aa094a8655b0..20ae50ab083c 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -111,6 +111,7 @@ void xive_native_free_vp_block(u32 vp_base); int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data); void xive_cleanup_irq_data(struct xive_irq_data *xd); +void xive_irq_free_data(unsigned int virq); void xive_native_free_irq(u32 irq); int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 591cee9cbc9e..f9635b01b2bf 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -529,6 +529,19 @@ static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev return rtas_prepare_msi_irqs(pdev, nvec, type, arg); } +/* + * ->msi_free() is called before irq_domain_free_irqs_top() when the + * handler data is still available. Use that to clear the XIVE + * controller data. + */ +static void pseries_msi_ops_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, + unsigned int irq) +{ + if (xive_enabled()) + xive_irq_free_data(irq); +} + /* * RTAS can not disable one MSI at a time. It's all or nothing. Do it * at the end after all IRQs have been freed. @@ -546,6 +559,7 @@ static void pseries_msi_domain_free_irqs(struct irq_domain *domain, static struct msi_domain_ops pseries_pci_msi_domain_ops = { .msi_prepare = pseries_msi_ops_prepare, + .msi_free = pseries_msi_ops_msi_free, .domain_free_irqs = pseries_msi_domain_free_irqs, }; @@ -660,7 +674,7 @@ static void pseries_irq_domain_free(struct irq_domain *domain, unsigned int virq pr_debug("%s bridge %pOF %d #%d\n", __func__, phb->dn, virq, nr_irqs); - irq_domain_free_irqs_parent(domain, virq, nr_irqs); + /* XIVE domain data is cleared through ->msi_free() */ } static const struct irq_domain_ops pseries_irq_domain_ops = { diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 0631c97b3a14..107f442d3411 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -975,6 +975,8 @@ EXPORT_SYMBOL_GPL(is_xive_irq); void xive_cleanup_irq_data(struct xive_irq_data *xd) { + pr_debug("%s for HW %x\n", __func__, xd->hw_irq); + if (xd->eoi_mmio) { iounmap(xd->eoi_mmio); if (xd->eoi_mmio == xd->trig_mmio) @@ -1016,7 +1018,7 @@ static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) return 0; } -static void xive_irq_free_data(unsigned int virq) +void xive_irq_free_data(unsigned int virq) { struct xive_irq_data *xd = irq_get_handler_data(virq); @@ -1026,6 +1028,7 @@ static void xive_irq_free_data(unsigned int virq) xive_cleanup_irq_data(xd); kfree(xd); } +EXPORT_SYMBOL_GPL(xive_irq_free_data); #ifdef CONFIG_SMP -- cgit v1.2.3-59-g8ed1b From 174db9e7f775ce06fc6949c9abbe758b3eb8171c Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:28 +0200 Subject: powerpc/pseries/pci: Add support of MSI domains to PHB hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simply allocate or release the MSI domains when a PHB is inserted in or removed from the machine. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-11-clg@kaod.org --- arch/powerpc/platforms/pseries/msi.c | 10 ++++++++++ arch/powerpc/platforms/pseries/pci_dlpar.c | 4 ++++ arch/powerpc/platforms/pseries/pseries.h | 1 + 3 files changed, 15 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index f9635b01b2bf..e2127a3f7ebd 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -733,6 +733,16 @@ int pseries_msi_allocate_domains(struct pci_controller *phb) return __pseries_msi_allocate_domains(phb, count); } +void pseries_msi_free_domains(struct pci_controller *phb) +{ + if (phb->msi_domain) + irq_domain_remove(phb->msi_domain); + if (phb->dev_domain) + irq_domain_remove(phb->dev_domain); + if (phb->fwnode) + irq_domain_free_fwnode(phb->fwnode); +} + static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) { /* No LSI -> leave MSIs (if any) configured */ diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c index a8f9140a24fa..90c9d3531694 100644 --- a/arch/powerpc/platforms/pseries/pci_dlpar.c +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -33,6 +33,8 @@ struct pci_controller *init_phb_dynamic(struct device_node *dn) pci_devs_phb_init_dynamic(phb); + pseries_msi_allocate_domains(phb); + /* Create EEH devices for the PHB */ eeh_phb_pe_create(phb); @@ -74,6 +76,8 @@ int remove_phb_dynamic(struct pci_controller *phb) } } + pseries_msi_free_domains(phb); + /* Remove the PCI bus and unregister the bridge device from sysfs */ phb->bus = NULL; pci_remove_bus(b); diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index d9280262588b..3544778e06d0 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -86,6 +86,7 @@ int pseries_root_bridge_prepare(struct pci_host_bridge *bridge); extern struct pci_controller_ops pseries_pci_controller_ops; int pseries_msi_allocate_domains(struct pci_controller *phb); +void pseries_msi_free_domains(struct pci_controller *phb); unsigned long pseries_memory_block_size(void); -- cgit v1.2.3-59-g8ed1b From 2c50d7e99e39eba92b93210e740f3f9e5a06ba54 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:29 +0200 Subject: powerpc/powernv/pci: Introduce __pnv_pci_ioda_msi_setup() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It will be used as a 'compose_msg' handler of the MSI domain introduced later. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-12-clg@kaod.org --- arch/powerpc/platforms/powernv/pci-ioda.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 7de464679292..2922674cc934 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2016,15 +2016,17 @@ bool is_pnv_opal_msi(struct irq_chip *chip) } EXPORT_SYMBOL_GPL(is_pnv_opal_msi); -static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, - unsigned int hwirq, unsigned int virq, - unsigned int is_64, struct msi_msg *msg) +static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int xive_num, + unsigned int is_64, struct msi_msg *msg) { struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); - unsigned int xive_num = hwirq - phb->msi_base; __be32 data; int rc; + dev_dbg(&dev->dev, "%s: setup %s-bit MSI for vector #%d\n", __func__, + is_64 ? "64" : "32", xive_num); + /* No PE assigned ? bail out ... no MSI for you ! */ if (pe == NULL) return -ENXIO; @@ -2072,12 +2074,28 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, } msg->data = be32_to_cpu(data); + return 0; +} + +static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, + unsigned int hwirq, unsigned int virq, + unsigned int is_64, struct msi_msg *msg) +{ + struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); + unsigned int xive_num = hwirq - phb->msi_base; + int rc; + + rc = __pnv_pci_ioda_msi_setup(phb, dev, xive_num, is_64, msg); + if (rc) + return rc; + + /* P8 only */ pnv_set_msi_irq_chip(phb, virq); pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," " address=%x_%08x data=%x PE# %x\n", pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num, - msg->address_hi, msg->address_lo, data, pe->pe_number); + msg->address_hi, msg->address_lo, msg->data, pe->pe_number); return 0; } -- cgit v1.2.3-59-g8ed1b From 0fcfe2247e75070361af2b6845030cada92cdbf8 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:30 +0200 Subject: powerpc/powernv/pci: Add MSI domains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is very similar to the MSI domains of the pSeries platform. The MSI allocator is directly handled under the Linux PHB in the in-the-middle "PNV-MSI" domain. Only the XIVE (P9/P10) parent domain is supported for now. Support for XICS will come later. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-13-clg@kaod.org --- arch/powerpc/platforms/powernv/pci-ioda.c | 188 ++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 2922674cc934..d2a17fcb6002 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -2100,6 +2101,189 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, return 0; } +/* + * The msi_free() op is called before irq_domain_free_irqs_top() when + * the handler data is still available. Use that to clear the XIVE + * controller. + */ +static void pnv_msi_ops_msi_free(struct irq_domain *domain, + struct msi_domain_info *info, + unsigned int irq) +{ + if (xive_enabled()) + xive_irq_free_data(irq); +} + +static struct msi_domain_ops pnv_pci_msi_domain_ops = { + .msi_free = pnv_msi_ops_msi_free, +}; + +static void pnv_msi_shutdown(struct irq_data *d) +{ + d = d->parent_data; + if (d->chip->irq_shutdown) + d->chip->irq_shutdown(d); +} + +static void pnv_msi_mask(struct irq_data *d) +{ + pci_msi_mask_irq(d); + irq_chip_mask_parent(d); +} + +static void pnv_msi_unmask(struct irq_data *d) +{ + pci_msi_unmask_irq(d); + irq_chip_unmask_parent(d); +} + +static struct irq_chip pnv_pci_msi_irq_chip = { + .name = "PNV-PCI-MSI", + .irq_shutdown = pnv_msi_shutdown, + .irq_mask = pnv_msi_mask, + .irq_unmask = pnv_msi_unmask, + .irq_eoi = irq_chip_eoi_parent, +}; + +static struct msi_domain_info pnv_msi_domain_info = { + .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | + MSI_FLAG_MULTI_PCI_MSI | MSI_FLAG_PCI_MSIX), + .ops = &pnv_pci_msi_domain_ops, + .chip = &pnv_pci_msi_irq_chip, +}; + +static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg) +{ + struct msi_desc *entry = irq_data_get_msi_desc(d); + struct pci_dev *pdev = msi_desc_to_pci_dev(entry); + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + int rc; + + rc = __pnv_pci_ioda_msi_setup(phb, pdev, d->hwirq, + entry->msi_attrib.is_64, msg); + if (rc) + dev_err(&pdev->dev, "Failed to setup %s-bit MSI #%ld : %d\n", + entry->msi_attrib.is_64 ? "64" : "32", d->hwirq, rc); +} + +static struct irq_chip pnv_msi_irq_chip = { + .name = "PNV-MSI", + .irq_shutdown = pnv_msi_shutdown, + .irq_mask = irq_chip_mask_parent, + .irq_unmask = irq_chip_unmask_parent, + .irq_eoi = irq_chip_eoi_parent, + .irq_set_affinity = irq_chip_set_affinity_parent, + .irq_compose_msi_msg = pnv_msi_compose_msg, +}; + +static int pnv_irq_parent_domain_alloc(struct irq_domain *domain, + unsigned int virq, int hwirq) +{ + struct irq_fwspec parent_fwspec; + int ret; + + parent_fwspec.fwnode = domain->parent->fwnode; + parent_fwspec.param_count = 2; + parent_fwspec.param[0] = hwirq; + parent_fwspec.param[1] = IRQ_TYPE_EDGE_RISING; + + ret = irq_domain_alloc_irqs_parent(domain, virq, 1, &parent_fwspec); + if (ret) + return ret; + + return 0; +} + +static int pnv_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct pci_controller *hose = domain->host_data; + struct pnv_phb *phb = hose->private_data; + msi_alloc_info_t *info = arg; + struct pci_dev *pdev = msi_desc_to_pci_dev(info->desc); + int hwirq; + int i, ret; + + hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, nr_irqs); + if (hwirq < 0) { + dev_warn(&pdev->dev, "failed to find a free MSI\n"); + return -ENOSPC; + } + + dev_dbg(&pdev->dev, "%s bridge %pOF %d/%x #%d\n", __func__, + hose->dn, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) { + ret = pnv_irq_parent_domain_alloc(domain, virq + i, + phb->msi_base + hwirq + i); + if (ret) + goto out; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &pnv_msi_irq_chip, hose); + } + + return 0; + +out: + irq_domain_free_irqs_parent(domain, virq, i - 1); + msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, nr_irqs); + return ret; +} + +static void pnv_irq_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *d = irq_domain_get_irq_data(domain, virq); + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + + pr_debug("%s bridge %pOF %d/%lx #%d\n", __func__, hose->dn, + virq, d->hwirq, nr_irqs); + + msi_bitmap_free_hwirqs(&phb->msi_bmp, d->hwirq, nr_irqs); + /* XIVE domain is cleared through ->msi_free() */ +} + +static const struct irq_domain_ops pnv_irq_domain_ops = { + .alloc = pnv_irq_domain_alloc, + .free = pnv_irq_domain_free, +}; + +static int pnv_msi_allocate_domains(struct pci_controller *hose, unsigned int count) +{ + struct pnv_phb *phb = hose->private_data; + struct irq_domain *parent = irq_get_default_host(); + + hose->fwnode = irq_domain_alloc_named_id_fwnode("PNV-MSI", phb->opal_id); + if (!hose->fwnode) + return -ENOMEM; + + hose->dev_domain = irq_domain_create_hierarchy(parent, 0, count, + hose->fwnode, + &pnv_irq_domain_ops, hose); + if (!hose->dev_domain) { + pr_err("PCI: failed to create IRQ domain bridge %pOF (domain %d)\n", + hose->dn, hose->global_number); + irq_domain_free_fwnode(hose->fwnode); + return -ENOMEM; + } + + hose->msi_domain = pci_msi_create_irq_domain(of_node_to_fwnode(hose->dn), + &pnv_msi_domain_info, + hose->dev_domain); + if (!hose->msi_domain) { + pr_err("PCI: failed to create MSI IRQ domain bridge %pOF (domain %d)\n", + hose->dn, hose->global_number); + irq_domain_free_fwnode(hose->fwnode); + irq_domain_remove(hose->dev_domain); + return -ENOMEM; + } + + return 0; +} + static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { unsigned int count; @@ -2124,6 +2308,10 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) phb->msi32_support = 1; pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", count, phb->msi_base); + + /* Only supported by the XIVE driver */ + if (xive_enabled()) + pnv_msi_allocate_domains(phb->hose, count); } static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, -- cgit v1.2.3-59-g8ed1b From ba418a0278265ad65f2f9544e743b7dbff3b994b Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:31 +0200 Subject: KVM: PPC: Book3S HV: Use the new IRQ chip to detect passthrough interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Passthrough PCI MSI interrupts are detected in KVM with a check on a specific EOI handler (P8) or on XIVE (P9). We can now check the PCI-MSI IRQ chip which is cleaner. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-14-clg@kaod.org --- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/platforms/powernv/pci-ioda.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9f957ceee58a..1a757c3d33f7 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5355,7 +5355,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) * what our real-mode EOI code does, or a XIVE interrupt */ chip = irq_data_get_irq_chip(&desc->irq_data); - if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) { + if (!chip || !is_pnv_opal_msi(chip)) { pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n", host_irq, guest_gsi); mutex_unlock(&kvm->lock); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index d2a17fcb6002..e77caa4dbbdf 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2007,13 +2007,15 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) irq_set_chip(virq, &phb->ioda.irq_chip); } +static struct irq_chip pnv_pci_msi_irq_chip; + /* * Returns true iff chip is something that we could call * pnv_opal_pci_msi_eoi for. */ bool is_pnv_opal_msi(struct irq_chip *chip) { - return chip->irq_eoi == pnv_ioda2_msi_eoi; + return chip->irq_eoi == pnv_ioda2_msi_eoi || chip == &pnv_pci_msi_irq_chip; } EXPORT_SYMBOL_GPL(is_pnv_opal_msi); -- cgit v1.2.3-59-g8ed1b From e5e78b15113a73d0294141d9796969fa7b10fa3c Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:32 +0200 Subject: KVM: PPC: Book3S HV: XIVE: Change interface of passthrough interrupt routines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The routine kvmppc_set_passthru_irq() calls kvmppc_xive_set_mapped() and kvmppc_xive_clr_mapped() with an IRQ descriptor. Use directly the host IRQ number to remove a useless conversion. Add some debug. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-15-clg@kaod.org --- arch/powerpc/include/asm/kvm_ppc.h | 4 ++-- arch/powerpc/kvm/book3s_hv.c | 4 ++-- arch/powerpc/kvm/book3s_xive.c | 17 ++++++++--------- 3 files changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2d88944f9f34..671fbd1a765e 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -664,9 +664,9 @@ extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); extern void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu); extern int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, - struct irq_desc *host_desc); + unsigned long host_irq); extern int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, - struct irq_desc *host_desc); + unsigned long host_irq); extern u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1a757c3d33f7..05b3a3548c18 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5398,7 +5398,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) pimap->n_mapped++; if (xics_on_xive()) - rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc); + rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq); else kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); if (rc) @@ -5439,7 +5439,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) } if (xics_on_xive()) - rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc); + rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, host_irq); else kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 8cfab3547494..45a0434e9d85 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -922,13 +922,12 @@ int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) } int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, - struct irq_desc *host_desc) + unsigned long host_irq) { struct kvmppc_xive *xive = kvm->arch.xive; struct kvmppc_xive_src_block *sb; struct kvmppc_xive_irq_state *state; - struct irq_data *host_data = irq_desc_get_irq_data(host_desc); - unsigned int host_irq = irq_desc_get_irq(host_desc); + struct irq_data *host_data = irq_get_irq_data(host_irq); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data); u16 idx; u8 prio; @@ -937,7 +936,8 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, if (!xive) return -ENODEV; - pr_devel("set_mapped girq 0x%lx host HW irq 0x%x...\n",guest_irq, hw_irq); + pr_debug("%s: GIRQ 0x%lx host IRQ %ld XIVE HW IRQ 0x%x\n", + __func__, guest_irq, host_irq, hw_irq); sb = kvmppc_xive_find_source(xive, guest_irq, &idx); if (!sb) @@ -959,7 +959,7 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, */ rc = irq_set_vcpu_affinity(host_irq, state); if (rc) { - pr_err("Failed to set VCPU affinity for irq %d\n", host_irq); + pr_err("Failed to set VCPU affinity for host IRQ %ld\n", host_irq); return rc; } @@ -1019,12 +1019,11 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped); int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, - struct irq_desc *host_desc) + unsigned long host_irq) { struct kvmppc_xive *xive = kvm->arch.xive; struct kvmppc_xive_src_block *sb; struct kvmppc_xive_irq_state *state; - unsigned int host_irq = irq_desc_get_irq(host_desc); u16 idx; u8 prio; int rc; @@ -1032,7 +1031,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, if (!xive) return -ENODEV; - pr_devel("clr_mapped girq 0x%lx...\n", guest_irq); + pr_debug("%s: GIRQ 0x%lx host IRQ %ld\n", __func__, guest_irq, host_irq); sb = kvmppc_xive_find_source(xive, guest_irq, &idx); if (!sb) @@ -1059,7 +1058,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq, /* Release the passed-through interrupt to the host */ rc = irq_set_vcpu_affinity(host_irq, NULL); if (rc) { - pr_err("Failed to clr VCPU affinity for irq %d\n", host_irq); + pr_err("Failed to clr VCPU affinity for host IRQ %ld\n", host_irq); return rc; } -- cgit v1.2.3-59-g8ed1b From 51be9e51a8000ffc6a33083ceca9da9303ed4dc5 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:33 +0200 Subject: KVM: PPC: Book3S HV: XIVE: Fix mapping of passthrough interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI MSI interrupt numbers are now mapped in a PCI-MSI domain but the underlying calls handling the passthrough of the interrupt in the guest need a number in the XIVE IRQ domain. Use the IRQ data mapped in the XIVE IRQ domain and not the one in the PCI-MSI domain. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-16-clg@kaod.org --- arch/powerpc/kvm/book3s_xive.c | 3 ++- kernel/irq/irqdomain.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 45a0434e9d85..6878026ee8ec 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -927,7 +927,8 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq, struct kvmppc_xive *xive = kvm->arch.xive; struct kvmppc_xive_src_block *sb; struct kvmppc_xive_irq_state *state; - struct irq_data *host_data = irq_get_irq_data(host_irq); + struct irq_data *host_data = + irq_domain_get_irq_data(irq_get_default_host(), host_irq); unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data); u16 idx; u8 prio; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 51c483ce2447..0eee4816edac 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -491,6 +491,7 @@ struct irq_domain *irq_get_default_host(void) { return irq_default_domain; } +EXPORT_SYMBOL_GPL(irq_get_default_host); static bool irq_domain_is_nomap(struct irq_domain *domain) { -- cgit v1.2.3-59-g8ed1b From 298f6f952885eeb1f25461f085c6c238bcd9fc5e Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:34 +0200 Subject: powerpc/xics: Remove ICS list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We always had only one ICS per machine. Simplify the XICS driver by removing the ICS list. The ICS stored in the chip data of the XICS domain becomes useless and we don't need it anymore to migrate away IRQs from a CPU. This will be removed in a subsequent patch. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-17-clg@kaod.org --- arch/powerpc/sysdev/xics/xics-common.c | 45 ++++++++++++++-------------------- 1 file changed, 19 insertions(+), 26 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index b14c502e56a8..08c25748efb9 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -38,7 +38,7 @@ DEFINE_PER_CPU(struct xics_cppr, xics_cppr); struct irq_domain *xics_host; -static LIST_HEAD(ics_list); +static struct ics *xics_ics; void xics_update_irq_servers(void) { @@ -111,12 +111,11 @@ void xics_setup_cpu(void) void xics_mask_unknown_vec(unsigned int vec) { - struct ics *ics; - pr_err("Interrupt 0x%x (real) is invalid, disabling it.\n", vec); - list_for_each_entry(ics, &ics_list, link) - ics->mask_unknown(ics, vec); + if (WARN_ON(!xics_ics)) + return; + xics_ics->mask_unknown(xics_ics, vec); } @@ -198,7 +197,6 @@ void xics_migrate_irqs_away(void) struct irq_chip *chip; long server; unsigned long flags; - struct ics *ics; /* We can't set affinity on ISA interrupts */ if (virq < NR_IRQS_LEGACY) @@ -219,13 +217,10 @@ void xics_migrate_irqs_away(void) raw_spin_lock_irqsave(&desc->lock, flags); /* Locate interrupt server */ - server = -1; - ics = irq_desc_get_chip_data(desc); - if (ics) - server = ics->get_server(ics, irq); + server = xics_ics->get_server(xics_ics, irq); if (server < 0) { - printk(KERN_ERR "%s: Can't find server for irq %d\n", - __func__, irq); + pr_err("%s: Can't find server for irq %d/%x\n", + __func__, virq, irq); goto unlock; } @@ -307,13 +302,9 @@ int xics_get_irq_server(unsigned int virq, const struct cpumask *cpumask, static int xics_host_match(struct irq_domain *h, struct device_node *node, enum irq_domain_bus_token bus_token) { - struct ics *ics; - - list_for_each_entry(ics, &ics_list, link) - if (ics->host_match(ics, node)) - return 1; - - return 0; + if (WARN_ON(!xics_ics)) + return 0; + return xics_ics->host_match(xics_ics, node) ? 1 : 0; } /* Dummies */ @@ -330,8 +321,6 @@ static struct irq_chip xics_ipi_chip = { static int xics_host_map(struct irq_domain *h, unsigned int virq, irq_hw_number_t hw) { - struct ics *ics; - pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hw); /* @@ -348,12 +337,14 @@ static int xics_host_map(struct irq_domain *h, unsigned int virq, return 0; } + if (WARN_ON(!xics_ics)) + return -EINVAL; + /* Let the ICS setup the chip data */ - list_for_each_entry(ics, &ics_list, link) - if (ics->map(ics, virq) == 0) - return 0; + if (xics_ics->map(xics_ics, virq)) + return -EINVAL; - return -EINVAL; + return 0; } static int xics_host_xlate(struct irq_domain *h, struct device_node *ct, @@ -427,7 +418,9 @@ static void __init xics_init_host(void) void __init xics_register_ics(struct ics *ics) { - list_add(&ics->link, &ics_list); + if (WARN_ONCE(xics_ics, "XICS: Source Controller is already defined !")) + return; + xics_ics = ics; } static void __init xics_get_server_size(void) -- cgit v1.2.3-59-g8ed1b From 248af248a8f45461662fb633eca4adf24ae704ad Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:35 +0200 Subject: powerpc/xics: Rename the map handler in a check handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This moves the IRQ initialization done under the different ICS backends in the common part of XICS. The 'map' handler becomes a simple 'check' on the HW IRQ at the FW level. As we don't need an ICS anymore in xics_migrate_irqs_away(), the XICS domain does not set a chip data for the IRQ. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-18-clg@kaod.org --- arch/powerpc/include/asm/xics.h | 3 ++- arch/powerpc/sysdev/xics/ics-native.c | 13 +++++-------- arch/powerpc/sysdev/xics/ics-opal.c | 27 ++++++++++----------------- arch/powerpc/sysdev/xics/ics-rtas.c | 28 ++++++++++------------------ arch/powerpc/sysdev/xics/xics-common.c | 15 +++++++++------ 5 files changed, 36 insertions(+), 50 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h index d9cf192368ad..0ac9bfddf704 100644 --- a/arch/powerpc/include/asm/xics.h +++ b/arch/powerpc/include/asm/xics.h @@ -89,10 +89,11 @@ static inline int ics_opal_init(void) { return -ENODEV; } /* ICS instance, hooked up to chip_data of an irq */ struct ics { struct list_head link; - int (*map)(struct ics *ics, unsigned int virq); + int (*check)(struct ics *ics, unsigned int hwirq); void (*mask_unknown)(struct ics *ics, unsigned long vec); long (*get_server)(struct ics *ics, unsigned long vec); int (*host_match)(struct ics *ics, struct device_node *node); + struct irq_chip *chip; char data[]; }; diff --git a/arch/powerpc/sysdev/xics/ics-native.c b/arch/powerpc/sysdev/xics/ics-native.c index d450502f4053..dec7d93a8ba1 100644 --- a/arch/powerpc/sysdev/xics/ics-native.c +++ b/arch/powerpc/sysdev/xics/ics-native.c @@ -131,19 +131,15 @@ static struct irq_chip ics_native_irq_chip = { .irq_retrigger = xics_retrigger, }; -static int ics_native_map(struct ics *ics, unsigned int virq) +static int ics_native_check(struct ics *ics, unsigned int hw_irq) { - unsigned int vec = (unsigned int)virq_to_hw(virq); struct ics_native *in = to_ics_native(ics); - pr_devel("%s: vec=0x%x\n", __func__, vec); + pr_devel("%s: hw_irq=0x%x\n", __func__, hw_irq); - if (vec < in->ibase || vec >= (in->ibase + in->icount)) + if (hw_irq < in->ibase || hw_irq >= (in->ibase + in->icount)) return -EINVAL; - irq_set_chip_and_handler(virq, &ics_native_irq_chip, handle_fasteoi_irq); - irq_set_chip_data(virq, ics); - return 0; } @@ -177,10 +173,11 @@ static int ics_native_host_match(struct ics *ics, struct device_node *node) } static struct ics ics_native_template = { - .map = ics_native_map, + .check = ics_native_check, .mask_unknown = ics_native_mask_unknown, .get_server = ics_native_get_server, .host_match = ics_native_host_match, + .chip = &ics_native_irq_chip, }; static int __init ics_native_add_one(struct device_node *np) diff --git a/arch/powerpc/sysdev/xics/ics-opal.c b/arch/powerpc/sysdev/xics/ics-opal.c index 823f6c9664cd..8c7ddcc718b6 100644 --- a/arch/powerpc/sysdev/xics/ics-opal.c +++ b/arch/powerpc/sysdev/xics/ics-opal.c @@ -157,26 +157,13 @@ static struct irq_chip ics_opal_irq_chip = { .irq_retrigger = xics_retrigger, }; -static int ics_opal_map(struct ics *ics, unsigned int virq); -static void ics_opal_mask_unknown(struct ics *ics, unsigned long vec); -static long ics_opal_get_server(struct ics *ics, unsigned long vec); - static int ics_opal_host_match(struct ics *ics, struct device_node *node) { return 1; } -/* Only one global & state struct ics */ -static struct ics ics_hal = { - .map = ics_opal_map, - .mask_unknown = ics_opal_mask_unknown, - .get_server = ics_opal_get_server, - .host_match = ics_opal_host_match, -}; - -static int ics_opal_map(struct ics *ics, unsigned int virq) +static int ics_opal_check(struct ics *ics, unsigned int hw_irq) { - unsigned int hw_irq = (unsigned int)virq_to_hw(virq); int64_t rc; __be16 server; int8_t priority; @@ -189,9 +176,6 @@ static int ics_opal_map(struct ics *ics, unsigned int virq) if (rc != OPAL_SUCCESS) return -ENXIO; - irq_set_chip_and_handler(virq, &ics_opal_irq_chip, handle_fasteoi_irq); - irq_set_chip_data(virq, &ics_hal); - return 0; } @@ -222,6 +206,15 @@ static long ics_opal_get_server(struct ics *ics, unsigned long vec) return ics_opal_unmangle_server(be16_to_cpu(server)); } +/* Only one global & state struct ics */ +static struct ics ics_hal = { + .check = ics_opal_check, + .mask_unknown = ics_opal_mask_unknown, + .get_server = ics_opal_get_server, + .host_match = ics_opal_host_match, + .chip = &ics_opal_irq_chip, +}; + int __init ics_opal_init(void) { if (!firmware_has_feature(FW_FEATURE_OPAL)) diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c b/arch/powerpc/sysdev/xics/ics-rtas.c index 4cf18000f07c..6d19d711ed35 100644 --- a/arch/powerpc/sysdev/xics/ics-rtas.c +++ b/arch/powerpc/sysdev/xics/ics-rtas.c @@ -24,19 +24,6 @@ static int ibm_set_xive; static int ibm_int_on; static int ibm_int_off; -static int ics_rtas_map(struct ics *ics, unsigned int virq); -static void ics_rtas_mask_unknown(struct ics *ics, unsigned long vec); -static long ics_rtas_get_server(struct ics *ics, unsigned long vec); -static int ics_rtas_host_match(struct ics *ics, struct device_node *node); - -/* Only one global & state struct ics */ -static struct ics ics_rtas = { - .map = ics_rtas_map, - .mask_unknown = ics_rtas_mask_unknown, - .get_server = ics_rtas_get_server, - .host_match = ics_rtas_host_match, -}; - static void ics_rtas_unmask_irq(struct irq_data *d) { unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); @@ -169,9 +156,8 @@ static struct irq_chip ics_rtas_irq_chip = { .irq_retrigger = xics_retrigger, }; -static int ics_rtas_map(struct ics *ics, unsigned int virq) +static int ics_rtas_check(struct ics *ics, unsigned int hw_irq) { - unsigned int hw_irq = (unsigned int)virq_to_hw(virq); int status[2]; int rc; @@ -183,9 +169,6 @@ static int ics_rtas_map(struct ics *ics, unsigned int virq) if (rc) return -ENXIO; - irq_set_chip_and_handler(virq, &ics_rtas_irq_chip, handle_fasteoi_irq); - irq_set_chip_data(virq, &ics_rtas); - return 0; } @@ -213,6 +196,15 @@ static int ics_rtas_host_match(struct ics *ics, struct device_node *node) return !of_device_is_compatible(node, "chrp,iic"); } +/* Only one global & state struct ics */ +static struct ics ics_rtas = { + .check = ics_rtas_check, + .mask_unknown = ics_rtas_mask_unknown, + .get_server = ics_rtas_get_server, + .host_match = ics_rtas_host_match, + .chip = &ics_rtas_irq_chip, +}; + __init int ics_rtas_init(void) { ibm_get_xive = rtas_token("ibm,get-xive"); diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index 08c25748efb9..ed2bc14f56bd 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -318,10 +318,10 @@ static struct irq_chip xics_ipi_chip = { .irq_unmask = xics_ipi_unmask, }; -static int xics_host_map(struct irq_domain *h, unsigned int virq, - irq_hw_number_t hw) +static int xics_host_map(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq) { - pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hw); + pr_devel("xics: map virq %d, hwirq 0x%lx\n", virq, hwirq); /* * Mark interrupts as edge sensitive by default so that resend @@ -331,7 +331,7 @@ static int xics_host_map(struct irq_domain *h, unsigned int virq, irq_clear_status_flags(virq, IRQ_LEVEL); /* Don't call into ICS for IPIs */ - if (hw == XICS_IPI) { + if (hwirq == XICS_IPI) { irq_set_chip_and_handler(virq, &xics_ipi_chip, handle_percpu_irq); return 0; @@ -340,10 +340,13 @@ static int xics_host_map(struct irq_domain *h, unsigned int virq, if (WARN_ON(!xics_ics)) return -EINVAL; - /* Let the ICS setup the chip data */ - if (xics_ics->map(xics_ics, virq)) + if (xics_ics->check(xics_ics, hwirq)) return -EINVAL; + /* No chip data for the XICS domain */ + irq_domain_set_info(domain, virq, hwirq, xics_ics->chip, + NULL, handle_fasteoi_irq, NULL, NULL); + return 0; } -- cgit v1.2.3-59-g8ed1b From 7d14f6c60b76fa7f3f89d81a95385576ca33b483 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:36 +0200 Subject: powerpc/xics: Give a name to the default XICS IRQ domain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit and clean up the error path. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-19-clg@kaod.org --- arch/powerpc/sysdev/xics/xics-common.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index ed2bc14f56bd..18d3de2f2249 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -412,11 +412,22 @@ static const struct irq_domain_ops xics_host_ops = { .xlate = xics_host_xlate, }; -static void __init xics_init_host(void) +static int __init xics_allocate_domain(void) { - xics_host = irq_domain_add_tree(NULL, &xics_host_ops, NULL); - BUG_ON(xics_host == NULL); + struct fwnode_handle *fn; + + fn = irq_domain_alloc_named_fwnode("XICS"); + if (!fn) + return -ENOMEM; + + xics_host = irq_domain_create_tree(fn, &xics_host_ops, NULL); + if (!xics_host) { + irq_domain_free_fwnode(fn); + return -ENOMEM; + } + irq_set_default_host(xics_host); + return 0; } void __init xics_register_ics(struct ics *ics) @@ -480,6 +491,8 @@ void __init xics_init(void) /* Initialize common bits */ xics_get_server_size(); xics_update_irq_servers(); - xics_init_host(); + rc = xics_allocate_domain(); + if (rc < 0) + pr_err("XICS: Failed to create IRQ domain"); xics_setup_cpu(); } -- cgit v1.2.3-59-g8ed1b From 53b34e8db73af98fa652641bf490384dc665d0f2 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:37 +0200 Subject: powerpc/xics: Add debug logging to the set_irq_affinity handlers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It really helps to know how the HW is configured when tweaking the IRQ subsystem. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-20-clg@kaod.org --- arch/powerpc/sysdev/xics/ics-opal.c | 2 +- arch/powerpc/sysdev/xics/ics-rtas.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/sysdev/xics/ics-opal.c b/arch/powerpc/sysdev/xics/ics-opal.c index 8c7ddcc718b6..bf26cae1b982 100644 --- a/arch/powerpc/sysdev/xics/ics-opal.c +++ b/arch/powerpc/sysdev/xics/ics-opal.c @@ -133,7 +133,7 @@ static int ics_opal_set_affinity(struct irq_data *d, } server = ics_opal_mangle_server(wanted_server); - pr_devel("ics-hal: set-affinity irq %d [hw 0x%x] server: 0x%x/0x%x\n", + pr_debug("ics-hal: set-affinity irq %d [hw 0x%x] server: 0x%x/0x%x\n", d->irq, hw_irq, wanted_server, server); rc = opal_set_xive(hw_irq, server, priority); diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c b/arch/powerpc/sysdev/xics/ics-rtas.c index 6d19d711ed35..b50c6341682e 100644 --- a/arch/powerpc/sysdev/xics/ics-rtas.c +++ b/arch/powerpc/sysdev/xics/ics-rtas.c @@ -133,6 +133,9 @@ static int ics_rtas_set_affinity(struct irq_data *d, return -1; } + pr_debug("%s: irq %d [hw 0x%x] server: 0x%x\n", __func__, d->irq, + hw_irq, irq_server); + status = rtas_call_reentrant(ibm_set_xive, 3, 1, NULL, hw_irq, irq_server, xics_status[1]); -- cgit v1.2.3-59-g8ed1b From e4f0aa3b4731430ad73fb4485e97f751c7500668 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:38 +0200 Subject: powerpc/xics: Add support for IRQ domain hierarchy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XICS doesn't have any state associated with the IRQ. The support is straightforward and simpler than for XIVE. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-21-clg@kaod.org --- arch/powerpc/sysdev/xics/xics-common.c | 41 ++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index 18d3de2f2249..febab57f060f 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -406,7 +406,48 @@ int xics_retrigger(struct irq_data *data) return 0; } +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +static int xics_host_domain_translate(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *hwirq, unsigned int *type) +{ + return xics_host_xlate(d, to_of_node(fwspec->fwnode), fwspec->param, + fwspec->param_count, hwirq, type); +} + +static int xics_host_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct irq_fwspec *fwspec = arg; + irq_hw_number_t hwirq; + unsigned int type = IRQ_TYPE_NONE; + int i, rc; + + rc = xics_host_domain_translate(domain, fwspec, &hwirq, &type); + if (rc) + return rc; + + pr_debug("%s %d/%lx #%d\n", __func__, virq, hwirq, nr_irqs); + + for (i = 0; i < nr_irqs; i++) + irq_domain_set_info(domain, virq + i, hwirq + i, xics_ics->chip, + xics_ics, handle_fasteoi_irq, NULL, NULL); + + return 0; +} + +static void xics_host_domain_free(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + pr_debug("%s %d #%d\n", __func__, virq, nr_irqs); +} +#endif + static const struct irq_domain_ops xics_host_ops = { +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + .alloc = xics_host_domain_alloc, + .free = xics_host_domain_free, + .translate = xics_host_domain_translate, +#endif .match = xics_host_match, .map = xics_host_map, .xlate = xics_host_xlate, -- cgit v1.2.3-59-g8ed1b From bbb25af8fbdba4acaf955e412a84eb2eea48697c Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:39 +0200 Subject: powerpc/powernv/pci: Customize the MSI EOI handler to support PHB3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PHB3s need an extra OPAL call to EOI the interrupt. The call takes an OPAL HW IRQ number but it is translated into a vector number in OPAL. Here, we directly use the vector number of the in-the-middle "PNV-MSI" domain instead of grabbing the OPAL HW IRQ number in the XICS parent domain. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-22-clg@kaod.org --- arch/powerpc/platforms/powernv/pci-ioda.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index e77caa4dbbdf..b498876a976f 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2169,12 +2169,33 @@ static void pnv_msi_compose_msg(struct irq_data *d, struct msi_msg *msg) entry->msi_attrib.is_64 ? "64" : "32", d->hwirq, rc); } +/* + * The IRQ data is mapped in the MSI domain in which HW IRQ numbers + * correspond to vector numbers. + */ +static void pnv_msi_eoi(struct irq_data *d) +{ + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; + + if (phb->model == PNV_PHB_MODEL_PHB3) { + /* + * The EOI OPAL call takes an OPAL HW IRQ number but + * since it is translated into a vector number in + * OPAL, use that directly. + */ + WARN_ON_ONCE(opal_pci_msi_eoi(phb->opal_id, d->hwirq)); + } + + irq_chip_eoi_parent(d); +} + static struct irq_chip pnv_msi_irq_chip = { .name = "PNV-MSI", .irq_shutdown = pnv_msi_shutdown, .irq_mask = irq_chip_mask_parent, .irq_unmask = irq_chip_unmask_parent, - .irq_eoi = irq_chip_eoi_parent, + .irq_eoi = pnv_msi_eoi, .irq_set_affinity = irq_chip_set_affinity_parent, .irq_compose_msi_msg = pnv_msi_compose_msg, }; -- cgit v1.2.3-59-g8ed1b From 679e30b9536eeb93bc8c9a39c0ddc77dec536f6b Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:40 +0200 Subject: powerpc/pci: Drop XIVE restriction on MSI domains MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PowerNV and pSeries platforms now have support for both the XICS and XIVE IRQ domains. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-23-clg@kaod.org --- arch/powerpc/platforms/powernv/pci-ioda.c | 4 +--- arch/powerpc/platforms/pseries/msi.c | 4 ---- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b498876a976f..e2454439e574 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2332,9 +2332,7 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", count, phb->msi_base); - /* Only supported by the XIVE driver */ - if (xive_enabled()) - pnv_msi_allocate_domains(phb->hose, count); + pnv_msi_allocate_domains(phb->hose, count); } static void pnv_ioda_setup_pe_res(struct pnv_ioda_pe *pe, diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index e2127a3f7ebd..e196cc1b8540 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -720,10 +720,6 @@ int pseries_msi_allocate_domains(struct pci_controller *phb) { int count; - /* Only supported by the XIVE driver */ - if (!xive_enabled()) - return -ENODEV; - if (!__find_pe_total_msi(phb->dn, &count)) { pr_err("PCI: failed to find MSIs for bridge %pOF (domain %d)\n", phb->dn, phb->global_number); -- cgit v1.2.3-59-g8ed1b From 1e661f81a522eadfe4bc5bb1ec9fbae27c13f163 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:41 +0200 Subject: powerpc/xics: Drop unmask of MSIs at startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That was a workaround in the XICS domain because of the lack of MSI domain. This is now handled. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-24-clg@kaod.org --- arch/powerpc/sysdev/xics/ics-opal.c | 11 ----------- arch/powerpc/sysdev/xics/ics-rtas.c | 9 --------- 2 files changed, 20 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/sysdev/xics/ics-opal.c b/arch/powerpc/sysdev/xics/ics-opal.c index bf26cae1b982..c4d95d8beb6f 100644 --- a/arch/powerpc/sysdev/xics/ics-opal.c +++ b/arch/powerpc/sysdev/xics/ics-opal.c @@ -62,17 +62,6 @@ static void ics_opal_unmask_irq(struct irq_data *d) static unsigned int ics_opal_startup(struct irq_data *d) { -#ifdef CONFIG_PCI_MSI - /* - * The generic MSI code returns with the interrupt disabled on the - * card, using the MSI mask bits. Firmware doesn't appear to unmask - * at that level, so we do it here by hand. - */ - if (irq_data_get_msi_desc(d)) - pci_msi_unmask_irq(d); -#endif - - /* unmask it */ ics_opal_unmask_irq(d); return 0; } diff --git a/arch/powerpc/sysdev/xics/ics-rtas.c b/arch/powerpc/sysdev/xics/ics-rtas.c index b50c6341682e..b9da317b7a2d 100644 --- a/arch/powerpc/sysdev/xics/ics-rtas.c +++ b/arch/powerpc/sysdev/xics/ics-rtas.c @@ -57,15 +57,6 @@ static void ics_rtas_unmask_irq(struct irq_data *d) static unsigned int ics_rtas_startup(struct irq_data *d) { -#ifdef CONFIG_PCI_MSI - /* - * The generic MSI code returns with the interrupt disabled on the - * card, using the MSI mask bits. Firmware doesn't appear to unmask - * at that level, so we do it here by hand. - */ - if (irq_data_get_msi_desc(d)) - pci_msi_unmask_irq(d); -#endif /* unmask it */ ics_rtas_unmask_irq(d); return 0; -- cgit v1.2.3-59-g8ed1b From 3005123eea0daa18d98602ab64b2ce3ad087d849 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:42 +0200 Subject: powerpc/pseries/pci: Drop unused MSI code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MSIs should be fully managed by the PCI and IRQ subsystems now. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-25-clg@kaod.org --- arch/powerpc/platforms/pseries/msi.c | 87 ------------------------------------ 1 file changed, 87 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index e196cc1b8540..1b305e411862 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -111,21 +111,6 @@ static int rtas_query_irq_number(struct pci_dn *pdn, int offset) return rtas_ret[0]; } -static void rtas_teardown_msi_irqs(struct pci_dev *pdev) -{ - struct msi_desc *entry; - - for_each_pci_msi_entry(entry, pdev) { - if (!entry->irq) - continue; - - irq_set_msi_desc(entry->irq, NULL); - irq_dispose_mapping(entry->irq); - } - - rtas_disable_msi(pdev); -} - static int check_req(struct pci_dev *pdev, int nvec, char *prop_name) { struct device_node *dn; @@ -459,66 +444,6 @@ again: return 0; } -static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec_in, int type) -{ - struct pci_dn *pdn; - int hwirq, virq, i; - int rc; - struct msi_desc *entry; - struct msi_msg msg; - - rc = rtas_prepare_msi_irqs(pdev, nvec_in, type, NULL); - if (rc) - return rc; - - pdn = pci_get_pdn(pdev); - i = 0; - for_each_pci_msi_entry(entry, pdev) { - hwirq = rtas_query_irq_number(pdn, i++); - if (hwirq < 0) { - pr_debug("rtas_msi: error (%d) getting hwirq\n", rc); - return hwirq; - } - - /* - * Depending on the number of online CPUs in the original - * kernel, it is likely for CPU #0 to be offline in a kdump - * kernel. The associated IRQs in the affinity mappings - * provided by irq_create_affinity_masks() are thus not - * started by irq_startup(), as per-design for managed IRQs. - * This can be a problem with multi-queue block devices driven - * by blk-mq : such a non-started IRQ is very likely paired - * with the single queue enforced by blk-mq during kdump (see - * blk_mq_alloc_tag_set()). This causes the device to remain - * silent and likely hangs the guest at some point. - * - * We don't really care for fine-grained affinity when doing - * kdump actually : simply ignore the pre-computed affinity - * masks in this case and let the default mask with all CPUs - * be used when creating the IRQ mappings. - */ - if (is_kdump_kernel()) - virq = irq_create_mapping(NULL, hwirq); - else - virq = irq_create_mapping_affinity(NULL, hwirq, - entry->affinity); - - if (!virq) { - pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq); - return -ENOSPC; - } - - dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq); - irq_set_msi_desc(virq, entry); - - /* Read config space back so we can restore after reset */ - __pci_read_msi_msg(entry, &msg); - entry->msg = msg; - } - - return 0; -} - static int pseries_msi_ops_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *arg) { @@ -759,8 +684,6 @@ static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) static int rtas_msi_init(void) { - struct pci_controller *phb; - query_token = rtas_token("ibm,query-interrupt-source-number"); change_token = rtas_token("ibm,change-msi"); @@ -772,16 +695,6 @@ static int rtas_msi_init(void) pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n"); - WARN_ON(pseries_pci_controller_ops.setup_msi_irqs); - pseries_pci_controller_ops.setup_msi_irqs = rtas_setup_msi_irqs; - pseries_pci_controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs; - - list_for_each_entry(phb, &hose_list, list_node) { - WARN_ON(phb->controller_ops.setup_msi_irqs); - phb->controller_ops.setup_msi_irqs = rtas_setup_msi_irqs; - phb->controller_ops.teardown_msi_irqs = rtas_teardown_msi_irqs; - } - WARN_ON(ppc_md.pci_irq_fixup); ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup; -- cgit v1.2.3-59-g8ed1b From 6d9ba6121b1cf453985d08c141970a1b44cd9cf1 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:43 +0200 Subject: powerpc/powernv/pci: Drop unused MSI code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MSIs should be fully managed by the PCI and IRQ subsystems now. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-26-clg@kaod.org --- arch/powerpc/platforms/powernv/pci-ioda.c | 27 ------------- arch/powerpc/platforms/powernv/pci.c | 67 ------------------------------- arch/powerpc/platforms/powernv/pci.h | 6 --- 3 files changed, 100 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index e2454439e574..eb38ce1fd434 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2080,29 +2080,6 @@ static int __pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, return 0; } -static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, - unsigned int hwirq, unsigned int virq, - unsigned int is_64, struct msi_msg *msg) -{ - struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev); - unsigned int xive_num = hwirq - phb->msi_base; - int rc; - - rc = __pnv_pci_ioda_msi_setup(phb, dev, xive_num, is_64, msg); - if (rc) - return rc; - - /* P8 only */ - pnv_set_msi_irq_chip(phb, virq); - - pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d)," - " address=%x_%08x data=%x PE# %x\n", - pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num, - msg->address_hi, msg->address_lo, msg->data, pe->pe_number); - - return 0; -} - /* * The msi_free() op is called before irq_domain_free_irqs_top() when * the handler data is still available. Use that to clear the XIVE @@ -2327,8 +2304,6 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) return; } - phb->msi_setup = pnv_pci_ioda_msi_setup; - phb->msi32_support = 1; pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n", count, phb->msi_base); @@ -2936,8 +2911,6 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = { .dma_dev_setup = pnv_pci_ioda_dma_dev_setup, .dma_bus_setup = pnv_pci_ioda_dma_bus_setup, .iommu_bypass_supported = pnv_pci_ioda_iommu_bypass_supported, - .setup_msi_irqs = pnv_setup_msi_irqs, - .teardown_msi_irqs = pnv_teardown_msi_irqs, .enable_device_hook = pnv_pci_enable_device_hook, .release_device = pnv_pci_release_device, .window_alignment = pnv_pci_window_alignment, diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index 6bb3c52633fb..9a8391b983d1 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -160,73 +160,6 @@ exit: } EXPORT_SYMBOL_GPL(pnv_pci_set_power_state); -int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) -{ - struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); - struct msi_desc *entry; - struct msi_msg msg; - int hwirq; - unsigned int virq; - int rc; - - if (WARN_ON(!phb) || !phb->msi_bmp.bitmap) - return -ENODEV; - - if (pdev->no_64bit_msi && !phb->msi32_support) - return -ENODEV; - - for_each_pci_msi_entry(entry, pdev) { - if (!entry->msi_attrib.is_64 && !phb->msi32_support) { - pr_warn("%s: Supports only 64-bit MSIs\n", - pci_name(pdev)); - return -ENXIO; - } - hwirq = msi_bitmap_alloc_hwirqs(&phb->msi_bmp, 1); - if (hwirq < 0) { - pr_warn("%s: Failed to find a free MSI\n", - pci_name(pdev)); - return -ENOSPC; - } - virq = irq_create_mapping(NULL, phb->msi_base + hwirq); - if (!virq) { - pr_warn("%s: Failed to map MSI to linux irq\n", - pci_name(pdev)); - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1); - return -ENOMEM; - } - rc = phb->msi_setup(phb, pdev, phb->msi_base + hwirq, - virq, entry->msi_attrib.is_64, &msg); - if (rc) { - pr_warn("%s: Failed to setup MSI\n", pci_name(pdev)); - irq_dispose_mapping(virq); - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq, 1); - return rc; - } - irq_set_msi_desc(virq, entry); - pci_write_msi_msg(virq, &msg); - } - return 0; -} - -void pnv_teardown_msi_irqs(struct pci_dev *pdev) -{ - struct pnv_phb *phb = pci_bus_to_pnvhb(pdev->bus); - struct msi_desc *entry; - irq_hw_number_t hwirq; - - if (WARN_ON(!phb)) - return; - - for_each_pci_msi_entry(entry, pdev) { - if (!entry->irq) - continue; - hwirq = virq_to_hw(entry->irq); - irq_set_msi_desc(entry->irq, NULL); - irq_dispose_mapping(entry->irq); - msi_bitmap_free_hwirqs(&phb->msi_bmp, hwirq - phb->msi_base, 1); - } -} - /* Nicely print the contents of the PE State Tables (PEST). */ static void pnv_pci_dump_pest(__be64 pestA[], __be64 pestB[], int pest_size) { diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index c8d4f222a86f..966a9eb64339 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -123,11 +123,7 @@ struct pnv_phb { #endif unsigned int msi_base; - unsigned int msi32_support; struct msi_bitmap msi_bmp; - int (*msi_setup)(struct pnv_phb *phb, struct pci_dev *dev, - unsigned int hwirq, unsigned int virq, - unsigned int is_64, struct msi_msg *msg); int (*init_m64)(struct pnv_phb *phb); int (*get_pe_state)(struct pnv_phb *phb, int pe_no); void (*freeze_pe)(struct pnv_phb *phb, int pe_no); @@ -289,8 +285,6 @@ extern void pnv_pci_init_npu2_opencapi_phb(struct device_node *np); extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev); extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option); -extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type); -extern void pnv_teardown_msi_irqs(struct pci_dev *pdev); extern struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn); extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev); extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq); -- cgit v1.2.3-59-g8ed1b From f1a377f86f51b381cfc30bf2270f8a5f81e35ee9 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:44 +0200 Subject: powerpc/powernv/pci: Adapt is_pnv_opal_msi() to detect passthrough interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pnv_ioda2_msi_eoi() chip handler is not used anymore for MSIs. Simply use the check on the PSI-MSI chip. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-27-clg@kaod.org --- arch/powerpc/platforms/powernv/pci-ioda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index eb38ce1fd434..6c4b37598bcc 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2015,7 +2015,7 @@ static struct irq_chip pnv_pci_msi_irq_chip; */ bool is_pnv_opal_msi(struct irq_chip *chip) { - return chip->irq_eoi == pnv_ioda2_msi_eoi || chip == &pnv_pci_msi_irq_chip; + return chip == &pnv_pci_msi_irq_chip; } EXPORT_SYMBOL_GPL(is_pnv_opal_msi); -- cgit v1.2.3-59-g8ed1b From c80198a21792ac59412871e4e6fad5041c9be8e4 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:45 +0200 Subject: powerpc/xics: Fix IRQ migration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit desc->irq_data points to the top level IRQ data descriptor which is not necessarily in the XICS IRQ domain. MSIs are in another domain for instance. Fix that by looking for a mapping on the low level XICS IRQ domain. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-28-clg@kaod.org --- arch/powerpc/sysdev/xics/xics-common.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index febab57f060f..4a7687caec75 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -183,6 +183,8 @@ void xics_migrate_irqs_away(void) unsigned int irq, virq; struct irq_desc *desc; + pr_debug("%s: CPU %u\n", __func__, cpu); + /* If we used to be the default server, move to the new "boot_cpuid" */ if (hw_cpu == xics_default_server) xics_update_irq_servers(); @@ -197,6 +199,7 @@ void xics_migrate_irqs_away(void) struct irq_chip *chip; long server; unsigned long flags; + struct irq_data *irqd; /* We can't set affinity on ISA interrupts */ if (virq < NR_IRQS_LEGACY) @@ -204,9 +207,11 @@ void xics_migrate_irqs_away(void) /* We only need to migrate enabled IRQS */ if (!desc->action) continue; - if (desc->irq_data.domain != xics_host) + /* We need a mapping in the XICS IRQ domain */ + irqd = irq_domain_get_irq_data(xics_host, virq); + if (!irqd) continue; - irq = desc->irq_data.hwirq; + irq = irqd_to_hwirq(irqd); /* We need to get IPIs still. */ if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) continue; -- cgit v1.2.3-59-g8ed1b From 5cd69651ceeed15e021cf7d19f1b1be0a80c0c7a Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:46 +0200 Subject: powerpc/powernv/pci: Set the IRQ chip data for P8/CXL devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before MSI domains, the default IRQ chip of PHB3 MSIs was patched by pnv_set_msi_irq_chip() with the custom EOI handler pnv_ioda2_msi_eoi() and the owning PHB was deduced from the 'ioda.irq_chip' field. This path has been deprecated by the MSI domains but it is still in use by the P8 CAPI 'cxl' driver. Rewriting this driver to support MSI would be a waste of time. Nevertheless, we can still remove the IRQ chip patch and set the IRQ chip data instead. This is cleaner. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-29-clg@kaod.org --- arch/powerpc/platforms/powernv/pci-ioda.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 6c4b37598bcc..aa97245eedbf 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1971,19 +1971,23 @@ int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) return opal_pci_msi_eoi(phb->opal_id, hw_irq); } +/* + * The IRQ data is mapped in the XICS domain, with OPAL HW IRQ numbers + */ static void pnv_ioda2_msi_eoi(struct irq_data *d) { int64_t rc; unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); - struct irq_chip *chip = irq_data_get_irq_chip(d); + struct pci_controller *hose = irq_data_get_irq_chip_data(d); + struct pnv_phb *phb = hose->private_data; - rc = pnv_opal_pci_msi_eoi(chip, hw_irq); + rc = opal_pci_msi_eoi(phb->opal_id, hw_irq); WARN_ON_ONCE(rc); icp_native_eoi(d); } - +/* P8/CXL only */ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) { struct irq_data *idata; @@ -2005,6 +2009,7 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi; } irq_set_chip(virq, &phb->ioda.irq_chip); + irq_set_chip_data(virq, phb->hose); } static struct irq_chip pnv_pci_msi_irq_chip; -- cgit v1.2.3-59-g8ed1b From c325712b5f85e561ea89bae2ba5d0104e797e42c Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:47 +0200 Subject: powerpc/powernv/pci: Rework pnv_opal_pci_msi_eoi() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pnv_opal_pci_msi_eoi() is called from KVM to EOI passthrough interrupts when in real mode. Adding MSI domain broke the hack using the 'ioda.irq_chip' field to deduce the owning PHB. Fix that by using the IRQ chip data in the MSI domain. The 'ioda.irq_chip' field is now unused and could be removed from the pnv_phb struct. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-30-clg@kaod.org --- arch/powerpc/include/asm/pnv-pci.h | 2 +- arch/powerpc/kvm/book3s_hv_rm_xics.c | 8 ++++---- arch/powerpc/platforms/powernv/pci-ioda.c | 17 +++++++++++++---- 3 files changed, 18 insertions(+), 9 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h index d0ee0ede5767..b3f480799352 100644 --- a/arch/powerpc/include/asm/pnv-pci.h +++ b/arch/powerpc/include/asm/pnv-pci.h @@ -33,7 +33,7 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num); void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num); int pnv_cxl_get_irq_count(struct pci_dev *dev); struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev); -int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq); +int64_t pnv_opal_pci_msi_eoi(struct irq_data *d); bool is_pnv_opal_msi(struct irq_chip *chip); #ifdef CONFIG_CXL_BASE diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 0a11ec88a0ae..587c33fc4564 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -706,6 +706,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq) icp->rm_eoied_irq = irq; } + /* Handle passthrough interrupts */ if (state->host_irq) { ++vcpu->stat.pthru_all; if (state->intr_cpu != -1) { @@ -759,12 +760,12 @@ int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) static unsigned long eoi_rc; -static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) +static void icp_eoi(struct irq_data *d, u32 hwirq, __be32 xirr, bool *again) { void __iomem *xics_phys; int64_t rc; - rc = pnv_opal_pci_msi_eoi(c, hwirq); + rc = pnv_opal_pci_msi_eoi(d); if (rc) eoi_rc = rc; @@ -872,8 +873,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, icp_rm_deliver_irq(xics, icp, irq, false); /* EOI the interrupt */ - icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr, - again); + icp_eoi(irq_desc_get_irq_data(irq_map->desc), irq_map->r_hwirq, xirr, again); if (check_too_hard(xics, icp) == H_TOO_HARD) return 2; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index aa97245eedbf..2389cd79c3c8 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1963,12 +1963,21 @@ void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, pe->dma_setup_done = true; } -int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) +/* + * Called from KVM in real mode to EOI passthru interrupts. The ICP + * EOI is handled directly in KVM in kvmppc_deliver_irq_passthru(). + * + * The IRQ data is mapped in the PCI-MSI domain and the EOI OPAL call + * needs an HW IRQ number mapped in the XICS IRQ domain. The HW IRQ + * numbers of the in-the-middle MSI domain are vector numbers and it's + * good enough for OPAL. Use that. + */ +int64_t pnv_opal_pci_msi_eoi(struct irq_data *d) { - struct pnv_phb *phb = container_of(chip, struct pnv_phb, - ioda.irq_chip); + struct pci_controller *hose = irq_data_get_irq_chip_data(d->parent_data); + struct pnv_phb *phb = hose->private_data; - return opal_pci_msi_eoi(phb->opal_id, hw_irq); + return opal_pci_msi_eoi(phb->opal_id, d->parent_data->hwirq); } /* -- cgit v1.2.3-59-g8ed1b From 1753081f2d445f9157550692fcc4221cd3ff0958 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:48 +0200 Subject: KVM: PPC: Book3S HV: XICS: Fix mapping of passthrough interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI MSIs now live in an MSI domain but the underlying calls, which will EOI the interrupt in real mode, need an HW IRQ number mapped in the XICS IRQ domain. Grab it there. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-31-clg@kaod.org --- arch/powerpc/kvm/book3s_hv.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 05b3a3548c18..f28f99805c4c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -5328,6 +5328,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) struct kvmppc_passthru_irqmap *pimap; struct irq_chip *chip; int i, rc = 0; + struct irq_data *host_data; if (!kvm_irq_bypass) return 1; @@ -5392,7 +5393,14 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) * the KVM real mode handler. */ smp_wmb(); - irq_map->r_hwirq = desc->irq_data.hwirq; + + /* + * The 'host_irq' number is mapped in the PCI-MSI domain but + * the underlying calls, which will EOI the interrupt in real + * mode, need an HW IRQ number mapped in the XICS IRQ domain. + */ + host_data = irq_domain_get_irq_data(irq_get_default_host(), host_irq); + irq_map->r_hwirq = (unsigned int)irqd_to_hwirq(host_data); if (i == pimap->n_mapped) pimap->n_mapped++; @@ -5400,7 +5408,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) if (xics_on_xive()) rc = kvmppc_xive_set_mapped(kvm, guest_gsi, host_irq); else - kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); + kvmppc_xics_set_mapped(kvm, guest_gsi, irq_map->r_hwirq); if (rc) irq_map->r_hwirq = 0; -- cgit v1.2.3-59-g8ed1b From 59b2bc18b1492b46d45b6b6828ba098f09b9ba67 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 1 Jul 2021 15:27:49 +0200 Subject: powerpc/xive: Use XIVE domain under xmon and debugfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default domain of the PCI/MSIs is not the XIVE domain anymore. To list the IRQ mappings under XMON and debugfs, query the IRQ data from the low level XIVE domain. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210701132750.1475580-32-clg@kaod.org --- arch/powerpc/sysdev/xive/common.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 107f442d3411..d0deaebbfeeb 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -312,11 +312,10 @@ void xmon_xive_get_irq_all(void) struct irq_desc *desc; for_each_irq_desc(i, desc) { - struct irq_data *d = irq_desc_get_irq_data(desc); - unsigned int hwirq = (unsigned int)irqd_to_hwirq(d); + struct irq_data *d = irq_domain_get_irq_data(xive_irq_domain, i); - if (d->domain == xive_irq_domain) - xmon_xive_get_irq_config(hwirq, d); + if (d) + xmon_xive_get_irq_config(irqd_to_hwirq(d), d); } } @@ -1757,9 +1756,9 @@ static int xive_core_debug_show(struct seq_file *m, void *private) xive_debug_show_cpu(m, cpu); for_each_irq_desc(i, desc) { - struct irq_data *d = irq_desc_get_irq_data(desc); + struct irq_data *d = irq_domain_get_irq_data(xive_irq_domain, i); - if (d->domain == xive_irq_domain) + if (d) xive_debug_show_irq(m, d); } return 0; -- cgit v1.2.3-59-g8ed1b From 17df41fec5b80b16ea4774495f1eb730e2225619 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Mon, 19 Jul 2021 15:06:14 +0200 Subject: powerpc: use IRQF_NO_DEBUG for IPIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no need to use the lockup detector ("noirqdebug") for IPIs. The ipistorm benchmark measures a ~10% improvement on high systems when this flag is set. Signed-off-by: Cédric Le Goater Reviewed-by: Thomas Gleixner Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210719130614.195886-1-clg@kaod.org --- arch/powerpc/sysdev/xics/xics-common.c | 2 +- arch/powerpc/sysdev/xive/common.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index 4a7687caec75..5c1a157a83b8 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -132,7 +132,7 @@ static void xics_request_ipi(void) * IPIs are marked IRQF_PERCPU. The handler was set in map. */ BUG_ON(request_irq(ipi, icp_ops->ipi_action, - IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL)); + IRQF_NO_DEBUG | IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL)); } void __init xics_smp_probe(void) diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index d0deaebbfeeb..4018964bbd69 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -1149,7 +1149,8 @@ static int __init xive_request_ipi(void) snprintf(xid->name, sizeof(xid->name), "IPI-%d", node); ret = request_irq(xid->irq, xive_muxed_ipi_action, - IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL); + IRQF_NO_DEBUG | IRQF_PERCPU | IRQF_NO_THREAD, + xid->name, NULL); WARN(ret < 0, "Failed to request IPI %d: %d\n", xid->irq, ret); } -- cgit v1.2.3-59-g8ed1b From b68c6646cce5ee8caefa6333ee743f960222dcea Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Tue, 20 Jul 2021 15:42:08 +0200 Subject: KVM: PPC: Book3S HV: XIVE: Add a 'flags' field MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use it to hold platform specific features. P9 DD2 introduced single-escalation support. P10 will add others. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210720134209.256133-2-clg@kaod.org --- arch/powerpc/kvm/book3s_xive.c | 19 ++++++++++--------- arch/powerpc/kvm/book3s_xive.h | 9 ++++++++- arch/powerpc/kvm/book3s_xive_native.c | 12 +++++++----- 3 files changed, 25 insertions(+), 15 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 6878026ee8ec..555cc610e7ab 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -363,9 +363,9 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio) if (!vcpu->arch.xive_vcpu) continue; rc = xive_provision_queue(vcpu, prio); - if (rc == 0 && !xive->single_escalation) + if (rc == 0 && !kvmppc_xive_has_single_escalation(xive)) kvmppc_xive_attach_escalation(vcpu, prio, - xive->single_escalation); + kvmppc_xive_has_single_escalation(xive)); if (rc) return rc; } @@ -1199,7 +1199,7 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) /* Free escalations */ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { if (xc->esc_virq[i]) { - if (xc->xive->single_escalation) + if (kvmppc_xive_has_single_escalation(xc->xive)) xive_cleanup_single_escalation(vcpu, xc, xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); @@ -1340,7 +1340,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, * Enable the VP first as the single escalation mode will * affect escalation interrupts numbering */ - r = xive_native_enable_vp(xc->vp_id, xive->single_escalation); + r = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive)); if (r) { pr_err("Failed to enable VP in OPAL, err %d\n", r); goto bail; @@ -1357,15 +1357,15 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, struct xive_q *q = &xc->queues[i]; /* Single escalation, no queue 7 */ - if (i == 7 && xive->single_escalation) + if (i == 7 && kvmppc_xive_has_single_escalation(xive)) break; /* Is queue already enabled ? Provision it */ if (xive->qmap & (1 << i)) { r = xive_provision_queue(vcpu, i); - if (r == 0 && !xive->single_escalation) + if (r == 0 && !kvmppc_xive_has_single_escalation(xive)) kvmppc_xive_attach_escalation( - vcpu, i, xive->single_escalation); + vcpu, i, kvmppc_xive_has_single_escalation(xive)); if (r) goto bail; } else { @@ -1380,7 +1380,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, } /* If not done above, attach priority 0 escalation */ - r = kvmppc_xive_attach_escalation(vcpu, 0, xive->single_escalation); + r = kvmppc_xive_attach_escalation(vcpu, 0, kvmppc_xive_has_single_escalation(xive)); if (r) goto bail; @@ -2135,7 +2135,8 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) */ xive->nr_servers = KVM_MAX_VCPUS; - xive->single_escalation = xive_native_has_single_escalation(); + if (xive_native_has_single_escalation()) + xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; kvm->arch.xive = xive; return 0; diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index afe9eeac6d56..73c3cd25093c 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -97,6 +97,8 @@ struct kvmppc_xive_ops { int (*reset_mapped)(struct kvm *kvm, unsigned long guest_irq); }; +#define KVMPPC_XIVE_FLAG_SINGLE_ESCALATION 0x1 + struct kvmppc_xive { struct kvm *kvm; struct kvm_device *dev; @@ -133,7 +135,7 @@ struct kvmppc_xive { u32 q_page_order; /* Flags */ - u8 single_escalation; + u8 flags; /* Number of entries in the VP block */ u32 nr_servers; @@ -308,5 +310,10 @@ void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp); int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr); +static inline bool kvmppc_xive_has_single_escalation(struct kvmppc_xive *xive) +{ + return xive->flags & KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; +} + #endif /* CONFIG_KVM_XICS */ #endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 573ecaab3597..2abb1358a268 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -93,7 +93,7 @@ void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu) for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) { /* Free the escalation irq */ if (xc->esc_virq[i]) { - if (xc->xive->single_escalation) + if (kvmppc_xive_has_single_escalation(xc->xive)) xive_cleanup_single_escalation(vcpu, xc, xc->esc_virq[i]); free_irq(xc->esc_virq[i], vcpu); @@ -172,7 +172,7 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, * Enable the VP first as the single escalation mode will * affect escalation interrupts numbering */ - rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation); + rc = xive_native_enable_vp(xc->vp_id, kvmppc_xive_has_single_escalation(xive)); if (rc) { pr_err("Failed to enable VP in OPAL: %d\n", rc); goto bail; @@ -693,7 +693,7 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, } rc = kvmppc_xive_attach_escalation(vcpu, priority, - xive->single_escalation); + kvmppc_xive_has_single_escalation(xive)); error: if (rc) kvmppc_xive_native_cleanup_queue(vcpu, priority); @@ -820,7 +820,7 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive) for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) { /* Single escalation, no queue 7 */ - if (prio == 7 && xive->single_escalation) + if (prio == 7 && kvmppc_xive_has_single_escalation(xive)) break; if (xc->esc_virq[prio]) { @@ -1111,7 +1111,9 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) */ xive->nr_servers = KVM_MAX_VCPUS; - xive->single_escalation = xive_native_has_single_escalation(); + if (xive_native_has_single_escalation()) + xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; + xive->ops = &kvmppc_xive_native_ops; kvm->arch.xive = xive; -- cgit v1.2.3-59-g8ed1b From f5af0a978776b710f16dc99a85496b1e760bf9e0 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Tue, 20 Jul 2021 15:42:09 +0200 Subject: KVM: PPC: Book3S HV: XIVE: Add support for automatic save-restore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On P10, the feature doing an automatic "save & restore" of a VCPU interrupt context is set by default in OPAL. When a VP context is pulled out, the state of the interrupt registers are saved by the XIVE interrupt controller under the internal NVP structure representing the VP. This saves a costly store/load in guest entries and exits. If OPAL advertises the "save & restore" feature in the device tree, it should also have set the 'H' bit in the CAM line. Check that when vCPUs are connected to their ICP in KVM before going any further. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210720134209.256133-3-clg@kaod.org --- arch/powerpc/include/asm/xive-regs.h | 3 +++ arch/powerpc/include/asm/xive.h | 1 + arch/powerpc/kvm/book3s_xive.c | 34 ++++++++++++++++++++++++++++++++-- arch/powerpc/kvm/book3s_xive.h | 2 ++ arch/powerpc/kvm/book3s_xive_native.c | 9 +++++++++ arch/powerpc/sysdev/xive/native.c | 10 ++++++++++ 6 files changed, 57 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h index 8b211faa0e42..cf8bb6ac4463 100644 --- a/arch/powerpc/include/asm/xive-regs.h +++ b/arch/powerpc/include/asm/xive-regs.h @@ -80,10 +80,13 @@ #define TM_QW0W2_VU PPC_BIT32(0) #define TM_QW0W2_LOGIC_SERV PPC_BITMASK32(1,31) // XX 2,31 ? #define TM_QW1W2_VO PPC_BIT32(0) +#define TM_QW1W2_HO PPC_BIT32(1) /* P10 XIVE2 */ #define TM_QW1W2_OS_CAM PPC_BITMASK32(8,31) #define TM_QW2W2_VP PPC_BIT32(0) +#define TM_QW2W2_HP PPC_BIT32(1) /* P10 XIVE2 */ #define TM_QW2W2_POOL_CAM PPC_BITMASK32(8,31) #define TM_QW3W2_VT PPC_BIT32(0) +#define TM_QW3W2_HT PPC_BIT32(1) /* P10 XIVE2 */ #define TM_QW3W2_LP PPC_BIT32(6) #define TM_QW3W2_LE PPC_BIT32(7) #define TM_QW3W2_T PPC_BIT32(31) diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h index 20ae50ab083c..92930b0b5d0e 100644 --- a/arch/powerpc/include/asm/xive.h +++ b/arch/powerpc/include/asm/xive.h @@ -126,6 +126,7 @@ int xive_native_enable_vp(u32 vp_id, bool single_escalation); int xive_native_disable_vp(u32 vp_id); int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id); bool xive_native_has_single_escalation(void); +bool xive_native_has_save_restore(void); int xive_native_get_queue_info(u32 vp_id, uint32_t prio, u64 *out_qpage, diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 555cc610e7ab..912c1e9eef6b 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -59,6 +59,25 @@ */ #define XIVE_Q_GAP 2 +static bool kvmppc_xive_vcpu_has_save_restore(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + + /* Check enablement at VP level */ + return xc->vp_cam & TM_QW1W2_HO; +} + +bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu) +{ + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; + struct kvmppc_xive *xive = xc->xive; + + if (xive->flags & KVMPPC_XIVE_FLAG_SAVE_RESTORE) + return kvmppc_xive_vcpu_has_save_restore(vcpu); + + return true; +} + /* * Push a vcpu's context to the XIVE on guest entry. * This assumes we are in virtual mode (MMU on) @@ -77,7 +96,8 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) return; eieio(); - __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); + if (!kvmppc_xive_vcpu_has_save_restore(vcpu)) + __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS); __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2); vcpu->arch.xive_pushed = 1; eieio(); @@ -149,7 +169,8 @@ void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) /* First load to pull the context, we ignore the value */ __raw_readl(tima + TM_SPC_PULL_OS_CTX); /* Second load to recover the context state (Words 0 and 1) */ - vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS); + if (!kvmppc_xive_vcpu_has_save_restore(vcpu)) + vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS); /* Fixup some of the state for the next load */ vcpu->arch.xive_saved_state.lsmfb = 0; @@ -1319,6 +1340,12 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev, if (r) goto bail; + if (!kvmppc_xive_check_save_restore(vcpu)) { + pr_err("inconsistent save-restore setup for VCPU %d\n", cpu); + r = -EIO; + goto bail; + } + /* Configure VCPU fields for use by assembly push/pull */ vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000); vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO); @@ -2138,6 +2165,9 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type) if (xive_native_has_single_escalation()) xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; + if (xive_native_has_save_restore()) + xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE; + kvm->arch.xive = xive; return 0; } diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h index 73c3cd25093c..e6a9651c6f1e 100644 --- a/arch/powerpc/kvm/book3s_xive.h +++ b/arch/powerpc/kvm/book3s_xive.h @@ -98,6 +98,7 @@ struct kvmppc_xive_ops { }; #define KVMPPC_XIVE_FLAG_SINGLE_ESCALATION 0x1 +#define KVMPPC_XIVE_FLAG_SAVE_RESTORE 0x2 struct kvmppc_xive { struct kvm *kvm; @@ -309,6 +310,7 @@ void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu, struct kvmppc_xive_vcpu *xc, int irq); int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp); int kvmppc_xive_set_nr_servers(struct kvmppc_xive *xive, u64 addr); +bool kvmppc_xive_check_save_restore(struct kvm_vcpu *vcpu); static inline bool kvmppc_xive_has_single_escalation(struct kvmppc_xive *xive) { diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 2abb1358a268..af65ea21bde7 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -168,6 +168,12 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev, goto bail; } + if (!kvmppc_xive_check_save_restore(vcpu)) { + pr_err("inconsistent save-restore setup for VCPU %d\n", server_num); + rc = -EIO; + goto bail; + } + /* * Enable the VP first as the single escalation mode will * affect escalation interrupts numbering @@ -1114,6 +1120,9 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type) if (xive_native_has_single_escalation()) xive->flags |= KVMPPC_XIVE_FLAG_SINGLE_ESCALATION; + if (xive_native_has_save_restore()) + xive->flags |= KVMPPC_XIVE_FLAG_SAVE_RESTORE; + xive->ops = &kvmppc_xive_native_ops; kvm->arch.xive = xive; diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 57e3f1540435..1aec282cd650 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -41,6 +41,7 @@ static u32 xive_queue_shift; static u32 xive_pool_vps = XIVE_INVALID_VP; static struct kmem_cache *xive_provision_cache; static bool xive_has_single_esc; +static bool xive_has_save_restore; int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) { @@ -588,6 +589,9 @@ bool __init xive_native_init(void) if (of_get_property(np, "single-escalation-support", NULL) != NULL) xive_has_single_esc = true; + if (of_get_property(np, "vp-save-restore", NULL)) + xive_has_save_restore = true; + /* Configure Thread Management areas for KVM */ for_each_possible_cpu(cpu) kvmppc_set_xive_tima(cpu, r.start, tima); @@ -752,6 +756,12 @@ bool xive_native_has_single_escalation(void) } EXPORT_SYMBOL_GPL(xive_native_has_single_escalation); +bool xive_native_has_save_restore(void) +{ + return xive_has_save_restore; +} +EXPORT_SYMBOL_GPL(xive_native_has_save_restore); + int xive_native_get_queue_info(u32 vp_id, u32 prio, u64 *out_qpage, u64 *out_qsize, -- cgit v1.2.3-59-g8ed1b From 1bce54250045443d55659b0b23be51e87ed2b919 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 2 Aug 2021 17:26:28 +0100 Subject: powerpc: Bulk conversion to generic_handle_domain_irq() Wherever possible, replace constructs that match either generic_handle_irq(irq_find_mapping()) or generic_handle_irq(irq_linear_revmap()) to a single call to generic_handle_domain_irq(). Signed-off-by: Marc Zyngier Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210802162630.2219813-13-maz@kernel.org --- arch/powerpc/platforms/4xx/uic.c | 4 +--- arch/powerpc/platforms/512x/mpc5121_ads_cpld.c | 23 ++++++++++------------- arch/powerpc/platforms/52xx/media5200.c | 9 ++++----- arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 7 ++----- arch/powerpc/platforms/82xx/pq2ads-pci-pic.c | 6 ++---- arch/powerpc/platforms/cell/interrupt.c | 8 ++------ arch/powerpc/platforms/cell/spider-pic.c | 11 +++-------- arch/powerpc/platforms/embedded6xx/hlwd-pic.c | 15 +++++++-------- arch/powerpc/platforms/powernv/opal-irqchip.c | 11 ++++------- arch/powerpc/sysdev/fsl_mpic_err.c | 11 ++++------- arch/powerpc/sysdev/fsl_msi.c | 12 ++++-------- 11 files changed, 43 insertions(+), 74 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/4xx/uic.c b/arch/powerpc/platforms/4xx/uic.c index 36fb66ce54cf..89e2587b1a59 100644 --- a/arch/powerpc/platforms/4xx/uic.c +++ b/arch/powerpc/platforms/4xx/uic.c @@ -198,7 +198,6 @@ static void uic_irq_cascade(struct irq_desc *desc) struct uic *uic = irq_desc_get_handler_data(desc); u32 msr; int src; - int subvirq; raw_spin_lock(&desc->lock); if (irqd_is_level_type(idata)) @@ -213,8 +212,7 @@ static void uic_irq_cascade(struct irq_desc *desc) src = 32 - ffs(msr); - subvirq = irq_linear_revmap(uic->irqhost, src); - generic_handle_irq(subvirq); + generic_handle_domain_irq(uic->irqhost, src); uic_irq_ret: raw_spin_lock(&desc->lock); diff --git a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c index b2981634f1f8..ea46870e5d6e 100644 --- a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c +++ b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c @@ -81,11 +81,10 @@ static struct irq_chip cpld_pic = { .irq_unmask = cpld_unmask_irq, }; -static int +static unsigned int cpld_pic_get_irq(int offset, u8 ignore, u8 __iomem *statusp, u8 __iomem *maskp) { - int cpld_irq; u8 status = in_8(statusp); u8 mask = in_8(maskp); @@ -93,28 +92,26 @@ cpld_pic_get_irq(int offset, u8 ignore, u8 __iomem *statusp, status |= (ignore | mask); if (status == 0xff) - return 0; - - cpld_irq = ffz(status) + offset; + return ~0; - return irq_linear_revmap(cpld_pic_host, cpld_irq); + return ffz(status) + offset; } static void cpld_pic_cascade(struct irq_desc *desc) { - unsigned int irq; + unsigned int hwirq; - irq = cpld_pic_get_irq(0, PCI_IGNORE, &cpld_regs->pci_status, + hwirq = cpld_pic_get_irq(0, PCI_IGNORE, &cpld_regs->pci_status, &cpld_regs->pci_mask); - if (irq) { - generic_handle_irq(irq); + if (hwirq != ~0) { + generic_handle_domain_irq(cpld_pic_host, hwirq); return; } - irq = cpld_pic_get_irq(8, MISC_IGNORE, &cpld_regs->misc_status, + hwirq = cpld_pic_get_irq(8, MISC_IGNORE, &cpld_regs->misc_status, &cpld_regs->misc_mask); - if (irq) { - generic_handle_irq(irq); + if (hwirq != ~0) { + generic_handle_domain_irq(cpld_pic_host, hwirq); return; } } diff --git a/arch/powerpc/platforms/52xx/media5200.c b/arch/powerpc/platforms/52xx/media5200.c index efb8bdecbcc7..110c444f4bc7 100644 --- a/arch/powerpc/platforms/52xx/media5200.c +++ b/arch/powerpc/platforms/52xx/media5200.c @@ -78,7 +78,7 @@ static struct irq_chip media5200_irq_chip = { static void media5200_irq_cascade(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - int sub_virq, val; + int val; u32 status, enable; /* Mask off the cascaded IRQ */ @@ -92,11 +92,10 @@ static void media5200_irq_cascade(struct irq_desc *desc) enable = in_be32(media5200_irq.regs + MEDIA5200_IRQ_STATUS); val = ffs((status & enable) >> MEDIA5200_IRQ_SHIFT); if (val) { - sub_virq = irq_linear_revmap(media5200_irq.irqhost, val - 1); - /* pr_debug("%s: virq=%i s=%.8x e=%.8x hwirq=%i subvirq=%i\n", - * __func__, virq, status, enable, val - 1, sub_virq); + generic_handle_domain_irq(media5200_irq.irqhost, val - 1); + /* pr_debug("%s: virq=%i s=%.8x e=%.8x hwirq=%i\n", + * __func__, virq, status, enable, val - 1); */ - generic_handle_irq(sub_virq); } /* Processing done; can reenable the cascade now */ diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index 3823df235f25..f862b48b4824 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -190,14 +190,11 @@ static struct irq_chip mpc52xx_gpt_irq_chip = { static void mpc52xx_gpt_irq_cascade(struct irq_desc *desc) { struct mpc52xx_gpt_priv *gpt = irq_desc_get_handler_data(desc); - int sub_virq; u32 status; status = in_be32(&gpt->regs->status) & MPC52xx_GPT_STATUS_IRQMASK; - if (status) { - sub_virq = irq_linear_revmap(gpt->irqhost, 0); - generic_handle_irq(sub_virq); - } + if (status) + generic_handle_domain_irq(gpt->irqhost, 0); } static int mpc52xx_gpt_irq_map(struct irq_domain *h, unsigned int virq, diff --git a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c index f82f75a6085c..285bfe19b798 100644 --- a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c +++ b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c @@ -91,10 +91,8 @@ static void pq2ads_pci_irq_demux(struct irq_desc *desc) break; for (bit = 0; pend != 0; ++bit, pend <<= 1) { - if (pend & 0x80000000) { - int virq = irq_linear_revmap(priv->host, bit); - generic_handle_irq(virq); - } + if (pend & 0x80000000) + generic_handle_domain_irq(priv->host, bit); } } } diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c index c0ab62ba6f16..0873a7a20271 100644 --- a/arch/powerpc/platforms/cell/interrupt.c +++ b/arch/powerpc/platforms/cell/interrupt.c @@ -106,13 +106,9 @@ static void iic_ioexc_cascade(struct irq_desc *desc) out_be64(&node_iic->iic_is, ack); /* handle them */ for (cascade = 63; cascade >= 0; cascade--) - if (bits & (0x8000000000000000UL >> cascade)) { - unsigned int cirq = - irq_linear_revmap(iic_host, + if (bits & (0x8000000000000000UL >> cascade)) + generic_handle_domain_irq(iic_host, base | cascade); - if (cirq) - generic_handle_irq(cirq); - } /* post-ack level interrupts */ ack = bits & ~IIC_ISR_EDGE_MASK; if (ack) diff --git a/arch/powerpc/platforms/cell/spider-pic.c b/arch/powerpc/platforms/cell/spider-pic.c index 210785f59271..8af75867cb42 100644 --- a/arch/powerpc/platforms/cell/spider-pic.c +++ b/arch/powerpc/platforms/cell/spider-pic.c @@ -190,16 +190,11 @@ static void spider_irq_cascade(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct spider_pic *pic = irq_desc_get_handler_data(desc); - unsigned int cs, virq; + unsigned int cs; cs = in_be32(pic->regs + TIR_CS) >> 24; - if (cs == SPIDER_IRQ_INVALID) - virq = 0; - else - virq = irq_linear_revmap(pic->host, cs); - - if (virq) - generic_handle_irq(virq); + if (cs != SPIDER_IRQ_INVALID) + generic_handle_domain_irq(pic->host, cs); chip->irq_eoi(&desc->irq_data); } diff --git a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c index a1b7f79a8a15..15396333a90b 100644 --- a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c +++ b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c @@ -108,7 +108,6 @@ static const struct irq_domain_ops hlwd_irq_domain_ops = { static unsigned int __hlwd_pic_get_irq(struct irq_domain *h) { void __iomem *io_base = h->host_data; - int irq; u32 irq_status; irq_status = in_be32(io_base + HW_BROADWAY_ICR) & @@ -116,23 +115,22 @@ static unsigned int __hlwd_pic_get_irq(struct irq_domain *h) if (irq_status == 0) return 0; /* no more IRQs pending */ - irq = __ffs(irq_status); - return irq_linear_revmap(h, irq); + return __ffs(irq_status); } static void hlwd_pic_irq_cascade(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); struct irq_domain *irq_domain = irq_desc_get_handler_data(desc); - unsigned int virq; + unsigned int hwirq; raw_spin_lock(&desc->lock); chip->irq_mask(&desc->irq_data); /* IRQ_LEVEL */ raw_spin_unlock(&desc->lock); - virq = __hlwd_pic_get_irq(irq_domain); - if (virq) - generic_handle_irq(virq); + hwirq = __hlwd_pic_get_irq(irq_domain); + if (hwirq) + generic_handle_domain_irq(irq_domain, hwirq); else pr_err("spurious interrupt!\n"); @@ -190,7 +188,8 @@ static struct irq_domain *hlwd_pic_init(struct device_node *np) unsigned int hlwd_pic_get_irq(void) { - return __hlwd_pic_get_irq(hlwd_irq_host); + unsigned int hwirq = __hlwd_pic_get_irq(hlwd_irq_host); + return hwirq ? irq_linear_revmap(hlwd_irq_host, hwirq) : 0; } /* diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index c164419e254d..d55652b5f6fa 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -46,18 +46,15 @@ void opal_handle_events(void) e = READ_ONCE(last_outstanding_events) & opal_event_irqchip.mask; again: while (e) { - int virq, hwirq; + int hwirq; hwirq = fls64(e) - 1; e &= ~BIT_ULL(hwirq); local_irq_disable(); - virq = irq_find_mapping(opal_event_irqchip.domain, hwirq); - if (virq) { - irq_enter(); - generic_handle_irq(virq); - irq_exit(); - } + irq_enter(); + generic_handle_domain_irq(opal_event_irqchip.domain, hwirq); + irq_exit(); local_irq_enable(); cond_resched(); diff --git a/arch/powerpc/sysdev/fsl_mpic_err.c b/arch/powerpc/sysdev/fsl_mpic_err.c index 5fa5fa215541..9a98bb212922 100644 --- a/arch/powerpc/sysdev/fsl_mpic_err.c +++ b/arch/powerpc/sysdev/fsl_mpic_err.c @@ -99,7 +99,6 @@ static irqreturn_t fsl_error_int_handler(int irq, void *data) struct mpic *mpic = (struct mpic *) data; u32 eisr, eimr; int errint; - unsigned int cascade_irq; eisr = mpic_fsl_err_read(mpic->err_regs, MPIC_ERR_INT_EISR); eimr = mpic_fsl_err_read(mpic->err_regs, MPIC_ERR_INT_EIMR); @@ -108,13 +107,11 @@ static irqreturn_t fsl_error_int_handler(int irq, void *data) return IRQ_NONE; while (eisr) { + int ret; errint = __builtin_clz(eisr); - cascade_irq = irq_linear_revmap(mpic->irqhost, - mpic->err_int_vecs[errint]); - WARN_ON(!cascade_irq); - if (cascade_irq) { - generic_handle_irq(cascade_irq); - } else { + ret = generic_handle_domain_irq(mpic->irqhost, + mpic->err_int_vecs[errint]); + if (WARN_ON(ret)) { eimr |= 1 << (31 - errint); mpic_fsl_err_write(mpic->err_regs, eimr); } diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 808e7118abfc..e6b06c3f8197 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -266,7 +266,6 @@ out_free: static irqreturn_t fsl_msi_cascade(int irq, void *data) { - unsigned int cascade_irq; struct fsl_msi *msi_data; int msir_index = -1; u32 msir_value = 0; @@ -279,9 +278,6 @@ static irqreturn_t fsl_msi_cascade(int irq, void *data) msir_index = cascade_data->index; - if (msir_index >= NR_MSI_REG_MAX) - cascade_irq = 0; - switch (msi_data->feature & FSL_PIC_IP_MASK) { case FSL_PIC_IP_MPIC: msir_value = fsl_msi_read(msi_data->msi_regs, @@ -305,15 +301,15 @@ static irqreturn_t fsl_msi_cascade(int irq, void *data) } while (msir_value) { + int err; intr_index = ffs(msir_value) - 1; - cascade_irq = irq_linear_revmap(msi_data->irqhost, + err = generic_handle_domain_irq(msi_data->irqhost, msi_hwirq(msi_data, msir_index, intr_index + have_shift)); - if (cascade_irq) { - generic_handle_irq(cascade_irq); + if (!err) ret = IRQ_HANDLED; - } + have_shift += intr_index + 1; msir_value = msir_value >> (intr_index + 1); } -- cgit v1.2.3-59-g8ed1b From b11748e693166679acc13c8a4328a71efe1d4a89 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Sun, 1 Aug 2021 09:38:20 +0200 Subject: powerpc: wii.dts: Reduce the size of the control area This is wrong, but needed in order to avoid overlapping ranges with the OTP area added in the next commit. A refactor of this part of the device tree is needed: according to Wiibrew[1], this area starts at 0x0d800000 and spans 0x400 bytes (that is, 0x100 32-bit registers), encompassing PIC and GPIO registers, amongst the ones already exposed in this device tree, which should become children of the control@d800000 node. [1] https://wiibrew.org/wiki/Hardware/Hollywood_Registers Signed-off-by: Emmanuel Gil Peyrot Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210801073822.12452-4-linkmauve@linkmauve.fr --- arch/powerpc/boot/dts/wii.dts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/boot/dts/wii.dts b/arch/powerpc/boot/dts/wii.dts index aaa381da1906..c5fb54f8cc02 100644 --- a/arch/powerpc/boot/dts/wii.dts +++ b/arch/powerpc/boot/dts/wii.dts @@ -216,7 +216,13 @@ control@d800100 { compatible = "nintendo,hollywood-control"; - reg = <0x0d800100 0x300>; + /* + * Both the address and length are wrong, according to + * Wiibrew this should be <0x0d800000 0x400>, but it + * requires refactoring the PIC1 and GPIO nodes before + * changing that. + */ + reg = <0x0d800100 0xa0>; }; disk@d806000 { -- cgit v1.2.3-59-g8ed1b From 562a610b4c5119034aed300f6ae212ec7a20c4b4 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Sun, 1 Aug 2021 09:38:21 +0200 Subject: powerpc: wii.dts: Expose the OTP on this platform This can be used by the newly-added nintendo-otp nvmem module. Signed-off-by: Emmanuel Gil Peyrot Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210801073822.12452-5-linkmauve@linkmauve.fr --- arch/powerpc/boot/dts/wii.dts | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/boot/dts/wii.dts b/arch/powerpc/boot/dts/wii.dts index c5fb54f8cc02..e9c945b123c6 100644 --- a/arch/powerpc/boot/dts/wii.dts +++ b/arch/powerpc/boot/dts/wii.dts @@ -219,12 +219,17 @@ /* * Both the address and length are wrong, according to * Wiibrew this should be <0x0d800000 0x400>, but it - * requires refactoring the PIC1 and GPIO nodes before - * changing that. + * requires refactoring the PIC1, GPIO and OTP nodes + * before changing that. */ reg = <0x0d800100 0xa0>; }; + otp@d8001ec { + compatible = "nintendo,hollywood-otp"; + reg = <0x0d8001ec 0x8>; + }; + disk@d806000 { compatible = "nintendo,hollywood-di"; reg = <0x0d806000 0x40>; -- cgit v1.2.3-59-g8ed1b From 140a89b7bfe65e9649c4a3678f74c32556834ec1 Mon Sep 17 00:00:00 2001 From: Emmanuel Gil Peyrot Date: Sun, 1 Aug 2021 09:38:22 +0200 Subject: powerpc: wii_defconfig: Enable OTP by default This selects the nintendo-otp module when building for this platform. Signed-off-by: Emmanuel Gil Peyrot Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210801073822.12452-6-linkmauve@linkmauve.fr --- arch/powerpc/configs/wii_defconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig index 379c171f3ddd..a0c45bf2bfb1 100644 --- a/arch/powerpc/configs/wii_defconfig +++ b/arch/powerpc/configs/wii_defconfig @@ -99,6 +99,7 @@ CONFIG_LEDS_TRIGGER_HEARTBEAT=y CONFIG_LEDS_TRIGGER_PANIC=y CONFIG_RTC_CLASS=y CONFIG_RTC_DRV_GENERIC=y +CONFIG_NVMEM_NINTENDO_OTP=y CONFIG_EXT2_FS=y CONFIG_EXT4_FS=y CONFIG_FUSE_FS=m -- cgit v1.2.3-59-g8ed1b From 3e188b1ae8807f26cc5a530a9d55f3f643fe050a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:58:30 +0530 Subject: powerpc/book3s64/radix: make tlb_single_page_flush_ceiling a debugfs entry Similar to x86/s390 add a debugfs file to tune tlb_single_page_flush_ceiling. Also add a debugfs entry for tlb_local_single_page_flush_ceiling. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132831.233794-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/book3s64/radix_tlb.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index aefc100d79a7..1fa2bc6a969e 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "internal.h" @@ -1106,8 +1107,8 @@ EXPORT_SYMBOL(radix__flush_tlb_kernel_range); * invalidating a full PID, so it has a far lower threshold to change from * individual page flushes to full-pid flushes. */ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; -static unsigned long tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; +static u32 tlb_single_page_flush_ceiling __read_mostly = 33; +static u32 tlb_local_single_page_flush_ceiling __read_mostly = POWER9_TLB_SETS_RADIX * 2; static inline void __radix__flush_tlb_range(struct mm_struct *mm, unsigned long start, unsigned long end) @@ -1524,3 +1525,14 @@ void do_h_rpt_invalidate_prt(unsigned long pid, unsigned long lpid, EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + +static int __init create_tlb_single_page_flush_ceiling(void) +{ + debugfs_create_u32("tlb_single_page_flush_ceiling", 0600, + powerpc_debugfs_root, &tlb_single_page_flush_ceiling); + debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600, + powerpc_debugfs_root, &tlb_local_single_page_flush_ceiling); + return 0; +} +late_initcall(create_tlb_single_page_flush_ceiling); + -- cgit v1.2.3-59-g8ed1b From dbf77fed8b302e87561c7c2fc06050c88f4d3120 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:58:31 +0530 Subject: powerpc: rename powerpc_debugfs_root to arch_debugfs_dir No functional change in this patch. arch_debugfs_dir is the generic kernel name declared in linux/debugfs.h for arch-specific debugfs directory. Architectures like x86/s390 already use the name. Rename powerpc specific powerpc_debugfs_root to arch_debugfs_dir. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132831.233794-2-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/debugfs.h | 13 ------------- arch/powerpc/kernel/Makefile | 3 ++- arch/powerpc/kernel/dawr.c | 3 +-- arch/powerpc/kernel/eeh.c | 16 ++++++++-------- arch/powerpc/kernel/eeh_cache.c | 4 ++-- arch/powerpc/kernel/fadump.c | 4 ++-- arch/powerpc/kernel/hw_breakpoint.c | 1 - arch/powerpc/kernel/kdebugfs.c | 14 ++++++++++++++ arch/powerpc/kernel/security.c | 16 ++++++++-------- arch/powerpc/kernel/setup-common.c | 13 ------------- arch/powerpc/kernel/setup_64.c | 1 - arch/powerpc/kernel/traps.c | 4 ++-- arch/powerpc/kvm/book3s_xics.c | 6 +++--- arch/powerpc/kvm/book3s_xive.c | 3 +-- arch/powerpc/kvm/book3s_xive_native.c | 3 +-- arch/powerpc/mm/book3s64/hash_utils.c | 4 ++-- arch/powerpc/mm/book3s64/pgtable.c | 4 ++-- arch/powerpc/mm/book3s64/radix_tlb.c | 6 +++--- arch/powerpc/mm/ptdump/bats.c | 4 ++-- arch/powerpc/mm/ptdump/segment_regs.c | 4 ++-- arch/powerpc/platforms/cell/axon_msi.c | 4 ++-- arch/powerpc/platforms/powernv/memtrace.c | 3 +-- arch/powerpc/platforms/powernv/opal-imc.c | 4 ++-- arch/powerpc/platforms/powernv/opal-lpc.c | 4 ++-- arch/powerpc/platforms/powernv/opal-xscom.c | 4 ++-- arch/powerpc/platforms/powernv/pci-ioda.c | 4 ++-- arch/powerpc/platforms/pseries/dtl.c | 4 ++-- arch/powerpc/platforms/pseries/lpar.c | 5 +++-- arch/powerpc/sysdev/xive/common.c | 3 +-- arch/powerpc/xmon/xmon.c | 6 +++--- 30 files changed, 75 insertions(+), 92 deletions(-) delete mode 100644 arch/powerpc/include/asm/debugfs.h create mode 100644 arch/powerpc/kernel/kdebugfs.c (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/debugfs.h b/arch/powerpc/include/asm/debugfs.h deleted file mode 100644 index 2c5c48571d75..000000000000 --- a/arch/powerpc/include/asm/debugfs.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _ASM_POWERPC_DEBUGFS_H -#define _ASM_POWERPC_DEBUGFS_H - -/* - * Copyright 2017, Michael Ellerman, IBM Corporation. - */ - -#include - -extern struct dentry *powerpc_debugfs_root; - -#endif /* _ASM_POWERPC_DEBUGFS_H */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index f66b63e81c3b..7be36c1e1db6 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -46,7 +46,8 @@ obj-y := cputable.o syscalls.o \ prom.o traps.o setup-common.o \ udbg.o misc.o io.o misc_$(BITS).o \ of_platform.o prom_parse.o firmware.o \ - hw_breakpoint_constraints.o interrupt.o + hw_breakpoint_constraints.o interrupt.o \ + kdebugfs.o obj-y += ptrace/ obj-$(CONFIG_PPC64) += setup_64.o \ paca.o nvram_64.o note.o diff --git a/arch/powerpc/kernel/dawr.c b/arch/powerpc/kernel/dawr.c index cdc2dccb987d..64e423d2fe0f 100644 --- a/arch/powerpc/kernel/dawr.c +++ b/arch/powerpc/kernel/dawr.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -101,7 +100,7 @@ static int __init dawr_force_setup(void) if (PVR_VER(mfspr(SPRN_PVR)) == PVR_POWER9) { /* Turn DAWR off by default, but allow admin to turn it on */ debugfs_create_file_unsafe("dawr_enable_dangerous", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &dawr_force_enable, &dawr_enable_fops); } diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 3bbdcc86d01b..e9b597ed423c 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -21,9 +21,9 @@ #include #include #include +#include #include -#include #include #include #include @@ -1901,24 +1901,24 @@ static int __init eeh_init_proc(void) proc_create_single("powerpc/eeh", 0, NULL, proc_eeh_show); #ifdef CONFIG_DEBUG_FS debugfs_create_file_unsafe("eeh_enable", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_enable_dbgfs_ops); debugfs_create_u32("eeh_max_freezes", 0600, - powerpc_debugfs_root, &eeh_max_freezes); + arch_debugfs_dir, &eeh_max_freezes); debugfs_create_bool("eeh_disable_recovery", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &eeh_debugfs_no_recover); debugfs_create_file_unsafe("eeh_dev_check", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_dev_check_fops); debugfs_create_file_unsafe("eeh_dev_break", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_dev_break_fops); debugfs_create_file_unsafe("eeh_force_recover", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_force_recover_fops); debugfs_create_file_unsafe("eeh_dev_can_recover", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_dev_can_recover_fops); eeh_cache_debugfs_init(); #endif diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c index bf3270426d82..9bdaaf7fddc9 100644 --- a/arch/powerpc/kernel/eeh_cache.c +++ b/arch/powerpc/kernel/eeh_cache.c @@ -12,8 +12,8 @@ #include #include #include +#include #include -#include #include @@ -283,6 +283,6 @@ DEFINE_SHOW_ATTRIBUTE(eeh_addr_cache); void eeh_cache_debugfs_init(void) { debugfs_create_file_unsafe("eeh_address_cache", 0400, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &eeh_addr_cache_fops); } diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index b990075285f5..b7ceb041743c 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -24,8 +24,8 @@ #include #include #include +#include -#include #include #include #include @@ -1557,7 +1557,7 @@ static void fadump_init_files(void) return; } - debugfs_create_file("fadump_region", 0444, powerpc_debugfs_root, NULL, + debugfs_create_file("fadump_region", 0444, arch_debugfs_dir, NULL, &fadump_region_fops); if (fw_dump.dump_active) { diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 21a638aff72f..91a3be14808b 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/kdebugfs.c b/arch/powerpc/kernel/kdebugfs.c new file mode 100644 index 000000000000..36d3124d5a8b --- /dev/null +++ b/arch/powerpc/kernel/kdebugfs.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include + +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + +static int __init arch_kdebugfs_init(void) +{ + arch_debugfs_dir = debugfs_create_dir("powerpc", NULL); + return 0; +} +arch_initcall(arch_kdebugfs_init); diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index cc51fa52e783..1a998490fe60 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -11,10 +11,10 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -106,7 +106,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_barrier_nospec, barrier_nospec_get, static __init int barrier_nospec_debugfs_init(void) { debugfs_create_file_unsafe("barrier_nospec", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &fops_barrier_nospec); return 0; } @@ -114,7 +114,7 @@ device_initcall(barrier_nospec_debugfs_init); static __init int security_feature_debugfs_init(void) { - debugfs_create_x64("security_features", 0400, powerpc_debugfs_root, + debugfs_create_x64("security_features", 0400, arch_debugfs_dir, &powerpc_security_features); return 0; } @@ -420,7 +420,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_stf_barrier, stf_barrier_get, stf_barrier_set, static __init int stf_barrier_debugfs_init(void) { - debugfs_create_file_unsafe("stf_barrier", 0600, powerpc_debugfs_root, + debugfs_create_file_unsafe("stf_barrier", 0600, arch_debugfs_dir, NULL, &fops_stf_barrier); return 0; } @@ -748,7 +748,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_count_cache_flush, count_cache_flush_get, static __init int count_cache_flush_debugfs_init(void) { debugfs_create_file_unsafe("count_cache_flush", 0600, - powerpc_debugfs_root, NULL, + arch_debugfs_dir, NULL, &fops_count_cache_flush); return 0; } @@ -834,9 +834,9 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_uaccess_flush, uaccess_flush_get, uaccess_flush_set static __init int rfi_flush_debugfs_init(void) { - debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush); - debugfs_create_file("entry_flush", 0600, powerpc_debugfs_root, NULL, &fops_entry_flush); - debugfs_create_file("uaccess_flush", 0600, powerpc_debugfs_root, NULL, &fops_uaccess_flush); + debugfs_create_file("rfi_flush", 0600, arch_debugfs_dir, NULL, &fops_rfi_flush); + debugfs_create_file("entry_flush", 0600, arch_debugfs_dir, NULL, &fops_entry_flush); + debugfs_create_file("uaccess_flush", 0600, arch_debugfs_dir, NULL, &fops_uaccess_flush); return 0; } device_initcall(rfi_flush_debugfs_init); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index aa9c2d01424a..b1e43b69a559 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -773,18 +772,6 @@ static int __init check_cache_coherency(void) late_initcall(check_cache_coherency); #endif /* CONFIG_CHECK_CACHE_COHERENCY */ -#ifdef CONFIG_DEBUG_FS -struct dentry *powerpc_debugfs_root; -EXPORT_SYMBOL(powerpc_debugfs_root); - -static int powerpc_debugfs_init(void) -{ - powerpc_debugfs_root = debugfs_create_dir("powerpc", NULL); - return 0; -} -arch_initcall(powerpc_debugfs_init); -#endif - void ppc_printk_progress(char *s, unsigned short hex) { pr_info("%s\n", s); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 1ff258f6c76c..eaa79a0996d1 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -32,7 +32,6 @@ #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index dfbce527c98e..c8f648727d36 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -37,10 +37,10 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -2267,7 +2267,7 @@ static int __init ppc_warn_emulated_init(void) struct ppc_emulated_entry *entries = (void *)&ppc_emulated; dir = debugfs_create_dir("emulated_instructions", - powerpc_debugfs_root); + arch_debugfs_dir); debugfs_create_u32("do_warn", 0644, dir, &ppc_warn_emulated); diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index 303e3cb096db..ebd5d920de8c 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -10,13 +10,13 @@ #include #include #include - +#include #include + #include #include #include #include -#include #include #include @@ -1024,7 +1024,7 @@ static void xics_debugfs_init(struct kvmppc_xics *xics) return; } - xics->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root, + xics->dentry = debugfs_create_file(name, 0444, arch_debugfs_dir, xics, &xics_debug_fops); pr_debug("%s: created %s\n", __func__, name); diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 912c1e9eef6b..a18db9e16ea4 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -2360,7 +2359,7 @@ static void xive_debugfs_init(struct kvmppc_xive *xive) return; } - xive->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root, + xive->dentry = debugfs_create_file(name, S_IRUGO, arch_debugfs_dir, xive, &xive_debug_fops); pr_debug("%s: created %s\n", __func__, name); diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index af65ea21bde7..99db9ac49901 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -1268,7 +1267,7 @@ static void xive_native_debugfs_init(struct kvmppc_xive *xive) return; } - xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root, + xive->dentry = debugfs_create_file(name, 0444, arch_debugfs_dir, xive, &xive_native_debug_fops); pr_debug("%s: created %s\n", __func__, name); diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index ac5720371c0d..c145776d3ae5 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -36,8 +36,8 @@ #include #include #include +#include -#include #include #include #include @@ -2072,7 +2072,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n") static int __init hash64_debugfs(void) { - debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root, NULL, + debugfs_create_file("hpt_order", 0600, arch_debugfs_dir, NULL, &fops_hpt_order); return 0; } diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 300099de553b..9e16c7b1a6c5 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -6,9 +6,9 @@ #include #include #include +#include #include -#include #include #include #include @@ -520,7 +520,7 @@ static int __init pgtable_debugfs_setup(void) * invalidated as expected. */ debugfs_create_bool("tlbie_enabled", 0600, - powerpc_debugfs_root, + arch_debugfs_dir, &tlbie_enabled); return 0; diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 1fa2bc6a969e..7724af19ed7e 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -17,7 +18,6 @@ #include #include #include -#include #include "internal.h" @@ -1529,9 +1529,9 @@ EXPORT_SYMBOL_GPL(do_h_rpt_invalidate_prt); static int __init create_tlb_single_page_flush_ceiling(void) { debugfs_create_u32("tlb_single_page_flush_ceiling", 0600, - powerpc_debugfs_root, &tlb_single_page_flush_ceiling); + arch_debugfs_dir, &tlb_single_page_flush_ceiling); debugfs_create_u32("tlb_local_single_page_flush_ceiling", 0600, - powerpc_debugfs_root, &tlb_local_single_page_flush_ceiling); + arch_debugfs_dir, &tlb_local_single_page_flush_ceiling); return 0; } late_initcall(create_tlb_single_page_flush_ceiling); diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c index c4c628b03cf8..8bf7383fb26c 100644 --- a/arch/powerpc/mm/ptdump/bats.c +++ b/arch/powerpc/mm/ptdump/bats.c @@ -7,7 +7,7 @@ */ #include -#include +#include #include #include "ptdump.h" @@ -103,7 +103,7 @@ static const struct file_operations bats_fops = { static int __init bats_init(void) { debugfs_create_file("block_address_translation", 0400, - powerpc_debugfs_root, NULL, &bats_fops); + arch_debugfs_dir, NULL, &bats_fops); return 0; } device_initcall(bats_init); diff --git a/arch/powerpc/mm/ptdump/segment_regs.c b/arch/powerpc/mm/ptdump/segment_regs.c index 565048a0c9be..9223dfb85c51 100644 --- a/arch/powerpc/mm/ptdump/segment_regs.c +++ b/arch/powerpc/mm/ptdump/segment_regs.c @@ -6,7 +6,7 @@ * This dumps the content of Segment Registers */ -#include +#include static void seg_show(struct seq_file *m, int i) { @@ -55,7 +55,7 @@ static const struct file_operations sr_fops = { static int __init sr_init(void) { - debugfs_create_file("segment_registers", 0400, powerpc_debugfs_root, + debugfs_create_file("segment_registers", 0400, arch_debugfs_dir, NULL, &sr_fops); return 0; } diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index ca2555b8a0c2..82335e364c44 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -12,8 +12,8 @@ #include #include #include +#include -#include #include #include #include @@ -480,6 +480,6 @@ void axon_msi_debug_setup(struct device_node *dn, struct axon_msic *msic) snprintf(name, sizeof(name), "msic_%d", of_node_to_nid(dn)); - debugfs_create_file(name, 0600, powerpc_debugfs_root, msic, &fops_msic); + debugfs_create_file(name, 0600, arch_debugfs_dir, msic, &fops_msic); } #endif /* DEBUG */ diff --git a/arch/powerpc/platforms/powernv/memtrace.c b/arch/powerpc/platforms/powernv/memtrace.c index 537a4daed614..877720c64515 100644 --- a/arch/powerpc/platforms/powernv/memtrace.c +++ b/arch/powerpc/platforms/powernv/memtrace.c @@ -18,7 +18,6 @@ #include #include #include -#include #include /* This enables us to keep track of the memory removed from each node. */ @@ -330,7 +329,7 @@ DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get, static int memtrace_init(void) { memtrace_debugfs_dir = debugfs_create_dir("memtrace", - powerpc_debugfs_root); + arch_debugfs_dir); debugfs_create_file("enable", 0600, memtrace_debugfs_dir, NULL, &memtrace_init_fops); diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index ba02a75c1410..05d3832019b9 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -13,11 +13,11 @@ #include #include #include +#include #include #include #include #include -#include static struct dentry *imc_debugfs_parent; @@ -56,7 +56,7 @@ static void export_imc_mode_and_cmd(struct device_node *node, u32 cb_offset; struct imc_mem_info *ptr = pmu_ptr->mem_info; - imc_debugfs_parent = debugfs_create_dir("imc", powerpc_debugfs_root); + imc_debugfs_parent = debugfs_create_dir("imc", arch_debugfs_dir); if (of_property_read_u32(node, "cb_offset", &cb_offset)) cb_offset = IMC_CNTL_BLK_OFFSET; diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index 608569082ba0..1e5d51db40f8 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -10,13 +10,13 @@ #include #include #include +#include #include #include #include #include #include -#include #include static int opal_lpc_chip_id = -1; @@ -371,7 +371,7 @@ static int opal_lpc_init_debugfs(void) if (opal_lpc_chip_id < 0) return -ENODEV; - root = debugfs_create_dir("lpc", powerpc_debugfs_root); + root = debugfs_create_dir("lpc", arch_debugfs_dir); rc |= opal_lpc_debugfs_create_type(root, "io", OPAL_LPC_IO); rc |= opal_lpc_debugfs_create_type(root, "mem", OPAL_LPC_MEM); diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c index fd510d961b8c..6b4eed2ef4fa 100644 --- a/arch/powerpc/platforms/powernv/opal-xscom.c +++ b/arch/powerpc/platforms/powernv/opal-xscom.c @@ -14,11 +14,11 @@ #include #include #include +#include #include #include #include -#include #include static u64 opal_scom_unmangle(u64 addr) @@ -189,7 +189,7 @@ static int scom_debug_init(void) if (!firmware_has_feature(FW_FEATURE_OPAL)) return 0; - root = debugfs_create_dir("scom", powerpc_debugfs_root); + root = debugfs_create_dir("scom", arch_debugfs_dir); if (!root) return -1; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 2389cd79c3c8..3dd35c327d1c 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -32,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -2475,7 +2475,7 @@ static void pnv_pci_ioda_create_dbgfs(void) phb = hose->private_data; sprintf(name, "PCI%04x", hose->global_number); - phb->dbgfs = debugfs_create_dir(name, powerpc_debugfs_root); + phb->dbgfs = debugfs_create_dir(name, arch_debugfs_dir); debugfs_create_file_unsafe("dump_diag_regs", 0200, phb->dbgfs, phb, &pnv_pci_diag_data_fops); diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c index 982f069e4c31..352af5b14a0f 100644 --- a/arch/powerpc/platforms/pseries/dtl.c +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -11,10 +11,10 @@ #include #include #include +#include #include #include #include -#include #include #include @@ -338,7 +338,7 @@ static int dtl_init(void) /* set up common debugfs structure */ - dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root); + dtl_dir = debugfs_create_dir("dtl", arch_debugfs_dir); debugfs_create_x8("dtl_event_mask", 0600, dtl_dir, &dtl_event_mask); debugfs_create_u32("dtl_buf_entries", 0400, dtl_dir, &dtl_buf_entries); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 869ef638698a..bd1fcb0881ea 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -22,6 +22,8 @@ #include #include #include +#include + #include #include #include @@ -39,7 +41,6 @@ #include #include #include -#include #include #include "pseries.h" @@ -2019,7 +2020,7 @@ static int __init vpa_debugfs_init(void) if (!firmware_has_feature(FW_FEATURE_SPLPAR)) return 0; - vpa_dir = debugfs_create_dir("vpa", powerpc_debugfs_root); + vpa_dir = debugfs_create_dir("vpa", arch_debugfs_dir); /* set up the per-cpu vpa file*/ for_each_possible_cpu(i) { diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c index 4018964bbd69..458645c7a72b 100644 --- a/arch/powerpc/sysdev/xive/common.c +++ b/arch/powerpc/sysdev/xive/common.c @@ -21,7 +21,6 @@ #include #include -#include #include #include #include @@ -1769,7 +1768,7 @@ DEFINE_SHOW_ATTRIBUTE(xive_core_debug); int xive_core_debug_init(void) { if (xive_enabled()) - debugfs_create_file("xive", 0400, powerpc_debugfs_root, + debugfs_create_file("xive", 0400, arch_debugfs_dir, NULL, &xive_core_debug_fops); return 0; } diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index da4d7f225a40..ead460b80905 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -26,8 +26,8 @@ #include #include #include +#include -#include #include #include #include @@ -4077,8 +4077,8 @@ DEFINE_SIMPLE_ATTRIBUTE(xmon_dbgfs_ops, xmon_dbgfs_get, static int __init setup_xmon_dbgfs(void) { - debugfs_create_file("xmon", 0600, powerpc_debugfs_root, NULL, - &xmon_dbgfs_ops); + debugfs_create_file("xmon", 0600, arch_debugfs_dir, NULL, + &xmon_dbgfs_ops); return 0; } device_initcall(setup_xmon_dbgfs); -- cgit v1.2.3-59-g8ed1b From 7e35ef662ca05c42dbc2f401bb76d9219dd7fd02 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:19 +0530 Subject: powerpc/pseries: rename min_common_depth to primary_domain_index No functional change in this patch. Signed-off-by: Aneesh Kumar K.V Reviewed-by: David Gibson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-2-aneesh.kumar@linux.ibm.com --- arch/powerpc/mm/numa.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 094a1076fd1f..79132744b728 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL(numa_cpu_lookup_table); EXPORT_SYMBOL(node_to_cpumask_map); EXPORT_SYMBOL(node_data); -static int min_common_depth; +static int primary_domain_index; static int n_mem_addr_cells, n_mem_size_cells; static int form1_affinity; @@ -232,8 +232,8 @@ static int associativity_to_nid(const __be32 *associativity) if (!numa_enabled) goto out; - if (of_read_number(associativity, 1) >= min_common_depth) - nid = of_read_number(&associativity[min_common_depth], 1); + if (of_read_number(associativity, 1) >= primary_domain_index) + nid = of_read_number(&associativity[primary_domain_index], 1); /* POWER4 LPAR uses 0xffff as invalid node */ if (nid == 0xffff || nid >= nr_node_ids) @@ -284,9 +284,9 @@ int of_node_to_nid(struct device_node *device) } EXPORT_SYMBOL(of_node_to_nid); -static int __init find_min_common_depth(void) +static int __init find_primary_domain_index(void) { - int depth; + int index; struct device_node *root; if (firmware_has_feature(FW_FEATURE_OPAL)) @@ -326,7 +326,7 @@ static int __init find_min_common_depth(void) } if (form1_affinity) { - depth = of_read_number(distance_ref_points, 1); + index = of_read_number(distance_ref_points, 1); } else { if (distance_ref_points_depth < 2) { printk(KERN_WARNING "NUMA: " @@ -334,7 +334,7 @@ static int __init find_min_common_depth(void) goto err; } - depth = of_read_number(&distance_ref_points[1], 1); + index = of_read_number(&distance_ref_points[1], 1); } /* @@ -348,7 +348,7 @@ static int __init find_min_common_depth(void) } of_node_put(root); - return depth; + return index; err: of_node_put(root); @@ -437,16 +437,16 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb) int nid = default_nid; int rc, index; - if ((min_common_depth < 0) || !numa_enabled) + if ((primary_domain_index < 0) || !numa_enabled) return default_nid; rc = of_get_assoc_arrays(&aa); if (rc) return default_nid; - if (min_common_depth <= aa.array_sz && + if (primary_domain_index <= aa.array_sz && !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { - index = lmb->aa_index * aa.array_sz + min_common_depth - 1; + index = lmb->aa_index * aa.array_sz + primary_domain_index - 1; nid = of_read_number(&aa.arrays[index], 1); if (nid == 0xffff || nid >= nr_node_ids) @@ -708,18 +708,18 @@ static int __init parse_numa_properties(void) return -1; } - min_common_depth = find_min_common_depth(); + primary_domain_index = find_primary_domain_index(); - if (min_common_depth < 0) { + if (primary_domain_index < 0) { /* - * if we fail to parse min_common_depth from device tree + * if we fail to parse primary_domain_index from device tree * mark the numa disabled, boot with numa disabled. */ numa_enabled = false; - return min_common_depth; + return primary_domain_index; } - dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth); + dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); /* * Even though we connect cpus to numa domains later in SMP @@ -924,7 +924,7 @@ static void __init find_possible_nodes(void) goto out; } - max_nodes = of_read_number(&domains[min_common_depth], 1); + max_nodes = of_read_number(&domains[primary_domain_index], 1); pr_info("Partition configured for %d NUMA nodes.\n", max_nodes); for (i = 0; i < max_nodes; i++) { @@ -933,7 +933,7 @@ static void __init find_possible_nodes(void) } prop_length /= sizeof(int); - if (prop_length > min_common_depth + 2) + if (prop_length > primary_domain_index + 2) coregroup_enabled = 1; out: @@ -1266,7 +1266,7 @@ int cpu_to_coregroup_id(int cpu) goto out; index = of_read_number(associativity, 1); - if (index > min_common_depth + 1) + if (index > primary_domain_index + 1) return of_read_number(&associativity[index - 1], 1); out: -- cgit v1.2.3-59-g8ed1b From 0eacd06bb8adea8dd9edb0a30144166d9f227e64 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:20 +0530 Subject: powerpc/pseries: Rename TYPE1_AFFINITY to FORM1_AFFINITY Also make related code cleanup that will allow adding FORM2_AFFINITY in later patches. No functional change in this patch. Signed-off-by: Aneesh Kumar K.V Reviewed-by: David Gibson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-3-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/firmware.h | 4 ++-- arch/powerpc/include/asm/prom.h | 2 +- arch/powerpc/kernel/prom_init.c | 2 +- arch/powerpc/mm/numa.c | 35 ++++++++++++++++++------------- arch/powerpc/platforms/pseries/firmware.c | 2 +- 5 files changed, 26 insertions(+), 19 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 7604673787d6..60b631161360 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -44,7 +44,7 @@ #define FW_FEATURE_OPAL ASM_CONST(0x0000000010000000) #define FW_FEATURE_SET_MODE ASM_CONST(0x0000000040000000) #define FW_FEATURE_BEST_ENERGY ASM_CONST(0x0000000080000000) -#define FW_FEATURE_TYPE1_AFFINITY ASM_CONST(0x0000000100000000) +#define FW_FEATURE_FORM1_AFFINITY ASM_CONST(0x0000000100000000) #define FW_FEATURE_PRRN ASM_CONST(0x0000000200000000) #define FW_FEATURE_DRMEM_V2 ASM_CONST(0x0000000400000000) #define FW_FEATURE_DRC_INFO ASM_CONST(0x0000000800000000) @@ -69,7 +69,7 @@ enum { FW_FEATURE_SPLPAR | FW_FEATURE_LPAR | FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO | FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY | - FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN | + FW_FEATURE_FORM1_AFFINITY | FW_FEATURE_PRRN | FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR | diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 324a13351749..df9fec9d232c 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -147,7 +147,7 @@ extern int of_read_drc_info_cell(struct property **prop, #define OV5_MSI 0x0201 /* PCIe/MSI support */ #define OV5_CMO 0x0480 /* Cooperative Memory Overcommitment */ #define OV5_XCMO 0x0440 /* Page Coalescing */ -#define OV5_TYPE1_AFFINITY 0x0580 /* Type 1 NUMA affinity */ +#define OV5_FORM1_AFFINITY 0x0580 /* FORM1 NUMA affinity */ #define OV5_PRRN 0x0540 /* Platform Resource Reassignment */ #define OV5_HP_EVT 0x0604 /* Hot Plug Event support */ #define OV5_RESIZE_HPT 0x0601 /* Hash Page Table resizing */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index a5bf355ce1d6..57db605ad33a 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1096,7 +1096,7 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { #else 0, #endif - .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN), + .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN), .bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT), .micro_checkpoint = 0, .reserved0 = 0, diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 79132744b728..0bad11b3e929 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -53,7 +53,10 @@ EXPORT_SYMBOL(node_data); static int primary_domain_index; static int n_mem_addr_cells, n_mem_size_cells; -static int form1_affinity; + +#define FORM0_AFFINITY 0 +#define FORM1_AFFINITY 1 +static int affinity_form; #define MAX_DISTANCE_REF_POINTS 4 static int distance_ref_points_depth; @@ -190,7 +193,7 @@ int __node_distance(int a, int b) int i; int distance = LOCAL_DISTANCE; - if (!form1_affinity) + if (affinity_form == FORM0_AFFINITY) return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); for (i = 0; i < distance_ref_points_depth; i++) { @@ -210,7 +213,7 @@ static void initialize_distance_lookup_table(int nid, { int i; - if (!form1_affinity) + if (affinity_form != FORM1_AFFINITY) return; for (i = 0; i < distance_ref_points_depth; i++) { @@ -289,6 +292,17 @@ static int __init find_primary_domain_index(void) int index; struct device_node *root; + /* + * Check for which form of affinity. + */ + if (firmware_has_feature(FW_FEATURE_OPAL)) { + affinity_form = FORM1_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { + dbg("Using form 1 affinity\n"); + affinity_form = FORM1_AFFINITY; + } else + affinity_form = FORM0_AFFINITY; + if (firmware_has_feature(FW_FEATURE_OPAL)) root = of_find_node_by_path("/ibm,opal"); else @@ -318,23 +332,16 @@ static int __init find_primary_domain_index(void) } distance_ref_points_depth /= sizeof(int); - - if (firmware_has_feature(FW_FEATURE_OPAL) || - firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) { - dbg("Using form 1 affinity\n"); - form1_affinity = 1; - } - - if (form1_affinity) { - index = of_read_number(distance_ref_points, 1); - } else { + if (affinity_form == FORM0_AFFINITY) { if (distance_ref_points_depth < 2) { printk(KERN_WARNING "NUMA: " - "short ibm,associativity-reference-points\n"); + "short ibm,associativity-reference-points\n"); goto err; } index = of_read_number(&distance_ref_points[1], 1); + } else { + index = of_read_number(distance_ref_points, 1); } /* diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index 4c7b7f5a2ebc..5d4c2bc20bba 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -119,7 +119,7 @@ struct vec5_fw_feature { static __initdata struct vec5_fw_feature vec5_fw_features_table[] = { - {FW_FEATURE_TYPE1_AFFINITY, OV5_TYPE1_AFFINITY}, + {FW_FEATURE_FORM1_AFFINITY, OV5_FORM1_AFFINITY}, {FW_FEATURE_PRRN, OV5_PRRN}, {FW_FEATURE_DRMEM_V2, OV5_DRMEM_V2}, {FW_FEATURE_DRC_INFO, OV5_DRC_INFO}, -- cgit v1.2.3-59-g8ed1b From 8ddc6448ec5a5ef50eaa581a7dec0e12a02850ff Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:21 +0530 Subject: powerpc/pseries: Consolidate different NUMA distance update code paths The associativity details of the newly added resourced are collected from the hypervisor via "ibm,configure-connector" rtas call. Update the numa distance details of the newly added numa node after the above call. Instead of updating NUMA distance every time we lookup a node id from the associativity property, add helpers that can be used during boot which does this only once. Also remove the distance update from node id lookup helpers. Currently, we duplicate parsing code for ibm,associativity and ibm,associativity-lookup-arrays in the kernel. The associativity array provided by these device tree properties are very similar and hence can use a helper to parse the node id and numa distance details. Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-4-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/topology.h | 2 + arch/powerpc/mm/numa.c | 212 +++++++++++++++++------- arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 + arch/powerpc/platforms/pseries/hotplug-memory.c | 2 + 4 files changed, 161 insertions(+), 57 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index e4db64c0e184..a6425a70c37b 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -64,6 +64,7 @@ static inline int early_cpu_to_node(int cpu) } int of_drconf_to_nid_single(struct drmem_lmb *lmb); +void update_numa_distance(struct device_node *node); #else @@ -93,6 +94,7 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb) return first_online_node; } +static inline void update_numa_distance(struct device_node *node) {} #endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 0bad11b3e929..3cda37c52f26 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -208,50 +208,35 @@ int __node_distance(int a, int b) } EXPORT_SYMBOL(__node_distance); -static void initialize_distance_lookup_table(int nid, - const __be32 *associativity) +static int __associativity_to_nid(const __be32 *associativity, + int max_array_sz) { - int i; + int nid; + /* + * primary_domain_index is 1 based array index. + */ + int index = primary_domain_index - 1; - if (affinity_form != FORM1_AFFINITY) - return; + if (!numa_enabled || index >= max_array_sz) + return NUMA_NO_NODE; - for (i = 0; i < distance_ref_points_depth; i++) { - const __be32 *entry; + nid = of_read_number(&associativity[index], 1); - entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1]; - distance_lookup_table[nid][i] = of_read_number(entry, 1); - } + /* POWER4 LPAR uses 0xffff as invalid node */ + if (nid == 0xffff || nid >= nr_node_ids) + nid = NUMA_NO_NODE; + return nid; } - /* * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA * info is found. */ static int associativity_to_nid(const __be32 *associativity) { - int nid = NUMA_NO_NODE; - - if (!numa_enabled) - goto out; - - if (of_read_number(associativity, 1) >= primary_domain_index) - nid = of_read_number(&associativity[primary_domain_index], 1); - - /* POWER4 LPAR uses 0xffff as invalid node */ - if (nid == 0xffff || nid >= nr_node_ids) - nid = NUMA_NO_NODE; - - if (nid > 0 && - of_read_number(associativity, 1) >= distance_ref_points_depth) { - /* - * Skip the length field and send start of associativity array - */ - initialize_distance_lookup_table(nid, associativity + 1); - } + int array_sz = of_read_number(associativity, 1); -out: - return nid; + /* Skip the first element in the associativity array */ + return __associativity_to_nid((associativity + 1), array_sz); } /* Returns the nid associated with the given device tree node, @@ -287,6 +272,60 @@ int of_node_to_nid(struct device_node *device) } EXPORT_SYMBOL(of_node_to_nid); +static void __initialize_form1_numa_distance(const __be32 *associativity, + int max_array_sz) +{ + int i, nid; + + if (affinity_form != FORM1_AFFINITY) + return; + + nid = __associativity_to_nid(associativity, max_array_sz); + if (nid != NUMA_NO_NODE) { + for (i = 0; i < distance_ref_points_depth; i++) { + const __be32 *entry; + int index = be32_to_cpu(distance_ref_points[i]) - 1; + + /* + * broken hierarchy, return with broken distance table + */ + if (WARN(index >= max_array_sz, "Broken ibm,associativity property")) + return; + + entry = &associativity[index]; + distance_lookup_table[nid][i] = of_read_number(entry, 1); + } + } +} + +static void initialize_form1_numa_distance(const __be32 *associativity) +{ + int array_sz; + + array_sz = of_read_number(associativity, 1); + /* Skip the first element in the associativity array */ + __initialize_form1_numa_distance(associativity + 1, array_sz); +} + +/* + * Used to update distance information w.r.t newly added node. + */ +void update_numa_distance(struct device_node *node) +{ + if (affinity_form == FORM0_AFFINITY) + return; + else if (affinity_form == FORM1_AFFINITY) { + const __be32 *associativity; + + associativity = of_get_associativity(node); + if (!associativity) + return; + + initialize_form1_numa_distance(associativity); + return; + } +} + static int __init find_primary_domain_index(void) { int index; @@ -433,6 +472,38 @@ static int of_get_assoc_arrays(struct assoc_arrays *aa) return 0; } +static int get_nid_and_numa_distance(struct drmem_lmb *lmb) +{ + struct assoc_arrays aa = { .arrays = NULL }; + int default_nid = NUMA_NO_NODE; + int nid = default_nid; + int rc, index; + + if ((primary_domain_index < 0) || !numa_enabled) + return default_nid; + + rc = of_get_assoc_arrays(&aa); + if (rc) + return default_nid; + + if (primary_domain_index <= aa.array_sz && + !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { + const __be32 *associativity; + + index = lmb->aa_index * aa.array_sz; + associativity = &aa.arrays[index]; + nid = __associativity_to_nid(associativity, aa.array_sz); + if (nid > 0 && affinity_form == FORM1_AFFINITY) { + /* + * lookup array associativity entries have + * no length of the array as the first element. + */ + __initialize_form1_numa_distance(associativity, aa.array_sz); + } + } + return nid; +} + /* * This is like of_node_to_nid_single() for memory represented in the * ibm,dynamic-reconfiguration-memory node. @@ -453,26 +524,19 @@ int of_drconf_to_nid_single(struct drmem_lmb *lmb) if (primary_domain_index <= aa.array_sz && !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) { - index = lmb->aa_index * aa.array_sz + primary_domain_index - 1; - nid = of_read_number(&aa.arrays[index], 1); - - if (nid == 0xffff || nid >= nr_node_ids) - nid = default_nid; + const __be32 *associativity; - if (nid > 0) { - index = lmb->aa_index * aa.array_sz; - initialize_distance_lookup_table(nid, - &aa.arrays[index]); - } + index = lmb->aa_index * aa.array_sz; + associativity = &aa.arrays[index]; + nid = __associativity_to_nid(associativity, aa.array_sz); } - return nid; } #ifdef CONFIG_PPC_SPLPAR -static int vphn_get_nid(long lcpu) + +static int __vphn_get_associativity(long lcpu, __be32 *associativity) { - __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; long rc, hwid; /* @@ -492,12 +556,30 @@ static int vphn_get_nid(long lcpu) rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity); if (rc == H_SUCCESS) - return associativity_to_nid(associativity); + return 0; } + return -1; +} + +static int vphn_get_nid(long lcpu) +{ + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + + + if (!__vphn_get_associativity(lcpu, associativity)) + return associativity_to_nid(associativity); + return NUMA_NO_NODE; + } #else + +static int __vphn_get_associativity(long lcpu, __be32 *associativity) +{ + return -1; +} + static int vphn_get_nid(long unused) { return NUMA_NO_NODE; @@ -692,7 +774,7 @@ static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb, size = read_n_cells(n_mem_size_cells, usm); } - nid = of_drconf_to_nid_single(lmb); + nid = get_nid_and_numa_distance(lmb); fake_numa_create_new_node(((base + size) >> PAGE_SHIFT), &nid); node_set_online(nid); @@ -709,6 +791,7 @@ static int __init parse_numa_properties(void) struct device_node *memory; int default_nid = 0; unsigned long i; + const __be32 *associativity; if (numa_enabled == 0) { printk(KERN_WARNING "NUMA disabled by user\n"); @@ -734,18 +817,30 @@ static int __init parse_numa_properties(void) * each node to be onlined must have NODE_DATA etc backing it. */ for_each_present_cpu(i) { + __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE]; struct device_node *cpu; - int nid = vphn_get_nid(i); + int nid = NUMA_NO_NODE; - /* - * Don't fall back to default_nid yet -- we will plug - * cpus into nodes once the memory scan has discovered - * the topology. - */ - if (nid == NUMA_NO_NODE) { + memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32)); + + if (__vphn_get_associativity(i, vphn_assoc) == 0) { + nid = associativity_to_nid(vphn_assoc); + initialize_form1_numa_distance(vphn_assoc); + } else { + + /* + * Don't fall back to default_nid yet -- we will plug + * cpus into nodes once the memory scan has discovered + * the topology. + */ cpu = of_get_cpu_node(i, NULL); BUG_ON(!cpu); - nid = of_node_to_nid_single(cpu); + + associativity = of_get_associativity(cpu); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } of_node_put(cpu); } @@ -781,8 +876,11 @@ new_range: * have associativity properties. If none, then * everything goes to default_nid. */ - nid = of_node_to_nid_single(memory); - if (nid < 0) + associativity = of_get_associativity(memory); + if (associativity) { + nid = associativity_to_nid(associativity); + initialize_form1_numa_distance(associativity); + } else nid = default_nid; fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index e1f224320102..1ef40ef699a6 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -580,6 +580,8 @@ static ssize_t dlpar_cpu_add(u32 drc_index) return saved_rc; } + update_numa_distance(dn); + rc = dlpar_online_cpu(dn); if (rc) { saved_rc = rc; diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 0beb3ca2b549..14fccd7b9c99 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -180,6 +180,8 @@ static int update_lmb_associativity_index(struct drmem_lmb *lmb) return -ENODEV; } + update_numa_distance(lmb_node); + dr_node = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); if (!dr_node) { dlpar_free_cc_nodes(lmb_node); -- cgit v1.2.3-59-g8ed1b From ef31cb83d19c4c589d650747cd5a7e502be9f665 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:22 +0530 Subject: powerpc/pseries: Add a helper for form1 cpu distance This helper is only used with the dispatch trace log collection. A later patch will add Form2 affinity support and this change helps in keeping that simpler. Also add a comment explaining we don't expect the code to be called with FORM0 Signed-off-by: Aneesh Kumar K.V Reviewed-by: David Gibson Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-5-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/topology.h | 4 ++-- arch/powerpc/mm/numa.c | 10 +++++++++- arch/powerpc/platforms/pseries/lpar.c | 4 ++-- 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index a6425a70c37b..a4712ecad3e9 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -36,7 +36,7 @@ static inline int pcibus_to_node(struct pci_bus *bus) cpu_all_mask : \ cpumask_of_node(pcibus_to_node(bus))) -extern int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc); +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc); extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) @@ -84,7 +84,7 @@ static inline void sysfs_remove_device_from_node(struct device *dev, static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {} -static inline int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +static inline int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { return 0; } diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 3cda37c52f26..fe3a9a38eddf 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -166,7 +166,7 @@ static void unmap_cpu_from_node(unsigned long cpu) } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ -int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { int dist = 0; @@ -182,6 +182,14 @@ int cpu_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) return dist; } +int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + /* We should not get called with FORM0 */ + VM_WARN_ON(affinity_form == FORM0_AFFINITY); + + return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); +} + /* must hold reference to node during call */ static const __be32 *of_get_associativity(struct device_node *dev) { diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index bd1fcb0881ea..3df6bdfea475 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -262,7 +262,7 @@ static int cpu_relative_dispatch_distance(int last_disp_cpu, int cur_disp_cpu) if (!last_disp_cpu_assoc || !cur_disp_cpu_assoc) return -EIO; - return cpu_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc); + return cpu_relative_distance(last_disp_cpu_assoc, cur_disp_cpu_assoc); } static int cpu_home_node_dispatch_distance(int disp_cpu) @@ -282,7 +282,7 @@ static int cpu_home_node_dispatch_distance(int disp_cpu) if (!disp_cpu_assoc || !vcpu_assoc) return -EIO; - return cpu_distance(disp_cpu_assoc, vcpu_assoc); + return cpu_relative_distance(disp_cpu_assoc, vcpu_assoc); } static void update_vcpu_disp_stat(int disp_cpu) -- cgit v1.2.3-59-g8ed1b From 1c6b5a7e74052768977855f95d6b8812f6e7772c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 12 Aug 2021 18:52:23 +0530 Subject: powerpc/pseries: Add support for FORM2 associativity PAPR interface currently supports two different ways of communicating resource grouping details to the OS. These are referred to as Form 0 and Form 1 associativity grouping. Form 0 is the older format and is now considered deprecated. This patch adds another resource grouping named FORM2. Signed-off-by: Daniel Henrique Barboza Signed-off-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210812132223.225214-6-aneesh.kumar@linux.ibm.com --- Documentation/powerpc/associativity.rst | 104 +++++++++++++++++ arch/powerpc/include/asm/firmware.h | 3 +- arch/powerpc/include/asm/prom.h | 1 + arch/powerpc/kernel/prom_init.c | 3 +- arch/powerpc/mm/numa.c | 187 ++++++++++++++++++++++++------ arch/powerpc/platforms/pseries/firmware.c | 1 + 6 files changed, 262 insertions(+), 37 deletions(-) create mode 100644 Documentation/powerpc/associativity.rst (limited to 'arch/powerpc') diff --git a/Documentation/powerpc/associativity.rst b/Documentation/powerpc/associativity.rst new file mode 100644 index 000000000000..07e7dd3d6c87 --- /dev/null +++ b/Documentation/powerpc/associativity.rst @@ -0,0 +1,104 @@ +============================ +NUMA resource associativity +============================= + +Associativity represents the groupings of the various platform resources into +domains of substantially similar mean performance relative to resources outside +of that domain. Resources subsets of a given domain that exhibit better +performance relative to each other than relative to other resources subsets +are represented as being members of a sub-grouping domain. This performance +characteristic is presented in terms of NUMA node distance within the Linux kernel. +From the platform view, these groups are also referred to as domains. + +PAPR interface currently supports different ways of communicating these resource +grouping details to the OS. These are referred to as Form 0, Form 1 and Form2 +associativity grouping. Form 0 is the oldest format and is now considered deprecated. + +Hypervisor indicates the type/form of associativity used via "ibm,architecture-vec-5 property". +Bit 0 of byte 5 in the "ibm,architecture-vec-5" property indicates usage of Form 0 or Form 1. +A value of 1 indicates the usage of Form 1 associativity. For Form 2 associativity +bit 2 of byte 5 in the "ibm,architecture-vec-5" property is used. + +Form 0 +----- +Form 0 associativity supports only two NUMA distances (LOCAL and REMOTE). + +Form 1 +----- +With Form 1 a combination of ibm,associativity-reference-points, and ibm,associativity +device tree properties are used to determine the NUMA distance between resource groups/domains. + +The “ibm,associativity” property contains a list of one or more numbers (domainID) +representing the resource’s platform grouping domains. + +The “ibm,associativity-reference-points” property contains a list of one or more numbers +(domainID index) that represents the 1 based ordinal in the associativity lists. +The list of domainID indexes represents an increasing hierarchy of resource grouping. + +ex: +{ primary domainID index, secondary domainID index, tertiary domainID index.. } + +Linux kernel uses the domainID at the primary domainID index as the NUMA node id. +Linux kernel computes NUMA distance between two domains by recursively comparing +if they belong to the same higher-level domains. For mismatch at every higher +level of the resource group, the kernel doubles the NUMA distance between the +comparing domains. + +Form 2 +------- +Form 2 associativity format adds separate device tree properties representing NUMA node distance +thereby making the node distance computation flexible. Form 2 also allows flexible primary +domain numbering. With numa distance computation now detached from the index value in +"ibm,associativity-reference-points" property, Form 2 allows a large number of primary domain +ids at the same domainID index representing resource groups of different performance/latency +characteristics. + +Hypervisor indicates the usage of FORM2 associativity using bit 2 of byte 5 in the +"ibm,architecture-vec-5" property. + +"ibm,numa-lookup-index-table" property contains a list of one or more numbers representing +the domainIDs present in the system. The offset of the domainID in this property is +used as an index while computing numa distance information via "ibm,numa-distance-table". + +prop-encoded-array: The number N of the domainIDs encoded as with encode-int, followed by +N domainID encoded as with encode-int + +For ex: +"ibm,numa-lookup-index-table" = {4, 0, 8, 250, 252}. The offset of domainID 8 (2) is used when +computing the distance of domain 8 from other domains present in the system. For the rest of +this document, this offset will be referred to as domain distance offset. + +"ibm,numa-distance-table" property contains a list of one or more numbers representing the NUMA +distance between resource groups/domains present in the system. + +prop-encoded-array: The number N of the distance values encoded as with encode-int, followed by +N distance values encoded as with encode-bytes. The max distance value we could encode is 255. +The number N must be equal to the square of m where m is the number of domainIDs in the +numa-lookup-index-table. + +For ex: +ibm,numa-lookup-index-table = <3 0 8 40>; +ibm,numa-distace-table = <9>, /bits/ 8 < 10 20 80 + 20 10 160 + 80 160 10>; + | 0 8 40 +--|------------ + | +0 | 10 20 80 + | +8 | 20 10 160 + | +40| 80 160 10 + +A possible "ibm,associativity" property for resources in node 0, 8 and 40 + +{ 3, 6, 7, 0 } +{ 3, 6, 9, 8 } +{ 3, 6, 7, 40} + +With "ibm,associativity-reference-points" { 0x3 } + +"ibm,lookup-index-table" helps in having a compact representation of distance matrix. +Since domainID can be sparse, the matrix of distances can also be effectively sparse. +With "ibm,lookup-index-table" we can achieve a compact representation of +distance information. diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 60b631161360..97a3bd9ffeb9 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -53,6 +53,7 @@ #define FW_FEATURE_ULTRAVISOR ASM_CONST(0x0000004000000000) #define FW_FEATURE_STUFF_TCE ASM_CONST(0x0000008000000000) #define FW_FEATURE_RPT_INVALIDATE ASM_CONST(0x0000010000000000) +#define FW_FEATURE_FORM2_AFFINITY ASM_CONST(0x0000020000000000) #ifndef __ASSEMBLY__ @@ -73,7 +74,7 @@ enum { FW_FEATURE_HPT_RESIZE | FW_FEATURE_DRMEM_V2 | FW_FEATURE_DRC_INFO | FW_FEATURE_BLOCK_REMOVE | FW_FEATURE_PAPR_SCM | FW_FEATURE_ULTRAVISOR | - FW_FEATURE_RPT_INVALIDATE, + FW_FEATURE_RPT_INVALIDATE | FW_FEATURE_FORM2_AFFINITY, FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL | FW_FEATURE_ULTRAVISOR, FW_FEATURE_POWERNV_ALWAYS = 0, diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index df9fec9d232c..5c80152e8f18 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -149,6 +149,7 @@ extern int of_read_drc_info_cell(struct property **prop, #define OV5_XCMO 0x0440 /* Page Coalescing */ #define OV5_FORM1_AFFINITY 0x0580 /* FORM1 NUMA affinity */ #define OV5_PRRN 0x0540 /* Platform Resource Reassignment */ +#define OV5_FORM2_AFFINITY 0x0520 /* Form2 NUMA affinity */ #define OV5_HP_EVT 0x0604 /* Hot Plug Event support */ #define OV5_RESIZE_HPT 0x0601 /* Hash Page Table resizing */ #define OV5_PFO_HW_RNG 0x1180 /* PFO Random Number Generator */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index 57db605ad33a..95a42d49e291 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -1096,7 +1096,8 @@ static const struct ibm_arch_vec ibm_architecture_vec_template __initconst = { #else 0, #endif - .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN), + .associativity = OV5_FEAT(OV5_FORM1_AFFINITY) | OV5_FEAT(OV5_PRRN) | + OV5_FEAT(OV5_FORM2_AFFINITY), .bin_opts = OV5_FEAT(OV5_RESIZE_HPT) | OV5_FEAT(OV5_HP_EVT), .micro_checkpoint = 0, .reserved0 = 0, diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index fe3a9a38eddf..fe9c6ced40b2 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -56,12 +56,17 @@ static int n_mem_addr_cells, n_mem_size_cells; #define FORM0_AFFINITY 0 #define FORM1_AFFINITY 1 +#define FORM2_AFFINITY 2 static int affinity_form; #define MAX_DISTANCE_REF_POINTS 4 static int distance_ref_points_depth; static const __be32 *distance_ref_points; static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; +static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = { + [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 } +}; +static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE }; /* * Allocate node_to_cpumask_map based on number of available nodes @@ -166,6 +171,54 @@ static void unmap_cpu_from_node(unsigned long cpu) } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ +static int __associativity_to_nid(const __be32 *associativity, + int max_array_sz) +{ + int nid; + /* + * primary_domain_index is 1 based array index. + */ + int index = primary_domain_index - 1; + + if (!numa_enabled || index >= max_array_sz) + return NUMA_NO_NODE; + + nid = of_read_number(&associativity[index], 1); + + /* POWER4 LPAR uses 0xffff as invalid node */ + if (nid == 0xffff || nid >= nr_node_ids) + nid = NUMA_NO_NODE; + return nid; +} +/* + * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA + * info is found. + */ +static int associativity_to_nid(const __be32 *associativity) +{ + int array_sz = of_read_number(associativity, 1); + + /* Skip the first element in the associativity array */ + return __associativity_to_nid((associativity + 1), array_sz); +} + +static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) +{ + int dist; + int node1, node2; + + node1 = associativity_to_nid(cpu1_assoc); + node2 = associativity_to_nid(cpu2_assoc); + + dist = numa_distance_table[node1][node2]; + if (dist <= LOCAL_DISTANCE) + return 0; + else if (dist <= REMOTE_DISTANCE) + return 1; + else + return 2; +} + static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { int dist = 0; @@ -186,8 +239,9 @@ int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc) { /* We should not get called with FORM0 */ VM_WARN_ON(affinity_form == FORM0_AFFINITY); - - return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); + if (affinity_form == FORM1_AFFINITY) + return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc); + return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc); } /* must hold reference to node during call */ @@ -201,7 +255,9 @@ int __node_distance(int a, int b) int i; int distance = LOCAL_DISTANCE; - if (affinity_form == FORM0_AFFINITY) + if (affinity_form == FORM2_AFFINITY) + return numa_distance_table[a][b]; + else if (affinity_form == FORM0_AFFINITY) return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE); for (i = 0; i < distance_ref_points_depth; i++) { @@ -216,37 +272,6 @@ int __node_distance(int a, int b) } EXPORT_SYMBOL(__node_distance); -static int __associativity_to_nid(const __be32 *associativity, - int max_array_sz) -{ - int nid; - /* - * primary_domain_index is 1 based array index. - */ - int index = primary_domain_index - 1; - - if (!numa_enabled || index >= max_array_sz) - return NUMA_NO_NODE; - - nid = of_read_number(&associativity[index], 1); - - /* POWER4 LPAR uses 0xffff as invalid node */ - if (nid == 0xffff || nid >= nr_node_ids) - nid = NUMA_NO_NODE; - return nid; -} -/* - * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA - * info is found. - */ -static int associativity_to_nid(const __be32 *associativity) -{ - int array_sz = of_read_number(associativity, 1); - - /* Skip the first element in the associativity array */ - return __associativity_to_nid((associativity + 1), array_sz); -} - /* Returns the nid associated with the given device tree node, * or -1 if not found. */ @@ -320,6 +345,8 @@ static void initialize_form1_numa_distance(const __be32 *associativity) */ void update_numa_distance(struct device_node *node) { + int nid; + if (affinity_form == FORM0_AFFINITY) return; else if (affinity_form == FORM1_AFFINITY) { @@ -332,6 +359,84 @@ void update_numa_distance(struct device_node *node) initialize_form1_numa_distance(associativity); return; } + + /* FORM2 affinity */ + nid = of_node_to_nid_single(node); + if (nid == NUMA_NO_NODE) + return; + + /* + * With FORM2 we expect NUMA distance of all possible NUMA + * nodes to be provided during boot. + */ + WARN(numa_distance_table[nid][nid] == -1, + "NUMA distance details for node %d not provided\n", nid); +} + +/* + * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN} + * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements} + */ +static void initialize_form2_numa_distance_lookup_table(void) +{ + int i, j; + struct device_node *root; + const __u8 *numa_dist_table; + const __be32 *numa_lookup_index; + int numa_dist_table_length; + int max_numa_index, distance_index; + + if (firmware_has_feature(FW_FEATURE_OPAL)) + root = of_find_node_by_path("/ibm,opal"); + else + root = of_find_node_by_path("/rtas"); + if (!root) + root = of_find_node_by_path("/"); + + numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL); + max_numa_index = of_read_number(&numa_lookup_index[0], 1); + + /* first element of the array is the size and is encode-int */ + numa_dist_table = of_get_property(root, "ibm,numa-distance-table", NULL); + numa_dist_table_length = of_read_number((const __be32 *)&numa_dist_table[0], 1); + /* Skip the size which is encoded int */ + numa_dist_table += sizeof(__be32); + + pr_debug("numa_dist_table_len = %d, numa_dist_indexes_len = %d\n", + numa_dist_table_length, max_numa_index); + + for (i = 0; i < max_numa_index; i++) + /* +1 skip the max_numa_index in the property */ + numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1); + + + if (numa_dist_table_length != max_numa_index * max_numa_index) { + WARN(1, "Wrong NUMA distance information\n"); + /* consider everybody else just remote. */ + for (i = 0; i < max_numa_index; i++) { + for (j = 0; j < max_numa_index; j++) { + int nodeA = numa_id_index_table[i]; + int nodeB = numa_id_index_table[j]; + + if (nodeA == nodeB) + numa_distance_table[nodeA][nodeB] = LOCAL_DISTANCE; + else + numa_distance_table[nodeA][nodeB] = REMOTE_DISTANCE; + } + } + } + + distance_index = 0; + for (i = 0; i < max_numa_index; i++) { + for (j = 0; j < max_numa_index; j++) { + int nodeA = numa_id_index_table[i]; + int nodeB = numa_id_index_table[j]; + + numa_distance_table[nodeA][nodeB] = numa_dist_table[distance_index++]; + pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, numa_distance_table[nodeA][nodeB]); + } + } + of_node_put(root); } static int __init find_primary_domain_index(void) @@ -344,6 +449,9 @@ static int __init find_primary_domain_index(void) */ if (firmware_has_feature(FW_FEATURE_OPAL)) { affinity_form = FORM1_AFFINITY; + } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) { + dbg("Using form 2 affinity\n"); + affinity_form = FORM2_AFFINITY; } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { dbg("Using form 1 affinity\n"); affinity_form = FORM1_AFFINITY; @@ -388,9 +496,12 @@ static int __init find_primary_domain_index(void) index = of_read_number(&distance_ref_points[1], 1); } else { + /* + * Both FORM1 and FORM2 affinity find the primary domain details + * at the same offset. + */ index = of_read_number(distance_ref_points, 1); } - /* * Warn and cap if the hardware supports more than * MAX_DISTANCE_REF_POINTS domains. @@ -819,6 +930,12 @@ static int __init parse_numa_properties(void) dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); + /* + * If it is FORM2 initialize the distance table here. + */ + if (affinity_form == FORM2_AFFINITY) + initialize_form2_numa_distance_lookup_table(); + /* * Even though we connect cpus to numa domains later in SMP * init, we need to know the node ids now. This is because diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index 5d4c2bc20bba..f162156b7b68 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -123,6 +123,7 @@ vec5_fw_features_table[] = { {FW_FEATURE_PRRN, OV5_PRRN}, {FW_FEATURE_DRMEM_V2, OV5_DRMEM_V2}, {FW_FEATURE_DRC_INFO, OV5_DRC_INFO}, + {FW_FEATURE_FORM2_AFFINITY, OV5_FORM2_AFFINITY}, }; static void __init fw_vec5_feature_init(const char *vec5, unsigned long len) -- cgit v1.2.3-59-g8ed1b From db87a7199229b75c9996bf78117eceb81854fce2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 13 Apr 2021 16:38:09 +0000 Subject: powerpc/bug: Remove specific powerpc BUG_ON() and WARN_ON() on PPC32 powerpc BUG_ON() and WARN_ON() are based on using twnei instruction. For catching simple conditions like a variable having value 0, this is efficient because it does the test and the trap at the same time. But most conditions used with BUG_ON or WARN_ON are more complex and forces GCC to format the condition into a 0 or 1 value in a register. This will usually require 2 to 3 instructions. The most efficient solution would be to use __builtin_trap() because GCC is able to optimise the use of the different trap instructions based on the requested condition, but this is complex if not impossible for the following reasons: - __builtin_trap() is a non-recoverable instruction, so it can't be used for WARN_ON - Knowing which line of code generated the trap would require the analysis of DWARF information. This is not a feature we have today. As mentioned in commit 8d4fbcfbe0a4 ("Fix WARN_ON() on bitfield ops") the way WARN_ON() is implemented is suboptimal. That commit also mentions an issue with 'long long' condition. It fixed it for WARN_ON() but the same problem still exists today with BUG_ON() on PPC32. It will be fixed by using the generic implementation. By using the generic implementation, gcc will naturally generate a branch to the unconditional trap generated by BUG(). As modern powerpc implement zero-cycle branch, that's even more efficient. And for the functions using WARN_ON() and its return, the test on return from WARN_ON() is now also used for the WARN_ON() itself. On PPC64 we don't want it because we want to be able to use CFAR register to track how we entered the code that trapped. The CFAR register would be clobbered by the branch. A simple test function: unsigned long test9w(unsigned long a, unsigned long b) { if (WARN_ON(!b)) return 0; return a / b; } Before the patch: 0000046c : 46c: 7c 89 00 34 cntlzw r9,r4 470: 55 29 d9 7e rlwinm r9,r9,27,5,31 474: 0f 09 00 00 twnei r9,0 478: 2c 04 00 00 cmpwi r4,0 47c: 41 82 00 0c beq 488 480: 7c 63 23 96 divwu r3,r3,r4 484: 4e 80 00 20 blr 488: 38 60 00 00 li r3,0 48c: 4e 80 00 20 blr After the patch: 00000468 : 468: 2c 04 00 00 cmpwi r4,0 46c: 41 82 00 0c beq 478 470: 7c 63 23 96 divwu r3,r3,r4 474: 4e 80 00 20 blr 478: 0f e0 00 00 twui r0,0 47c: 38 60 00 00 li r3,0 480: 4e 80 00 20 blr So we see before the patch we need 3 instructions on the likely path to handle the WARN_ON(). With the patch the trap goes on the unlikely path. See below the difference at the entry of system_call_exception where we have several BUG_ON(), allthough less impressing. With the patch: 00000000 : 0: 81 6a 00 84 lwz r11,132(r10) 4: 90 6a 00 88 stw r3,136(r10) 8: 71 60 00 02 andi. r0,r11,2 c: 41 82 00 70 beq 7c 10: 71 60 40 00 andi. r0,r11,16384 14: 41 82 00 6c beq 80 18: 71 6b 80 00 andi. r11,r11,32768 1c: 41 82 00 68 beq 84 20: 94 21 ff e0 stwu r1,-32(r1) 24: 93 e1 00 1c stw r31,28(r1) 28: 7d 8c 42 e6 mftb r12 ... 7c: 0f e0 00 00 twui r0,0 80: 0f e0 00 00 twui r0,0 84: 0f e0 00 00 twui r0,0 Without the patch: 00000000 : 0: 94 21 ff e0 stwu r1,-32(r1) 4: 93 e1 00 1c stw r31,28(r1) 8: 90 6a 00 88 stw r3,136(r10) c: 81 6a 00 84 lwz r11,132(r10) 10: 69 60 00 02 xori r0,r11,2 14: 54 00 ff fe rlwinm r0,r0,31,31,31 18: 0f 00 00 00 twnei r0,0 1c: 69 60 40 00 xori r0,r11,16384 20: 54 00 97 fe rlwinm r0,r0,18,31,31 24: 0f 00 00 00 twnei r0,0 28: 69 6b 80 00 xori r11,r11,32768 2c: 55 6b 8f fe rlwinm r11,r11,17,31,31 30: 0f 0b 00 00 twnei r11,0 34: 7d 8c 42 e6 mftb r12 Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b286e07fb771a664b631cd07a40b09c06f26e64b.1618331881.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/bug.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 0b2162890d8b..d844de5adfcb 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -68,7 +68,11 @@ BUG_ENTRY("twi 31, 0, 0", 0); \ unreachable(); \ } while (0) +#define HAVE_ARCH_BUG + +#define __WARN_FLAGS(flags) BUG_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags)) +#ifdef CONFIG_PPC64 #define BUG_ON(x) do { \ if (__builtin_constant_p(x)) { \ if (x) \ @@ -78,8 +82,6 @@ } \ } while (0) -#define __WARN_FLAGS(flags) BUG_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags)) - #define WARN_ON(x) ({ \ int __ret_warn_on = !!(x); \ if (__builtin_constant_p(__ret_warn_on)) { \ @@ -93,9 +95,10 @@ unlikely(__ret_warn_on); \ }) -#define HAVE_ARCH_BUG #define HAVE_ARCH_BUG_ON #define HAVE_ARCH_WARN_ON +#endif + #endif /* __ASSEMBLY __ */ #else #ifdef __ASSEMBLY__ -- cgit v1.2.3-59-g8ed1b From 1e688dd2a3d6759d416616ff07afc4bb836c4213 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 13 Apr 2021 16:38:10 +0000 Subject: powerpc/bug: Provide better flexibility to WARN_ON/__WARN_FLAGS() with asm goto Using asm goto in __WARN_FLAGS() and WARN_ON() allows more flexibility to GCC. For that add an entry to the exception table so that program_check_exception() knowns where to resume execution after a WARNING. Here are two exemples. The first one is done on PPC32 (which benefits from the previous patch), the second is on PPC64. unsigned long test(struct pt_regs *regs) { int ret; WARN_ON(regs->msr & MSR_PR); return regs->gpr[3]; } unsigned long test9w(unsigned long a, unsigned long b) { if (WARN_ON(!b)) return 0; return a / b; } Before the patch: 000003a8 : 3a8: 81 23 00 84 lwz r9,132(r3) 3ac: 71 29 40 00 andi. r9,r9,16384 3b0: 40 82 00 0c bne 3bc 3b4: 80 63 00 0c lwz r3,12(r3) 3b8: 4e 80 00 20 blr 3bc: 0f e0 00 00 twui r0,0 3c0: 80 63 00 0c lwz r3,12(r3) 3c4: 4e 80 00 20 blr 0000000000000bf0 <.test9w>: bf0: 7c 89 00 74 cntlzd r9,r4 bf4: 79 29 d1 82 rldicl r9,r9,58,6 bf8: 0b 09 00 00 tdnei r9,0 bfc: 2c 24 00 00 cmpdi r4,0 c00: 41 82 00 0c beq c0c <.test9w+0x1c> c04: 7c 63 23 92 divdu r3,r3,r4 c08: 4e 80 00 20 blr c0c: 38 60 00 00 li r3,0 c10: 4e 80 00 20 blr After the patch: 000003a8 : 3a8: 81 23 00 84 lwz r9,132(r3) 3ac: 71 29 40 00 andi. r9,r9,16384 3b0: 40 82 00 0c bne 3bc 3b4: 80 63 00 0c lwz r3,12(r3) 3b8: 4e 80 00 20 blr 3bc: 0f e0 00 00 twui r0,0 0000000000000c50 <.test9w>: c50: 7c 89 00 74 cntlzd r9,r4 c54: 79 29 d1 82 rldicl r9,r9,58,6 c58: 0b 09 00 00 tdnei r9,0 c5c: 7c 63 23 92 divdu r3,r3,r4 c60: 4e 80 00 20 blr c70: 38 60 00 00 li r3,0 c74: 4e 80 00 20 blr In the first exemple, we see GCC doesn't need to duplicate what happens after the trap. In the second exemple, we see that GCC doesn't need to emit a test and a branch in the likely path in addition to the trap. We've got some WARN_ON() in .softirqentry.text section so it needs to be added in the OTHER_TEXT_SECTIONS in modpost.c Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/389962b1b702e3c78d169e59bcfac56282889173.1618331882.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/64/kup.h | 2 +- arch/powerpc/include/asm/bug.h | 54 ++++++++++++++++++---- arch/powerpc/include/asm/extable.h | 14 ++++++ arch/powerpc/include/asm/ppc_asm.h | 11 +---- arch/powerpc/kernel/entry_64.S | 2 +- arch/powerpc/kernel/misc_32.S | 2 +- arch/powerpc/kernel/traps.c | 9 +++- scripts/mod/modpost.c | 2 +- .../selftests/powerpc/primitives/asm/extable.h | 1 + 9 files changed, 72 insertions(+), 25 deletions(-) create mode 120000 tools/testing/selftests/powerpc/primitives/asm/extable.h (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/book3s/64/kup.h b/arch/powerpc/include/asm/book3s/64/kup.h index a1cc73a88710..170339969b7c 100644 --- a/arch/powerpc/include/asm/book3s/64/kup.h +++ b/arch/powerpc/include/asm/book3s/64/kup.h @@ -90,7 +90,7 @@ /* Prevent access to userspace using any key values */ LOAD_REG_IMMEDIATE(\gpr2, AMR_KUAP_BLOCKED) 999: tdne \gpr1, \gpr2 - EMIT_BUG_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) + EMIT_WARN_ENTRY 999b, __FILE__, __LINE__, (BUGFLAG_WARNING | BUGFLAG_ONCE) END_MMU_FTR_SECTION_NESTED_IFSET(MMU_FTR_BOOK3S_KUAP, 67) #endif .endm diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index d844de5adfcb..1ee0f22313ee 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -4,6 +4,7 @@ #ifdef __KERNEL__ #include +#include #ifdef CONFIG_BUG @@ -30,6 +31,11 @@ .endm #endif /* verbose */ +.macro EMIT_WARN_ENTRY addr,file,line,flags + EX_TABLE(\addr,\addr+4) + EMIT_BUG_ENTRY \addr,\file,\line,\flags +.endm + #else /* !__ASSEMBLY__ */ /* _EMIT_BUG_ENTRY expects args %0,%1,%2,%3 to be FILE, LINE, flags and sizeof(struct bug_entry), respectively */ @@ -58,6 +64,16 @@ "i" (sizeof(struct bug_entry)), \ ##__VA_ARGS__) +#define WARN_ENTRY(insn, flags, label, ...) \ + asm_volatile_goto( \ + "1: " insn "\n" \ + EX_TABLE(1b, %l[label]) \ + _EMIT_BUG_ENTRY \ + : : "i" (__FILE__), "i" (__LINE__), \ + "i" (flags), \ + "i" (sizeof(struct bug_entry)), \ + ##__VA_ARGS__ : : label) + /* * BUG_ON() and WARN_ON() do their best to cooperate with compile-time * optimisations. However depending on the complexity of the condition @@ -70,7 +86,15 @@ } while (0) #define HAVE_ARCH_BUG -#define __WARN_FLAGS(flags) BUG_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags)) +#define __WARN_FLAGS(flags) do { \ + __label__ __label_warn_on; \ + \ + WARN_ENTRY("twi 31, 0, 0", BUGFLAG_WARNING | (flags), __label_warn_on); \ + unreachable(); \ + \ +__label_warn_on: \ + break; \ +} while (0) #ifdef CONFIG_PPC64 #define BUG_ON(x) do { \ @@ -83,15 +107,24 @@ } while (0) #define WARN_ON(x) ({ \ - int __ret_warn_on = !!(x); \ - if (__builtin_constant_p(__ret_warn_on)) { \ - if (__ret_warn_on) \ + bool __ret_warn_on = false; \ + do { \ + if (__builtin_constant_p((x))) { \ + if (!(x)) \ + break; \ __WARN(); \ - } else { \ - BUG_ENTRY(PPC_TLNEI " %4, 0", \ - BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), \ - "r" (__ret_warn_on)); \ - } \ + __ret_warn_on = true; \ + } else { \ + __label__ __label_warn_on; \ + \ + WARN_ENTRY(PPC_TLNEI " %4, 0", \ + BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), \ + __label_warn_on, "r" (x)); \ + break; \ +__label_warn_on: \ + __ret_warn_on = true; \ + } \ + } while (0); \ unlikely(__ret_warn_on); \ }) @@ -104,8 +137,11 @@ #ifdef __ASSEMBLY__ .macro EMIT_BUG_ENTRY addr,file,line,flags .endm +.macro EMIT_WARN_ENTRY addr,file,line,flags +.endm #else /* !__ASSEMBLY__ */ #define _EMIT_BUG_ENTRY +#define _EMIT_WARN_ENTRY #endif #endif /* CONFIG_BUG */ diff --git a/arch/powerpc/include/asm/extable.h b/arch/powerpc/include/asm/extable.h index eb91b2d2935a..26ce2e5c0fa8 100644 --- a/arch/powerpc/include/asm/extable.h +++ b/arch/powerpc/include/asm/extable.h @@ -17,6 +17,8 @@ #define ARCH_HAS_RELATIVE_EXTABLE +#ifndef __ASSEMBLY__ + struct exception_table_entry { int insn; int fixup; @@ -28,3 +30,15 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x) } #endif + +/* + * Helper macro for exception table entries + */ +#define EX_TABLE(_fault, _target) \ + stringify_in_c(.section __ex_table,"a";)\ + stringify_in_c(.balign 4;) \ + stringify_in_c(.long (_fault) - . ;) \ + stringify_in_c(.long (_target) - . ;) \ + stringify_in_c(.previous) + +#endif diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index 116c1519728a..ffe712307e11 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -10,6 +10,7 @@ #include #include #include +#include #ifdef __ASSEMBLY__ @@ -752,16 +753,6 @@ END_FTR_SECTION_NESTED(CPU_FTR_CELL_TB_BUG, CPU_FTR_CELL_TB_BUG, 96) #endif /* __ASSEMBLY__ */ -/* - * Helper macro for exception table entries - */ -#define EX_TABLE(_fault, _target) \ - stringify_in_c(.section __ex_table,"a";)\ - stringify_in_c(.balign 4;) \ - stringify_in_c(.long (_fault) - . ;) \ - stringify_in_c(.long (_target) - . ;) \ - stringify_in_c(.previous) - #define SOFT_MASK_TABLE(_start, _end) \ stringify_in_c(.section __soft_mask_table,"a";)\ stringify_in_c(.balign 8;) \ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 15720f8661a1..70cff7b49e17 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -309,7 +309,7 @@ _GLOBAL(enter_rtas) */ lbz r0,PACAIRQSOFTMASK(r13) 1: tdeqi r0,IRQS_ENABLED - EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING + EMIT_WARN_ENTRY 1b,__FILE__,__LINE__,BUGFLAG_WARNING #endif /* Hard-disable interrupts */ diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 39ab15419592..d8645efff902 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -237,7 +237,7 @@ _GLOBAL(copy_page) addi r3,r3,-4 0: twnei r5, 0 /* WARN if r3 is not cache aligned */ - EMIT_BUG_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING + EMIT_WARN_ENTRY 0b,__FILE__,__LINE__, BUGFLAG_WARNING addi r4,r4,-4 diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index c8f648727d36..51d4f5faf425 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1477,8 +1477,13 @@ static void do_program_check(struct pt_regs *regs) if (!(regs->msr & MSR_PR) && /* not user-mode */ report_bug(bugaddr, regs) == BUG_TRAP_TYPE_WARN) { - regs_add_return_ip(regs, 4); - return; + const struct exception_table_entry *entry; + + entry = search_exception_tables(bugaddr); + if (entry) { + regs_set_return_ip(regs, extable_fixup(entry) + regs->nip - bugaddr); + return; + } } _exception(SIGTRAP, regs, TRAP_BRKPT, regs->nip); return; diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 270a7df898e2..1209e1786af7 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -931,7 +931,7 @@ static void check_section(const char *modname, struct elf_info *elf, ".kprobes.text", ".cpuidle.text", ".noinstr.text" #define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \ ".fixup", ".entry.text", ".exception.text", ".text.*", \ - ".coldtext" + ".coldtext", ".softirqentry.text" #define INIT_SECTIONS ".init.*" #define MEM_INIT_SECTIONS ".meminit.*" diff --git a/tools/testing/selftests/powerpc/primitives/asm/extable.h b/tools/testing/selftests/powerpc/primitives/asm/extable.h new file mode 120000 index 000000000000..6385f059a951 --- /dev/null +++ b/tools/testing/selftests/powerpc/primitives/asm/extable.h @@ -0,0 +1 @@ +../../../../../../arch/powerpc/include/asm/extable.h \ No newline at end of file -- cgit v1.2.3-59-g8ed1b From 0355785313e2191be4e1108cdbda94ddb0238c48 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 13 Aug 2021 13:05:11 -0700 Subject: powerpc: Add "-z notext" flag to disable diagnostic Object files used to link .tmp_vmlinux.kallsyms1 have many R_PPC64_ADDR64 relocations in non-SHF_WRITE sections. There are many text relocations (e.g. in .rela___ksymtab_gpl+* and .rela__mcount_loc sections) in a -pie link and are disallowed by LLD: ld.lld: error: can't create dynamic relocation R_PPC64_ADDR64 against local symbol in readonly segment; recompile object files with -fPIC or pass '-Wl,-z,notext' to allow text relocations in the output >>> defined in arch/powerpc/kernel/head_64.o >>> referenced by arch/powerpc/kernel/head_64.o:(__restart_table+0x10) Newer GNU ld configured with "--enable-textrel-check=error" will report an error as well: $ ld-new -EL -m elf64lppc -pie ... -o .tmp_vmlinux.kallsyms1 ... ld-new: read-only segment has dynamic relocations Add "-z notext" to suppress the errors. Non-CONFIG_RELOCATABLE builds use the default -no-pie mode and thus R_PPC64_ADDR64 relocations can be resolved at link-time. Reported-by: Itaru Kitayama Co-developed-by: Bill Wendling Signed-off-by: Fangrui Song Signed-off-by: Bill Wendling Reviewed-by: Nick Desaulniers Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210813200511.1905703-1-morbo@google.com --- arch/powerpc/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 9aaf1abbc641..aa6808e70647 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -122,6 +122,7 @@ endif LDFLAGS_vmlinux-y := -Bstatic LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie +LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) += -z notext LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y) ifdef CONFIG_PPC64 -- cgit v1.2.3-59-g8ed1b From 8b893ef190b0c440877de04f767efca4bf4d6af8 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 16 Aug 2021 12:30:11 +1000 Subject: powerpc/pseries: Fix build error when NUMA=n As reported by lkp, if NUMA=n we see a build error: arch/powerpc/platforms/pseries/hotplug-cpu.c: In function 'pseries_cpu_hotplug_init': arch/powerpc/platforms/pseries/hotplug-cpu.c:1022:8: error: 'node_to_cpumask_map' undeclared 1022 | node_to_cpumask_map[node]); Use cpumask_of_node() which has an empty stub for NUMA=n, and when NUMA=y does a lookup from node_to_cpumask_map[]. Fixes: bd1dd4c5f528 ("powerpc/pseries: Prevent free CPU ids being reused on another node") Reported-by: kernel test robot Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210816041032.2839343-1-mpe@ellerman.id.au --- arch/powerpc/platforms/pseries/hotplug-cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index 1ef40ef699a6..d646c22e94ab 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -1021,7 +1021,7 @@ static int __init pseries_cpu_hotplug_init(void) /* Record ids of CPU added at boot time */ cpumask_or(node_recorded_ids_map[node], node_recorded_ids_map[node], - node_to_cpumask_map[node]); + cpumask_of_node(node)); } of_reconfig_notifier_register(&pseries_smp_nb); -- cgit v1.2.3-59-g8ed1b From 47c258d71ebfc832a760a1dc6540cf3c33968023 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 15 Aug 2021 15:23:34 -0700 Subject: powerpc/head_check: use stdout for error messages Prefer stderr instead of stdout for error messages. This is a good practice and can help CI error detecting and reporting (0day in this case). Signed-off-by: Randy Dunlap Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210815222334.9575-1-rdunlap@infradead.org --- arch/powerpc/tools/head_check.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/tools/head_check.sh b/arch/powerpc/tools/head_check.sh index e32d3162e5ed..e477837fdc58 100644 --- a/arch/powerpc/tools/head_check.sh +++ b/arch/powerpc/tools/head_check.sh @@ -56,9 +56,9 @@ expected_start_head_addr=$vma start_head_addr=$(cat .tmp_symbols.txt | grep " t start_first_256B$" | cut -d' ' -f1) if [ "$start_head_addr" != "$expected_start_head_addr" ]; then - echo "ERROR: head code starts at $start_head_addr, should be $expected_start_head_addr" - echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" - echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" + echo "ERROR: head code starts at $start_head_addr, should be $expected_start_head_addr" 1>&2 + echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" 1>&2 + echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" 1>&2 exit 1 fi @@ -70,9 +70,9 @@ expected_start_text_addr=$(cat .tmp_symbols.txt | grep " a text_start$" | cut -d start_text_addr=$(cat .tmp_symbols.txt | grep " t start_text$" | cut -d' ' -f1) if [ "$start_text_addr" != "$expected_start_text_addr" ]; then - echo "ERROR: start_text address is $start_text_addr, should be $expected_start_text_addr" - echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" - echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" + echo "ERROR: start_text address is $start_text_addr, should be $expected_start_text_addr" 1>&2 + echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" 1>&2 + echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" 1>&2 exit 1 fi -- cgit v1.2.3-59-g8ed1b From e95ad5f21693a37c0d318b5988c5a0de324eb3e3 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 16 Aug 2021 16:36:02 +1000 Subject: powerpc/head_check: Fix shellcheck errors Replace "cat file | grep pattern" with "grep pattern file", and quote a few variables. Together that fixes all shellcheck errors. Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817125154.3369884-1-mpe@ellerman.id.au --- arch/powerpc/tools/head_check.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/tools/head_check.sh b/arch/powerpc/tools/head_check.sh index e477837fdc58..689907cda996 100644 --- a/arch/powerpc/tools/head_check.sh +++ b/arch/powerpc/tools/head_check.sh @@ -49,11 +49,11 @@ vmlinux="$2" $nm "$vmlinux" | grep -e " [TA] _stext$" -e " t start_first_256B$" -e " a text_start$" -e " t start_text$" > .tmp_symbols.txt -vma=$(cat .tmp_symbols.txt | grep -e " [TA] _stext$" | cut -d' ' -f1) +vma=$(grep -e " [TA] _stext$" .tmp_symbols.txt | cut -d' ' -f1) -expected_start_head_addr=$vma +expected_start_head_addr="$vma" -start_head_addr=$(cat .tmp_symbols.txt | grep " t start_first_256B$" | cut -d' ' -f1) +start_head_addr=$(grep " t start_first_256B$" .tmp_symbols.txt | cut -d' ' -f1) if [ "$start_head_addr" != "$expected_start_head_addr" ]; then echo "ERROR: head code starts at $start_head_addr, should be $expected_start_head_addr" 1>&2 @@ -63,11 +63,11 @@ if [ "$start_head_addr" != "$expected_start_head_addr" ]; then exit 1 fi -top_vma=$(echo $vma | cut -d'0' -f1) +top_vma=$(echo "$vma" | cut -d'0' -f1) -expected_start_text_addr=$(cat .tmp_symbols.txt | grep " a text_start$" | cut -d' ' -f1 | sed "s/^0/$top_vma/") +expected_start_text_addr=$(grep " a text_start$" .tmp_symbols.txt | cut -d' ' -f1 | sed "s/^0/$top_vma/") -start_text_addr=$(cat .tmp_symbols.txt | grep " t start_text$" | cut -d' ' -f1) +start_text_addr=$(grep " t start_text$" .tmp_symbols.txt | cut -d' ' -f1) if [ "$start_text_addr" != "$expected_start_text_addr" ]; then echo "ERROR: start_text address is $start_text_addr, should be $expected_start_text_addr" 1>&2 -- cgit v1.2.3-59-g8ed1b From c5ac55b6cbc604ad4144c380feae20f69453df91 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 17 Aug 2021 14:24:05 +0930 Subject: powerpc/config: Fix IPV6 warning in mpc855_ads When building this config there's a warning: 79:warning: override: reassigning to symbol IPV6 Commit 9a1762a4a4ff ("powerpc/8xx: Update mpc885_ads_defconfig to improve CI") added CONFIG_IPV6=y, but left '# CONFIG_IPV6 is not set' in. IPV6 is default y, so remove both to clean up the build. Fixes: 9a1762a4a4ff ("powerpc/8xx: Update mpc885_ads_defconfig to improve CI") Signed-off-by: Joel Stanley Acked-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817045407.2445664-2-joel@jms.id.au --- arch/powerpc/configs/mpc885_ads_defconfig | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/configs/mpc885_ads_defconfig b/arch/powerpc/configs/mpc885_ads_defconfig index d21f266cea9a..5cd17adf903f 100644 --- a/arch/powerpc/configs/mpc885_ads_defconfig +++ b/arch/powerpc/configs/mpc885_ads_defconfig @@ -21,7 +21,6 @@ CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_PNP=y CONFIG_SYN_COOKIES=y -# CONFIG_IPV6 is not set # CONFIG_FW_LOADER is not set CONFIG_MTD=y CONFIG_MTD_BLOCK=y @@ -76,7 +75,6 @@ CONFIG_PERF_EVENTS=y CONFIG_MATH_EMULATION=y CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y CONFIG_STRICT_KERNEL_RWX=y -CONFIG_IPV6=y CONFIG_BPF_JIT=y CONFIG_DEBUG_VM_PGTABLE=y CONFIG_BDI_SWITCH=y -- cgit v1.2.3-59-g8ed1b From d0e28a6145c3455b69991245e7f6147eb914b34a Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 17 Aug 2021 14:24:06 +0930 Subject: powerpc/config: Renable MTD_PHYSMAP_OF CONFIG_MTD_PHYSMAP_OF is not longer enabled as it depends on MTD_PHYSMAP which is not enabled. This is a regression from commit 642b1e8dbed7 ("mtd: maps: Merge physmap_of.c into physmap-core.c"), which added the extra dependency. Add CONFIG_MTD_PHYSMAP=y so this stays in the config, as Christophe said it is useful for build coverage. Fixes: 642b1e8dbed7 ("mtd: maps: Merge physmap_of.c into physmap-core.c") Signed-off-by: Joel Stanley Acked-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817045407.2445664-3-joel@jms.id.au --- arch/powerpc/configs/mpc885_ads_defconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/configs/mpc885_ads_defconfig b/arch/powerpc/configs/mpc885_ads_defconfig index 5cd17adf903f..cd08f9ed2c8d 100644 --- a/arch/powerpc/configs/mpc885_ads_defconfig +++ b/arch/powerpc/configs/mpc885_ads_defconfig @@ -33,6 +33,7 @@ CONFIG_MTD_CFI_GEOMETRY=y # CONFIG_MTD_CFI_I2 is not set CONFIG_MTD_CFI_I4=y CONFIG_MTD_CFI_AMDSTD=y +CONFIG_MTD_PHYSMAP=y CONFIG_MTD_PHYSMAP_OF=y # CONFIG_BLK_DEV is not set CONFIG_NETDEVICES=y -- cgit v1.2.3-59-g8ed1b From 87e0d46bf68913f4c87dba94aadc00da658a874b Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Tue, 17 Aug 2021 14:24:07 +0930 Subject: powerpc/configs: Regenerate mpc885_ads_defconfig Regenerate atop v5.14-rc6 by doing a make savedefconfig. The changes a re-ordering except for the following (which are still set indirectly): - CONFIG_DEBUG_KERNEL=y selected by EXPERT - CONFIG_PPC_EARLY_DEBUG_CPM_ADDR=0xff002008 which is the default setting Signed-off-by: Joel Stanley Acked-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817045407.2445664-4-joel@jms.id.au --- arch/powerpc/configs/mpc885_ads_defconfig | 46 +++++++++++++++---------------- 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/configs/mpc885_ads_defconfig b/arch/powerpc/configs/mpc885_ads_defconfig index cd08f9ed2c8d..c74dc76b1d0d 100644 --- a/arch/powerpc/configs/mpc885_ads_defconfig +++ b/arch/powerpc/configs/mpc885_ads_defconfig @@ -1,19 +1,30 @@ -CONFIG_PPC_8xx=y # CONFIG_SWAP is not set CONFIG_SYSVIPC=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y +CONFIG_BPF_JIT=y +CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y CONFIG_LOG_BUF_SHIFT=14 CONFIG_EXPERT=y # CONFIG_ELF_CORE is not set # CONFIG_BASE_FULL is not set # CONFIG_FUTEX is not set +CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set -# CONFIG_BLK_DEV_BSG is not set -CONFIG_PARTITION_ADVANCED=y +CONFIG_PPC_8xx=y +CONFIG_8xx_GPIO=y +CONFIG_SMC_UCODE_PATCH=y +CONFIG_PIN_TLB=y CONFIG_GEN_RTC=y CONFIG_HZ_100=y +CONFIG_MATH_EMULATION=y +CONFIG_PPC_16K_PAGES=y +CONFIG_ADVANCED_OPTIONS=y # CONFIG_SECCOMP is not set +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_MODULES=y +# CONFIG_BLK_DEV_BSG is not set +CONFIG_PARTITION_ADVANCED=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y @@ -46,38 +57,25 @@ CONFIG_DAVICOM_PHY=y # CONFIG_LEGACY_PTYS is not set CONFIG_SERIAL_CPM=y CONFIG_SERIAL_CPM_CONSOLE=y +CONFIG_SPI=y +CONFIG_SPI_FSL_SPI=y # CONFIG_HWMON is not set +CONFIG_WATCHDOG=y +CONFIG_8xxx_WDT=y # CONFIG_USB_SUPPORT is not set # CONFIG_DNOTIFY is not set CONFIG_TMPFS=y CONFIG_CRAMFS=y CONFIG_NFS_FS=y CONFIG_ROOT_NFS=y +CONFIG_CRYPTO=y +CONFIG_CRYPTO_DEV_TALITOS=y CONFIG_CRC32_SLICEBY4=y CONFIG_DEBUG_INFO=y CONFIG_MAGIC_SYSRQ=y -CONFIG_DETECT_HUNG_TASK=y -CONFIG_PPC_16K_PAGES=y -CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_FS=y -CONFIG_PPC_PTDUMP=y -CONFIG_MODULES=y -CONFIG_SPI=y -CONFIG_SPI_FSL_SPI=y -CONFIG_CRYPTO=y -CONFIG_CRYPTO_DEV_TALITOS=y -CONFIG_8xx_GPIO=y -CONFIG_WATCHDOG=y -CONFIG_8xxx_WDT=y -CONFIG_SMC_UCODE_PATCH=y -CONFIG_ADVANCED_OPTIONS=y -CONFIG_PIN_TLB=y -CONFIG_PERF_EVENTS=y -CONFIG_MATH_EMULATION=y -CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y -CONFIG_STRICT_KERNEL_RWX=y -CONFIG_BPF_JIT=y CONFIG_DEBUG_VM_PGTABLE=y +CONFIG_DETECT_HUNG_TASK=y CONFIG_BDI_SWITCH=y CONFIG_PPC_EARLY_DEBUG=y -CONFIG_PPC_EARLY_DEBUG_CPM_ADDR=0xff002008 +CONFIG_PPC_PTDUMP=y -- cgit v1.2.3-59-g8ed1b From e225c4d6bc389701f2f63fc246420a1da3465ab5 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Tue, 23 Mar 2021 14:29:05 +0800 Subject: powerpc: Remove duplicate includes interrupt.c: asm/interrupt.h has been included at line 12, so remove the duplicate one at line 10. time.c: linux/sched/clock.h has been included at line 33,so remove the duplicate one at line 56 and move sched/cputime.h under sched including segament. Signed-off-by: Wan Jiabing Reviewed-by: Daniel Axtens Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210323062916.295346-1-wanjiabing@vivo.com --- arch/powerpc/kernel/interrupt.c | 1 - arch/powerpc/kernel/time.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index 21bbd615ca41..abc2b3dc3f20 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index e45ce427bffb..77bae85e2fae 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -52,8 +53,6 @@ #include #include #include -#include -#include #include #include -- cgit v1.2.3-59-g8ed1b From 51ed00e71f0130e0f3534b8e7d78facd16829426 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 18 Aug 2021 08:47:28 +0000 Subject: powerpc/32: Remove unneccessary calculations in load_up_{fpu/altivec} No need to re-read SPRN_THREAD, we can calculate thread address from current (r2). And remove a reload of value 1 into r4 as r4 is already 1. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c04cce578b97a76a9e69a096698b1d89f721768a.1629276437.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/fpu.S | 3 +-- arch/powerpc/kernel/vector.S | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S index 6010adcee16e..ba4afe3b5a9c 100644 --- a/arch/powerpc/kernel/fpu.S +++ b/arch/powerpc/kernel/fpu.S @@ -91,8 +91,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) isync /* enable use of FP after return */ #ifdef CONFIG_PPC32 - mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ - tovirt(r5, r5) + addi r5,r2,THREAD lwz r4,THREAD_FPEXC_MODE(r5) ori r9,r9,MSR_FP /* enable FP for current */ or r9,r9,r4 diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S index fc120fac1910..ba03eedfdcd8 100644 --- a/arch/powerpc/kernel/vector.S +++ b/arch/powerpc/kernel/vector.S @@ -65,9 +65,8 @@ _GLOBAL(load_up_altivec) 1: /* enable use of VMX after return */ #ifdef CONFIG_PPC32 - mfspr r5,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ + addi r5,r2,THREAD oris r9,r9,MSR_VEC@h - tovirt(r5, r5) #else ld r4,PACACURRENT(r13) addi r5,r4,THREAD /* Get THREAD */ @@ -81,7 +80,6 @@ _GLOBAL(load_up_altivec) li r4,1 stb r4,THREAD_LOAD_VEC(r5) addi r6,r5,THREAD_VRSTATE - li r4,1 li r10,VRSTATE_VSCR stw r4,THREAD_USED_VR(r5) lvx v0,r10,r6 -- cgit v1.2.3-59-g8ed1b From c26d4c5d4f0df7207da3975458261927f9305465 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Thu, 19 Aug 2021 13:39:53 +0200 Subject: powerpc/kvm: Remove obsolete and unneeded select Commit a278e7ea608b ("powerpc: Fix compile issue with force DAWR") selects the non-existing config PPC_DAWR_FORCE_ENABLE for config KVM_BOOK3S_64_HANDLER. As this commit also introduces a config PPC_DAWR and this config PPC_DAWR is selected with PPC if PPC64, there is no need for any further select in the KVM_BOOK3S_64_HANDLER. Remove an obsolete and unneeded select in config KVM_BOOK3S_64_HANDLER. The issue was identified with ./scripts/checkkconfigsymbols.py. Fixes: a278e7ea608b ("powerpc: Fix compile issue with force DAWR") Signed-off-by: Lukas Bulwahn Reviewed-by: Daniel Axtens Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210819113954.17515-2-lukas.bulwahn@gmail.com --- arch/powerpc/kvm/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index e45644657d49..ff581d70f20c 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -38,7 +38,6 @@ config KVM_BOOK3S_32_HANDLER config KVM_BOOK3S_64_HANDLER bool select KVM_BOOK3S_HANDLER - select PPC_DAWR_FORCE_ENABLE config KVM_BOOK3S_PR_POSSIBLE bool -- cgit v1.2.3-59-g8ed1b From 6cd717fe9b3a787f8e8f9d0bc9b7634a9c104b3e Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Thu, 19 Aug 2021 10:46:54 +1000 Subject: powerpc/tau: Add 'static' storage qualifier to 'tau_work' definition This patch prevents the following sparse warning. arch/powerpc/kernel/tau_6xx.c:199:1: sparse: sparse: symbol 'tau_work' was not declared. Should it be static? Reported-by: kernel test robot Signed-off-by: Finn Thain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/44ab381741916a51e783c4a50d0b186abdd8f280.1629334014.git.fthain@linux-m68k.org --- arch/powerpc/kernel/tau_6xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index b9a047d92ec0..8e83d19fe8fa 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -164,7 +164,7 @@ static void tau_work_func(struct work_struct *work) queue_work(tau_workq, work); } -DECLARE_WORK(tau_work, tau_work_func); +static DECLARE_WORK(tau_work, tau_work_func); /* * setup the TAU -- cgit v1.2.3-59-g8ed1b From f9addd85fbfacf0d155e83dbee8696d6df5ed0c7 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Fri, 13 Aug 2021 13:51:58 +0530 Subject: powerpc/perf/hv-gpci: Fix counter value parsing H_GetPerformanceCounterInfo (0xF080) hcall returns the counter data in the result buffer. Result buffer has specific format defined in the PAPR specification. One of the fields is counter offset and width of the counter data returned. Counter data are returned in a unsigned char array in big endian byte order. To get the final counter data, the values must be left shifted byte at a time. But commit 220a0c609ad17 ("powerpc/perf: Add support for the hv gpci (get performance counter info) interface") made the shifting bitwise and also assumed little endian order. Because of that, hcall counters values are reported incorrectly. In particular this can lead to counters go backwards which messes up the counter prev vs now calculation and leads to huge counter value reporting: #: perf stat -e hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ -C 0 -I 1000 time counts unit events 1.000078854 18,446,744,073,709,535,232 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 2.000213293 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 3.000320107 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 4.000428392 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 5.000537864 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 6.000649087 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 7.000760312 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 8.000865218 16,448 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 9.000978985 18,446,744,073,709,535,232 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 10.001088891 16,384 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 11.001201435 0 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ 12.001307937 18,446,744,073,709,535,232 hv_gpci/system_tlbie_count_and_time_tlbie_instructions_issued/ Fix the shifting logic to correct match the format, ie. read bytes in big endian order. Fixes: e4f226b1580b ("powerpc/perf/hv-gpci: Increase request buffer size") Cc: stable@vger.kernel.org # v4.6+ Reported-by: Nageswara R Sastry Signed-off-by: Kajol Jain Tested-by: Nageswara R Sastry Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210813082158.429023-1-kjain@linux.ibm.com --- arch/powerpc/perf/hv-gpci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c index d48413e28c39..c756228a081f 100644 --- a/arch/powerpc/perf/hv-gpci.c +++ b/arch/powerpc/perf/hv-gpci.c @@ -175,7 +175,7 @@ static unsigned long single_gpci_request(u32 req, u32 starting_index, */ count = 0; for (i = offset; i < offset + length; i++) - count |= arg->bytes[i] << (i - offset); + count |= (u64)(arg->bytes[i]) << ((length - 1 - (i - offset)) * 8); *value = count; out: -- cgit v1.2.3-59-g8ed1b From 4cb266074aa17e9cafed3a92e9f43b161516569f Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 19 Aug 2021 14:56:52 +0200 Subject: powerpc/pseries/vas: Declare pseries_vas_fault_thread_fn() as static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a compile error with W=1. Fixes: 6d0aaf5e0de0 ("powerpc/pseries/vas: Setup IRQ and fault handling") Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210819125656.14498-3-clg@kaod.org --- arch/powerpc/platforms/pseries/vas.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index b5c1cf1bc64d..b043e3936d21 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -184,7 +184,7 @@ static int h_get_nx_fault(u32 winid, u64 buffer) * Note: The hypervisor forwards an interrupt for each fault request. * So one fault CRB to process for each H_GET_NX_FAULT hcall. */ -irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data) +static irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data) { struct pseries_vas_window *txwin = data; struct coprocessor_request_block crb; -- cgit v1.2.3-59-g8ed1b From cb53a93e33e108bfec00a13cd12696e1c0ccc8b6 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 19 Aug 2021 14:56:53 +0200 Subject: KVM: PPC: Book3S PR: Declare kvmppc_handle_exit_pr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a compile error with W=1. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210819125656.14498-4-clg@kaod.org --- arch/powerpc/kvm/book3s.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 740e51def5a5..58391b4b32ed 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -23,7 +23,8 @@ extern int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val); extern int kvmppc_book3s_init_pr(void); -extern void kvmppc_book3s_exit_pr(void); +void kvmppc_book3s_exit_pr(void); +extern int kvmppc_handle_exit_pr(struct kvm_vcpu *vcpu, unsigned int exit_nr); #ifdef CONFIG_PPC_TRANSACTIONAL_MEM extern void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val); -- cgit v1.2.3-59-g8ed1b From b352ddae7b2ccd2cb29f495ca0a7c9b6848b623f Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Thu, 19 Aug 2021 14:56:54 +0200 Subject: KVM: PPC: Book3S PR: Remove unused variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a compile error with W=1. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210819125656.14498-5-clg@kaod.org --- arch/powerpc/kvm/book3s_64_mmu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 26b8b27a3755..feee40cb2ba1 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -196,7 +196,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, hva_t ptegp; u64 pteg[16]; u64 avpn = 0; - u64 v, r; + u64 r; u64 v_val, v_mask; u64 eaddr_mask; int i; @@ -285,7 +285,6 @@ do_second: goto do_second; } - v = be64_to_cpu(pteg[i]); r = be64_to_cpu(pteg[i+1]); pp = (r & HPTE_R_PP) | key; if (r & HPTE_R_PP0) -- cgit v1.2.3-59-g8ed1b From 898a1ef06ad4a2a8e3c9490ce62624fcd1a7b1f8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 20 Aug 2021 09:28:19 +0000 Subject: powerpc/audit: Avoid unneccessary #ifdef in syscall_get_arguments() Use is_32bit_task() which already handles CONFIG_COMPAT. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/ba49cdd574558a0363300c3f6b5b062b397cb071.1629451483.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/syscall.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index ba0f88f3a30d..7ea3c4044186 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -90,10 +90,9 @@ static inline void syscall_get_arguments(struct task_struct *task, unsigned long val, mask = -1UL; unsigned int n = 6; -#ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_32BIT)) + if (is_32bit_task()) mask = 0xffffffff; -#endif + while (n--) { if (n == 0) val = regs->orig_gpr3; -- cgit v1.2.3-59-g8ed1b From 770cec16cdc9d15555e67896dd2214a4fec9a3fa Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 20 Aug 2021 09:39:14 +0000 Subject: powerpc/audit: Simplify syscall_get_arch() Make use of is_32bit_task() and CONFIG_CPU_LITTLE_ENDIAN to simplify syscall_get_arch(). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/4be53b9187a4d8c163968f4d224267e41a7fcc33.1629451479.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/syscall.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index 7ea3c4044186..c60ebd04b2ed 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -115,16 +115,11 @@ static inline void syscall_set_arguments(struct task_struct *task, static inline int syscall_get_arch(struct task_struct *task) { - int arch; - - if (IS_ENABLED(CONFIG_PPC64) && !test_tsk_thread_flag(task, TIF_32BIT)) - arch = AUDIT_ARCH_PPC64; + if (is_32bit_task()) + return AUDIT_ARCH_PPC; + else if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) + return AUDIT_ARCH_PPC64LE; else - arch = AUDIT_ARCH_PPC; - -#ifdef __LITTLE_ENDIAN__ - arch |= __AUDIT_ARCH_LE; -#endif - return arch; + return AUDIT_ARCH_PPC64; } #endif /* _ASM_SYSCALL_H */ -- cgit v1.2.3-59-g8ed1b From a00ea5b6f2bbef8b004b0b7228c61680a50c7c3f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 23 Aug 2021 06:45:20 +0000 Subject: powerpc/syscalls: Remove __NR__exit __NR__exit is nowhere used. On most architectures it was removed by commit 135ab6ec8fda ("[PATCH] remove remaining errno and __KERNEL_SYSCALLS__ references") but not on powerpc. powerpc removed __KERNEL_SYSCALLS__ in commit 3db03b4afb3e ("[PATCH] rename the provided execve functions to kernel_execve"), but __NR__exit was left over. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/6457eb4f327313323ed1f70e540bbb4ddc9178fa.1629701106.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/unistd.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index b541c690a31c..5eb462af6766 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -9,8 +9,6 @@ #define NR_syscalls __NR_syscalls -#define __NR__exit __NR_exit - #ifndef __ASSEMBLY__ #include -- cgit v1.2.3-59-g8ed1b From 3accc0faef081b6813967b34f7d05a3edb855cbd Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Mon, 23 Aug 2021 11:00:38 +0200 Subject: powerpc/prom: Fix unused variable ‘reserve_map’ when CONFIG_PPC32 is not set MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a compile error with W=1. arch/powerpc/kernel/prom.c: In function ‘early_reserve_mem’: arch/powerpc/kernel/prom.c:625:10: error: variable ‘reserve_map’ set but not used [-Werror=unused-but-set-variable] __be64 *reserve_map; ^~~~~~~~~~~ cc1: all warnings being treated as errors Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210823090039.166120-2-clg@kaod.org --- arch/powerpc/kernel/prom.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index f620e04dc9bf..44b2cdc0aae3 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -640,7 +640,9 @@ static void __init early_reserve_mem(void) } #endif /* CONFIG_BLK_DEV_INITRD */ -#ifdef CONFIG_PPC32 + if (!IS_ENABLED(CONFIG_PPC32)) + return; + /* * Handle the case where we might be booting from an old kexec * image that setup the mem_rsvmap as pairs of 32-bit values @@ -661,7 +663,6 @@ static void __init early_reserve_mem(void) } return; } -#endif } #ifdef CONFIG_PPC_TRANSACTIONAL_MEM -- cgit v1.2.3-59-g8ed1b From cc47ad409ba9cc950e9c492c8ba653dabd392148 Mon Sep 17 00:00:00 2001 From: Cédric Le Goater Date: Mon, 23 Aug 2021 11:00:39 +0200 Subject: powerpc/compat_sys: Declare syscalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes a compile error with W=1. Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210823090039.166120-3-clg@kaod.org --- arch/powerpc/include/asm/syscalls.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 398171fdcd9f..7ee66ae5444d 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -6,6 +6,7 @@ #include #include #include +#include struct rtas_args; @@ -18,5 +19,34 @@ asmlinkage long sys_mmap2(unsigned long addr, size_t len, asmlinkage long ppc64_personality(unsigned long personality); asmlinkage long sys_rtas(struct rtas_args __user *uargs); +#ifdef CONFIG_COMPAT +unsigned long compat_sys_mmap2(unsigned long addr, size_t len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff); + +compat_ssize_t compat_sys_pread64(unsigned int fd, char __user *ubuf, compat_size_t count, + u32 reg6, u32 pos1, u32 pos2); + +compat_ssize_t compat_sys_pwrite64(unsigned int fd, const char __user *ubuf, compat_size_t count, + u32 reg6, u32 pos1, u32 pos2); + +compat_ssize_t compat_sys_readahead(int fd, u32 r4, u32 offset1, u32 offset2, u32 count); + +int compat_sys_truncate64(const char __user *path, u32 reg4, + unsigned long len1, unsigned long len2); + +long compat_sys_fallocate(int fd, int mode, u32 offset1, u32 offset2, u32 len1, u32 len2); + +int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long len1, + unsigned long len2); + +long ppc32_fadvise64(int fd, u32 unused, u32 offset1, u32 offset2, + size_t len, int advice); + +long compat_sys_sync_file_range2(int fd, unsigned int flags, + unsigned int offset1, unsigned int offset2, + unsigned int nbytes1, unsigned int nbytes2); +#endif + #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_SYSCALLS_H */ -- cgit v1.2.3-59-g8ed1b From 5d7d6dac8fe99ed59eee2300e4a03370f94d5222 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 5 Aug 2021 18:26:14 -0300 Subject: KVM: PPC: Book3S HV: Fix copy_tofrom_guest routines The __kvmhv_copy_tofrom_guest_radix function was introduced along with nested HV guest support. It uses the platform's Radix MMU quadrants to provide a nested hypervisor with fast access to its nested guests memory (H_COPY_TOFROM_GUEST hypercall). It has also since been added as a fast path for the kvmppc_ld/st routines which are used during instruction emulation. The commit def0bfdbd603 ("powerpc: use probe_user_read() and probe_user_write()") changed the low level copy function from raw_copy_from_user to probe_user_read, which adds a check to access_ok. In powerpc that is: static inline bool __access_ok(unsigned long addr, unsigned long size) { return addr < TASK_SIZE_MAX && size <= TASK_SIZE_MAX - addr; } and TASK_SIZE_MAX is 0x0010000000000000UL for 64-bit, which means that setting the two MSBs of the effective address (which correspond to the quadrant) now cause access_ok to reject the access. This was not caught earlier because the most common code path via kvmppc_ld/st contains a fallback (kvm_read_guest) that is likely to succeed for L1 guests. For nested guests there is no fallback. Another issue is that probe_user_read (now __copy_from_user_nofault) does not return the number of bytes not copied in case of failure, so the destination memory is not being cleared anymore in kvmhv_copy_from_guest_radix: ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n); if (ret > 0) <-- always false! memset(to + (n - ret), 0, ret); This patch fixes both issues by skipping access_ok and open-coding the low level __copy_to/from_user_inatomic. Fixes: def0bfdbd603 ("powerpc: use probe_user_read() and probe_user_write()") Signed-off-by: Fabiano Rosas Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210805212616.2641017-2-farosas@linux.ibm.com --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index b5905ae4377c..44eb7b1ef289 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -65,10 +65,12 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, } isync(); + pagefault_disable(); if (is_load) - ret = copy_from_user_nofault(to, (const void __user *)from, n); + ret = __copy_from_user_inatomic(to, (const void __user *)from, n); else - ret = copy_to_user_nofault((void __user *)to, from, n); + ret = __copy_to_user_inatomic((void __user *)to, from, n); + pagefault_enable(); /* switch the pid first to avoid running host with unallocated pid */ if (quadrant == 1 && pid != old_pid) -- cgit v1.2.3-59-g8ed1b From c232461c0c3b0aca637f0d7658a7f5d0bb393a4e Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 5 Aug 2021 18:26:15 -0300 Subject: KVM: PPC: Book3S HV: Add sanity check to copy_tofrom_guest Both paths into __kvmhv_copy_tofrom_guest_radix ensure that we arrive with an effective address that is smaller than our total addressable space and addresses quadrant 0. - The H_COPY_TOFROM_GUEST hypercall path rejects the call with H_PARAMETER if the effective address has any of the twelve most significant bits set. - The kvmhv_copy_tofrom_guest_radix path clears the top twelve bits before calling the internal function. Although the callers make sure that the effective address is sane, any future use of the function is exposed to a programming error, so add a sanity check. Suggested-by: Nicholas Piggin Signed-off-by: Fabiano Rosas Reviewed-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210805212616.2641017-3-farosas@linux.ibm.com --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 44eb7b1ef289..1b1c9e9e539b 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -44,6 +44,9 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, (to != NULL) ? __pa(to): 0, (from != NULL) ? __pa(from): 0, n); + if (eaddr & (0xFFFUL << 52)) + return ret; + quadrant = 1; if (!pid) quadrant = 2; -- cgit v1.2.3-59-g8ed1b From 0eb596f1e6103ebe122792a425b88c5dc21c4087 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 5 Aug 2021 18:26:16 -0300 Subject: KVM: PPC: Book3S HV: Stop exporting symbols from book3s_64_mmu_radix The book3s_64_mmu_radix.o object is not part of the KVM builtins and all the callers of the exported symbols are in the same kvm-hv.ko module so we should not need to export any symbols. Signed-off-by: Fabiano Rosas Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210805212616.2641017-4-farosas@linux.ibm.com --- arch/powerpc/kvm/book3s_64_mmu_radix.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 1b1c9e9e539b..16359525a40f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -86,7 +86,6 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid, return ret; } -EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix); static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to, void *from, unsigned long n) @@ -122,14 +121,12 @@ long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to, return ret; } -EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix); long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from, unsigned long n) { return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n); } -EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix); int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_pte *gpte, u64 root, -- cgit v1.2.3-59-g8ed1b From 113ec9ccc8049c3772f0eab46b62c5d6654c09f7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 20 Aug 2021 05:16:05 +0000 Subject: powerpc/32: indirect function call use bctrl rather than blrl in ret_from_kernel_thread Copied from commit 89bbe4c798bc ("powerpc/64: indirect function call use bctrl rather than blrl in ret_from_kernel_thread") blrl is not recommended to use as an indirect function call, as it may corrupt the link stack predictor. This is not a performance critical path but this should be fixed for consistency. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/91b1d242525307ceceec7ef6e832bfbacdd4501b.1629436472.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/entry_32.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 0273a1349006..61fdd53cdd9a 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -161,10 +161,10 @@ ret_from_fork: ret_from_kernel_thread: REST_NVGPRS(r1) bl schedule_tail - mtlr r14 + mtctr r14 mr r3,r15 PPC440EP_ERR42 - blrl + bctrl li r3,0 b ret_from_syscall -- cgit v1.2.3-59-g8ed1b From f5007dbf4da729baa850b33a64dc3cc53757bdf8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 24 Aug 2021 07:56:26 +0000 Subject: powerpc/booke: Avoid link stack corruption in several places Use bcl 20,31,+4 instead of bl in order to preserve link stack. See commit c974809a26a1 ("powerpc/vdso: Avoid link stack corruption in __get_datapage()") for details. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e9fbc285eceb720e6c0e032ef47fe8b05f669b48.1629791751.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ppc_asm.h | 2 +- arch/powerpc/kernel/exceptions-64e.S | 6 +++--- arch/powerpc/kernel/fsl_booke_entry_mapping.S | 8 ++++---- arch/powerpc/kernel/head_44x.S | 6 +++--- arch/powerpc/kernel/head_fsl_booke.S | 6 +++--- arch/powerpc/mm/nohash/tlb_low.S | 4 ++-- 6 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index ffe712307e11..1c538a9a11e0 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -260,7 +260,7 @@ n: /* Be careful, this will clobber the lr register. */ #define LOAD_REG_ADDR_PIC(reg, name) \ - bl 0f; \ + bcl 20,31,$+4; \ 0: mflr reg; \ addis reg,reg,(name - 0b)@ha; \ addi reg,reg,(name - 0b)@l; diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 1401787b0b93..7e0943d9f9b0 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -1127,7 +1127,7 @@ found_iprot: * r3 = MAS0_TLBSEL (for the iprot array) * r4 = SPRN_TLBnCFG */ - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r6 /* Make it accessible */ mfmsr r7 rlwinm r5,r7,27,31,31 /* extract MSR[IS] */ @@ -1196,7 +1196,7 @@ skpinv: addi r6,r6,1 /* Increment */ mfmsr r6 xori r6,r6,MSR_IS mtspr SPRN_SRR1,r6 - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r6 addi r6,r6,(2f - 1b) mtspr SPRN_SRR0,r6 @@ -1256,7 +1256,7 @@ skpinv: addi r6,r6,1 /* Increment */ * r4 = MAS0 w/TLBSEL & ESEL for the temp mapping */ /* Now we branch the new virtual address mapped by this entry */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r6 addi r6,r6,(2f - 1b) tovirt(r6,r6) diff --git a/arch/powerpc/kernel/fsl_booke_entry_mapping.S b/arch/powerpc/kernel/fsl_booke_entry_mapping.S index 8bccce6544b5..dedc17fac8f8 100644 --- a/arch/powerpc/kernel/fsl_booke_entry_mapping.S +++ b/arch/powerpc/kernel/fsl_booke_entry_mapping.S @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* 1. Find the index of the entry we're executing in */ - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r6 /* Make it accessible */ mfmsr r7 rlwinm r4,r7,27,31,31 /* extract MSR[IS] */ @@ -85,7 +85,7 @@ skpinv: addi r6,r6,1 /* Increment */ addi r6,r6,10 slw r6,r8,r6 /* convert to mask */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r7 mfspr r8,SPRN_MAS3 @@ -117,7 +117,7 @@ skpinv: addi r6,r6,1 /* Increment */ xori r6,r4,1 slwi r6,r6,5 /* setup new context with other address space */ - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r9 rlwimi r7,r9,0,20,31 addi r7,r7,(2f - 1b) @@ -207,7 +207,7 @@ next_tlb_setup: lis r7,MSR_KERNEL@h ori r7,r7,MSR_KERNEL@l - bl 1f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 1: mflr r9 rlwimi r6,r9,0,20,31 addi r6,r6,(2f - 1b) diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index ddc978a2d381..02d2928d1e01 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -70,7 +70,7 @@ _ENTRY(_start); * address. * r21 will be loaded with the physical runtime address of _stext */ - bl 0f /* Get our runtime address */ + bcl 20,31,$+4 /* Get our runtime address */ 0: mflr r21 /* Make it accessible */ addis r21,r21,(_stext - 0b)@ha addi r21,r21,(_stext - 0b)@l /* Get our current runtime base */ @@ -853,7 +853,7 @@ _GLOBAL(init_cpu_state) wmmucr: mtspr SPRN_MMUCR,r3 /* Put MMUCR */ sync - bl invstr /* Find our address */ + bcl 20,31,$+4 /* Find our address */ invstr: mflr r5 /* Make it accessible */ tlbsx r23,0,r5 /* Find entry we are in */ li r4,0 /* Start at TLB entry 0 */ @@ -1045,7 +1045,7 @@ head_start_47x: sync /* Find the entry we are running from */ - bl 1f + bcl 20,31,$+4 1: mflr r23 tlbsx r23,0,r23 tlbre r24,r23,0 diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S index 0f9642f36b49..dbf3b89e543c 100644 --- a/arch/powerpc/kernel/head_fsl_booke.S +++ b/arch/powerpc/kernel/head_fsl_booke.S @@ -79,7 +79,7 @@ _ENTRY(_start); mr r23,r3 mr r25,r4 - bl 0f + bcl 20,31,$+4 0: mflr r8 addis r3,r8,(is_second_reloc - 0b)@ha lwz r19,(is_second_reloc - 0b)@l(r3) @@ -1132,7 +1132,7 @@ _GLOBAL(switch_to_as1) bne 1b /* Get the tlb entry used by the current running code */ - bl 0f + bcl 20,31,$+4 0: mflr r4 tlbsx 0,r4 @@ -1166,7 +1166,7 @@ _GLOBAL(switch_to_as1) _GLOBAL(restore_to_as0) mflr r0 - bl 0f + bcl 20,31,$+4 0: mflr r9 addi r9,r9,1f - 0b diff --git a/arch/powerpc/mm/nohash/tlb_low.S b/arch/powerpc/mm/nohash/tlb_low.S index 4613bf8e9aae..5add4a51e51f 100644 --- a/arch/powerpc/mm/nohash/tlb_low.S +++ b/arch/powerpc/mm/nohash/tlb_low.S @@ -199,7 +199,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_476_DD2) * Touch enough instruction cache lines to ensure cache hits */ 1: mflr r9 - bl 2f + bcl 20,31,$+4 2: mflr r6 li r7,32 PPC_ICBT(0,R6,R7) /* touch next cache line */ @@ -414,7 +414,7 @@ _GLOBAL(loadcam_multi) * Set up temporary TLB entry that is the same as what we're * running from, but in AS=1. */ - bl 1f + bcl 20,31,$+4 1: mflr r6 tlbsx 0,r8 mfspr r6,SPRN_MAS1 -- cgit v1.2.3-59-g8ed1b From 33e1402435cb9f3021439a15935ea2dc69ec1844 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 24 Aug 2021 07:56:35 +0000 Subject: powerpc: Avoid link stack corruption in misc asm functions bl;mflr is used at several places to get code position. Use bcl 20,31,+4 instead of bl in order to preserve link stack. See commit c974809a26a1 ("powerpc/vdso: Avoid link stack corruption in __get_datapage()") for details. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c6eabb4fb6c156f75d56dcbcc6f243e5ac0fba42.1629791763.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/misc.S | 2 +- arch/powerpc/kernel/misc_32.S | 2 +- arch/powerpc/kernel/misc_64.S | 2 +- arch/powerpc/kernel/reloc_32.S | 2 +- arch/powerpc/kexec/relocate_32.S | 12 ++++++------ 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 5be96feccb55..fb7de3543c03 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -29,7 +29,7 @@ _GLOBAL(reloc_offset) li r3, 0 _GLOBAL(add_reloc_offset) mflr r0 - bl 1f + bcl 20,31,$+4 1: mflr r5 PPC_LL r4,(2f-1b)(r5) subf r5,r4,r5 diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index d8645efff902..e5127b19fec2 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -67,7 +67,7 @@ _GLOBAL(reloc_got2) srwi. r8,r8,2 beqlr mtctr r8 - bl 1f + bcl 20,31,$+4 1: mflr r0 lis r4,1b@ha addi r4,r4,1b@l diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 4b761a18a74d..d38a019b38e1 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -255,7 +255,7 @@ _GLOBAL(scom970_write) * Physical (hardware) cpu id should be in r3. */ _GLOBAL(kexec_wait) - bl 1f + bcl 20,31,$+4 1: mflr r5 addi r5,r5,kexec_flag-1b diff --git a/arch/powerpc/kernel/reloc_32.S b/arch/powerpc/kernel/reloc_32.S index 10e96f3e22fe..0508c14b4c28 100644 --- a/arch/powerpc/kernel/reloc_32.S +++ b/arch/powerpc/kernel/reloc_32.S @@ -30,7 +30,7 @@ R_PPC_RELATIVE = 22 _GLOBAL(relocate) mflr r0 /* Save our LR */ - bl 0f /* Find our current runtime address */ + bcl 20,31,$+4 /* Find our current runtime address */ 0: mflr r12 /* Make it accessible */ mtlr r0 diff --git a/arch/powerpc/kexec/relocate_32.S b/arch/powerpc/kexec/relocate_32.S index 61946c19e07c..cf6e52bdf8d8 100644 --- a/arch/powerpc/kexec/relocate_32.S +++ b/arch/powerpc/kexec/relocate_32.S @@ -93,7 +93,7 @@ wmmucr: * Invalidate all the TLB entries except the current entry * where we are running from */ - bl 0f /* Find our address */ + bcl 20,31,$+4 /* Find our address */ 0: mflr r5 /* Make it accessible */ tlbsx r23,0,r5 /* Find entry we are in */ li r4,0 /* Start at TLB entry 0 */ @@ -158,7 +158,7 @@ write_out: /* Switch to other address space in MSR */ insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */ - bl 1f + bcl 20,31,$+4 1: mflr r8 addi r8, r8, (2f-1b) /* Find the target offset */ @@ -202,7 +202,7 @@ next_tlb: li r9,0 insrwi r9, r7, 1, 26 /* Set MSR[IS] = r7 */ - bl 1f + bcl 20,31,$+4 1: mflr r8 and r8, r8, r11 /* Get our offset within page */ addi r8, r8, (2f-1b) @@ -240,7 +240,7 @@ setup_map_47x: sync /* Find the entry we are running from */ - bl 2f + bcl 20,31,$+4 2: mflr r23 tlbsx r23, 0, r23 tlbre r24, r23, 0 /* TLB Word 0 */ @@ -296,7 +296,7 @@ clear_utlb_entry: /* Update the msr to the new TS */ insrwi r5, r7, 1, 26 - bl 1f + bcl 20,31,$+4 1: mflr r6 addi r6, r6, (2f-1b) @@ -355,7 +355,7 @@ write_utlb: /* Defaults to 256M */ lis r10, 0x1000 - bl 1f + bcl 20,31,$+4 1: mflr r4 addi r4, r4, (2f-1b) /* virtual address of 2f */ -- cgit v1.2.3-59-g8ed1b From 11f27a7fa4ca27935de74e3eb052bdc430d5f8d8 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Jul 2021 16:49:40 +0000 Subject: powerpc/ptdump: Use DEFINE_SHOW_ATTRIBUTE() Use DEFINE_SHOW_ATTRIBUTE() instead of open coding open() and fops. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/b864a92693ca8413ef0b19f0c12065c212899b6e.1625762905.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/ptdump/bats.c | 14 ++------------ arch/powerpc/mm/ptdump/hashpagetable.c | 12 +----------- arch/powerpc/mm/ptdump/ptdump.c | 13 +------------ arch/powerpc/mm/ptdump/segment_regs.c | 12 +----------- 4 files changed, 5 insertions(+), 46 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/ptdump/bats.c b/arch/powerpc/mm/ptdump/bats.c index 8bf7383fb26c..820c119013e4 100644 --- a/arch/powerpc/mm/ptdump/bats.c +++ b/arch/powerpc/mm/ptdump/bats.c @@ -57,7 +57,7 @@ static void bat_show_603(struct seq_file *m, int idx, u32 lower, u32 upper, bool #define BAT_SHOW_603(_m, _n, _l, _u, _d) bat_show_603(_m, _n, mfspr(_l), mfspr(_u), _d) -static int bats_show_603(struct seq_file *m, void *v) +static int bats_show(struct seq_file *m, void *v) { seq_puts(m, "---[ Instruction Block Address Translation ]---\n"); @@ -88,17 +88,7 @@ static int bats_show_603(struct seq_file *m, void *v) return 0; } -static int bats_open(struct inode *inode, struct file *file) -{ - return single_open(file, bats_show_603, NULL); -} - -static const struct file_operations bats_fops = { - .open = bats_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(bats); static int __init bats_init(void) { diff --git a/arch/powerpc/mm/ptdump/hashpagetable.c b/arch/powerpc/mm/ptdump/hashpagetable.c index ad6df9a2e7c8..c7f824d294b2 100644 --- a/arch/powerpc/mm/ptdump/hashpagetable.c +++ b/arch/powerpc/mm/ptdump/hashpagetable.c @@ -526,17 +526,7 @@ static int ptdump_show(struct seq_file *m, void *v) return 0; } -static int ptdump_open(struct inode *inode, struct file *file) -{ - return single_open(file, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump); static int ptdump_init(void) { diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 5062c58b1e5b..349fd8fe173f 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -397,18 +397,7 @@ static int ptdump_show(struct seq_file *m, void *v) return 0; } - -static int ptdump_open(struct inode *inode, struct file *file) -{ - return single_open(file, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(ptdump); static void build_pgtable_complete_mask(void) { diff --git a/arch/powerpc/mm/ptdump/segment_regs.c b/arch/powerpc/mm/ptdump/segment_regs.c index 9223dfb85c51..9df3af8d481f 100644 --- a/arch/powerpc/mm/ptdump/segment_regs.c +++ b/arch/powerpc/mm/ptdump/segment_regs.c @@ -41,17 +41,7 @@ static int sr_show(struct seq_file *m, void *v) return 0; } -static int sr_open(struct inode *inode, struct file *file) -{ - return single_open(file, sr_show, NULL); -} - -static const struct file_operations sr_fops = { - .open = sr_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(sr); static int __init sr_init(void) { -- cgit v1.2.3-59-g8ed1b From 64b87b0c70e0fd28352895cba3c0a9631e0072dd Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Jul 2021 16:49:41 +0000 Subject: powerpc/ptdump: Remove unused 'page_size' parameter note_page_update_state() doesn't use page_size. Remove it. Could also be removed to note_page() but as a following patch will remove all current users of note_page(), just leave it as is for now. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/e2f80d052001155251bfe009c360d0c5d9242c6b.1625762906.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/ptdump/ptdump.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 349fd8fe173f..3eb8732641da 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -189,7 +189,7 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) } static void note_page_update_state(struct pg_state *st, unsigned long addr, - unsigned int level, u64 val, unsigned long page_size) + unsigned int level, u64 val) { u64 flag = val & pg_level[level].mask; u64 pa = val & PTE_RPN_MASK; @@ -213,7 +213,7 @@ static void note_page(struct pg_state *st, unsigned long addr, /* At first no level is set */ if (!st->level) { pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); - note_page_update_state(st, addr, level, val, page_size); + note_page_update_state(st, addr, level, val); /* * Dump the section of virtual memory when: * - the PTE flags from one entry to the next differs. @@ -242,7 +242,7 @@ static void note_page(struct pg_state *st, unsigned long addr, * Address indicates we have passed the end of the * current section of virtual memory */ - note_page_update_state(st, addr, level, val, page_size); + note_page_update_state(st, addr, level, val); } } -- cgit v1.2.3-59-g8ed1b From cf98d2b6eea6a1b2c43f85680ad58fcc3ea9496b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Jul 2021 16:49:42 +0000 Subject: powerpc/ptdump: Reduce level numbers by 1 in note_page() and add p4d level Do the same as commit f8f0d0b6fa20 ("mm: ptdump: reduce level numbers by 1 in note_page()") and add missing p4d level. This will align powerpc to the users of generic ptdump. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/d76495c574132b197b445a1f133755cca4b912a4.1625762906.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/ptdump/8xx.c | 6 ++++-- arch/powerpc/mm/ptdump/book3s64.c | 6 ++++-- arch/powerpc/mm/ptdump/ptdump.c | 17 +++++++++-------- arch/powerpc/mm/ptdump/shared.c | 6 ++++-- 4 files changed, 21 insertions(+), 14 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c index 86da2a669680..fac932eb8f9a 100644 --- a/arch/powerpc/mm/ptdump/8xx.c +++ b/arch/powerpc/mm/ptdump/8xx.c @@ -75,8 +75,10 @@ static const struct flag_info flag_array[] = { }; struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ + { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ diff --git a/arch/powerpc/mm/ptdump/book3s64.c b/arch/powerpc/mm/ptdump/book3s64.c index 14f73868db66..5ad92d9dc5d1 100644 --- a/arch/powerpc/mm/ptdump/book3s64.c +++ b/arch/powerpc/mm/ptdump/book3s64.c @@ -103,8 +103,10 @@ static const struct flag_info flag_array[] = { }; struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ + { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 3eb8732641da..fb531bc64fc5 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -58,7 +58,7 @@ struct pg_state { const struct addr_marker *marker; unsigned long start_address; unsigned long start_pa; - unsigned int level; + int level; u64 current_flags; bool check_wx; unsigned long wx_pages; @@ -188,10 +188,9 @@ static void note_prot_wx(struct pg_state *st, unsigned long addr) st->wx_pages += (addr - st->start_address) / PAGE_SIZE; } -static void note_page_update_state(struct pg_state *st, unsigned long addr, - unsigned int level, u64 val) +static void note_page_update_state(struct pg_state *st, unsigned long addr, int level, u64 val) { - u64 flag = val & pg_level[level].mask; + u64 flag = level >= 0 ? val & pg_level[level].mask : 0; u64 pa = val & PTE_RPN_MASK; st->level = level; @@ -206,12 +205,12 @@ static void note_page_update_state(struct pg_state *st, unsigned long addr, } static void note_page(struct pg_state *st, unsigned long addr, - unsigned int level, u64 val, unsigned long page_size) + int level, u64 val, unsigned long page_size) { - u64 flag = val & pg_level[level].mask; + u64 flag = level >= 0 ? val & pg_level[level].mask : 0; /* At first no level is set */ - if (!st->level) { + if (st->level == -1) { pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); note_page_update_state(st, addr, level, val); /* @@ -383,6 +382,7 @@ static int ptdump_show(struct seq_file *m, void *v) struct pg_state st = { .seq = m, .marker = address_markers, + .level = -1, .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, }; @@ -393,7 +393,7 @@ static int ptdump_show(struct seq_file *m, void *v) /* Traverse kernel page tables */ walk_pagetables(&st); - note_page(&st, 0, 0, 0, 0); + note_page(&st, 0, -1, 0, 0); return 0; } @@ -415,6 +415,7 @@ void ptdump_check_wx(void) struct pg_state st = { .seq = NULL, .marker = address_markers, + .level = -1, .check_wx = true, .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, }; diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c index c005fe041c18..03607ab90c66 100644 --- a/arch/powerpc/mm/ptdump/shared.c +++ b/arch/powerpc/mm/ptdump/shared.c @@ -68,8 +68,10 @@ static const struct flag_info flag_array[] = { }; struct pgtable_level pg_level[5] = { - { - }, { /* pgd */ + { /* pgd */ + .flag = flag_array, + .num = ARRAY_SIZE(flag_array), + }, { /* p4d */ .flag = flag_array, .num = ARRAY_SIZE(flag_array), }, { /* pud */ -- cgit v1.2.3-59-g8ed1b From e084728393a58e7fdafeee2e6b20e0faff09b557 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 8 Jul 2021 16:49:43 +0000 Subject: powerpc/ptdump: Convert powerpc to GENERIC_PTDUMP This patch converts powerpc to the generic PTDUMP implementation. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/03166d569526be70214fe9370a7bad219d2f41c8.1625762907.git.christophe.leroy@csgroup.eu --- arch/powerpc/Kconfig | 2 + arch/powerpc/Kconfig.debug | 30 --------- arch/powerpc/mm/Makefile | 2 +- arch/powerpc/mm/mmu_decl.h | 2 +- arch/powerpc/mm/ptdump/Makefile | 9 ++- arch/powerpc/mm/ptdump/ptdump.c | 146 ++++++++++------------------------------ 6 files changed, 47 insertions(+), 144 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d01e3401581d..ec866085ff8b 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -123,6 +123,7 @@ config PPC select ARCH_HAS_COPY_MC if PPC64 select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE + select ARCH_HAS_DEBUG_WX if STRICT_KERNEL_RWX select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_DMA_MAP_DIRECT if PPC_PSERIES select ARCH_HAS_ELF_RANDOMIZE @@ -182,6 +183,7 @@ config PPC select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_PCI_IOMAP if PCI + select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 205cd77f321f..192f0ed0097f 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -365,36 +365,6 @@ config FAIL_IOMMU If you are unsure, say N. -config PPC_PTDUMP - bool "Export kernel pagetable layout to userspace via debugfs" - depends on DEBUG_KERNEL && DEBUG_FS - help - This option exports the state of the kernel pagetables to a - debugfs file. This is only useful for kernel developers who are - working in architecture specific areas of the kernel - probably - not a good idea to enable this feature in a production kernel. - - If you are unsure, say N. - -config PPC_DEBUG_WX - bool "Warn on W+X mappings at boot" - depends on PPC_PTDUMP && STRICT_KERNEL_RWX - help - Generate a warning if any W+X mappings are found at boot. - - This is useful for discovering cases where the kernel is leaving - W+X mappings after applying NX, as such mappings are a security risk. - - Note that even if the check fails, your kernel is possibly - still fine, as W+X mappings are not a security hole in - themselves, what they do is that they make the exploitation - of other unfixed kernel bugs easier. - - There is no runtime or memory usage effect of this option - once the kernel has booted up - it's a one time check. - - If in doubt, say "Y". - config PPC_FAST_ENDIAN_SWITCH bool "Deprecated fast endian-switch syscall" depends on DEBUG_KERNEL && PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index eae4ec2988fc..df8172da2301 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -18,5 +18,5 @@ obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o -obj-$(CONFIG_PPC_PTDUMP) += ptdump/ +obj-$(CONFIG_PTDUMP_CORE) += ptdump/ obj-$(CONFIG_KASAN) += kasan/ diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h index 7dac910c0b21..dd1cabc2ea0f 100644 --- a/arch/powerpc/mm/mmu_decl.h +++ b/arch/powerpc/mm/mmu_decl.h @@ -180,7 +180,7 @@ static inline void mmu_mark_rodata_ro(void) { } void __init mmu_mapin_immr(void); #endif -#ifdef CONFIG_PPC_DEBUG_WX +#ifdef CONFIG_DEBUG_WX void ptdump_check_wx(void); #else static inline void ptdump_check_wx(void) { } diff --git a/arch/powerpc/mm/ptdump/Makefile b/arch/powerpc/mm/ptdump/Makefile index 712762be3cb1..4050cbb55acf 100644 --- a/arch/powerpc/mm/ptdump/Makefile +++ b/arch/powerpc/mm/ptdump/Makefile @@ -5,5 +5,10 @@ obj-y += ptdump.o obj-$(CONFIG_4xx) += shared.o obj-$(CONFIG_PPC_8xx) += 8xx.o obj-$(CONFIG_PPC_BOOK3E_MMU) += shared.o -obj-$(CONFIG_PPC_BOOK3S_32) += shared.o bats.o segment_regs.o -obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o hashpagetable.o +obj-$(CONFIG_PPC_BOOK3S_32) += shared.o +obj-$(CONFIG_PPC_BOOK3S_64) += book3s64.o + +ifdef CONFIG_PTDUMP_DEBUGFS +obj-$(CONFIG_PPC_BOOK3S_32) += bats.o segment_regs.o +obj-$(CONFIG_PPC_BOOK3S_64) += hashpagetable.o +endif diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index fb531bc64fc5..2d80d775d15e 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,7 @@ * */ struct pg_state { + struct ptdump_state ptdump; struct seq_file *seq; const struct addr_marker *marker; unsigned long start_address; @@ -102,6 +104,11 @@ static struct addr_marker address_markers[] = { { -1, NULL }, }; +static struct ptdump_range ptdump_range[] __ro_after_init = { + {TASK_SIZE_MAX, ~0UL}, + {0, 0} +}; + #define pt_dump_seq_printf(m, fmt, args...) \ ({ \ if (m) \ @@ -204,10 +211,10 @@ static void note_page_update_state(struct pg_state *st, unsigned long addr, int } } -static void note_page(struct pg_state *st, unsigned long addr, - int level, u64 val, unsigned long page_size) +static void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) { u64 flag = level >= 0 ? val & pg_level[level].mask : 0; + struct pg_state *st = container_of(pt_st, struct pg_state, ptdump); /* At first no level is set */ if (st->level == -1) { @@ -245,94 +252,6 @@ static void note_page(struct pg_state *st, unsigned long addr, } } -static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) -{ - pte_t *pte = pte_offset_kernel(pmd, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PTE; i++, pte++) { - addr = start + i * PAGE_SIZE; - note_page(st, addr, 4, pte_val(*pte), PAGE_SIZE); - - } -} - -static void walk_hugepd(struct pg_state *st, hugepd_t *phpd, unsigned long start, - int pdshift, int level) -{ -#ifdef CONFIG_ARCH_HAS_HUGEPD - unsigned int i; - int shift = hugepd_shift(*phpd); - int ptrs_per_hpd = pdshift - shift > 0 ? 1 << (pdshift - shift) : 1; - - if (start & ((1 << shift) - 1)) - return; - - for (i = 0; i < ptrs_per_hpd; i++) { - unsigned long addr = start + (i << shift); - pte_t *pte = hugepte_offset(*phpd, addr, pdshift); - - note_page(st, addr, level + 1, pte_val(*pte), 1 << shift); - } -#endif -} - -static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) -{ - pmd_t *pmd = pmd_offset(pud, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { - addr = start + i * PMD_SIZE; - if (!pmd_none(*pmd) && !pmd_is_leaf(*pmd)) - /* pmd exists */ - walk_pte(st, pmd, addr); - else - note_page(st, addr, 3, pmd_val(*pmd), PMD_SIZE); - } -} - -static void walk_pud(struct pg_state *st, p4d_t *p4d, unsigned long start) -{ - pud_t *pud = pud_offset(p4d, 0); - unsigned long addr; - unsigned int i; - - for (i = 0; i < PTRS_PER_PUD; i++, pud++) { - addr = start + i * PUD_SIZE; - if (!pud_none(*pud) && !pud_is_leaf(*pud)) - /* pud exists */ - walk_pmd(st, pud, addr); - else - note_page(st, addr, 2, pud_val(*pud), PUD_SIZE); - } -} - -static void walk_pagetables(struct pg_state *st) -{ - unsigned int i; - unsigned long addr = st->start_address & PGDIR_MASK; - pgd_t *pgd = pgd_offset_k(addr); - - /* - * Traverse the linux pagetable structure and dump pages that are in - * the hash pagetable. - */ - for (i = pgd_index(addr); i < PTRS_PER_PGD; i++, pgd++, addr += PGDIR_SIZE) { - p4d_t *p4d = p4d_offset(pgd, 0); - - if (p4d_none(*p4d) || p4d_is_leaf(*p4d)) - note_page(st, addr, 1, p4d_val(*p4d), PGDIR_SIZE); - else if (is_hugepd(__hugepd(p4d_val(*p4d)))) - walk_hugepd(st, (hugepd_t *)p4d, addr, PGDIR_SHIFT, 1); - else - /* p4d exists */ - walk_pud(st, p4d, addr); - } -} - static void populate_markers(void) { int i = 0; @@ -383,17 +302,14 @@ static int ptdump_show(struct seq_file *m, void *v) .seq = m, .marker = address_markers, .level = -1, - .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, + .ptdump = { + .note_page = note_page, + .range = ptdump_range, + } }; -#ifdef CONFIG_PPC64 - if (!radix_enabled()) - st.start_address = KERN_VIRT_START; -#endif - /* Traverse kernel page tables */ - walk_pagetables(&st); - note_page(&st, 0, -1, 0, 0); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); return 0; } @@ -409,23 +325,24 @@ static void build_pgtable_complete_mask(void) pg_level[i].mask |= pg_level[i].flag[j].mask; } -#ifdef CONFIG_PPC_DEBUG_WX +#ifdef CONFIG_DEBUG_WX void ptdump_check_wx(void) { struct pg_state st = { .seq = NULL, - .marker = address_markers, + .marker = (struct addr_marker[]) { + { 0, NULL}, + { -1, NULL}, + }, .level = -1, .check_wx = true, - .start_address = IS_ENABLED(CONFIG_PPC64) ? PAGE_OFFSET : TASK_SIZE, + .ptdump = { + .note_page = note_page, + .range = ptdump_range, + } }; -#ifdef CONFIG_PPC64 - if (!radix_enabled()) - st.start_address = KERN_VIRT_START; -#endif - - walk_pagetables(&st); + ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); if (st.wx_pages) pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found\n", @@ -435,12 +352,21 @@ void ptdump_check_wx(void) } #endif -static int ptdump_init(void) +static int __init ptdump_init(void) { +#ifdef CONFIG_PPC64 + if (!radix_enabled()) + ptdump_range[0].start = KERN_VIRT_START; + else + ptdump_range[0].start = PAGE_OFFSET; +#endif + populate_markers(); build_pgtable_complete_mask(); - debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, - &ptdump_fops); + + if (IS_ENABLED(CONFIG_PTDUMP_DEBUGFS)) + debugfs_create_file("kernel_page_tables", 0400, NULL, NULL, &ptdump_fops); + return 0; } device_initcall(ptdump_init); -- cgit v1.2.3-59-g8ed1b From 316389e904f968d24d44cd96a6d171ee1ef269cf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 25 Jun 2021 10:58:33 +0000 Subject: powerpc/syscalls: Simplify do_mmap2() When shift is nul, operations remain valid so no test needed. And 'ret' is unnecessary. And use IS_ALIGNED() to check alignment, that's more clear. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/373ec500f386374bc5735007df3d3869eac47be1.1624618701.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/syscalls.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index bf4ae0f0e36c..825931e400df 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -41,20 +41,13 @@ static inline long do_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off, int shift) { - long ret = -EINVAL; - if (!arch_validate_prot(prot, addr)) - goto out; + return -EINVAL; - if (shift) { - if (off & ((1 << shift) - 1)) - goto out; - off >>= shift; - } + if (!IS_ALIGNED(off, 1 << shift)) + return -EINVAL; - ret = ksys_mmap_pgoff(addr, len, prot, flags, fd, off); -out: - return ret; + return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> shift); } SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len, -- cgit v1.2.3-59-g8ed1b From 19e932eb6ea47f4f37513eb2ae0daee19117765c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 17 Aug 2021 16:00:14 +0000 Subject: powerpc/ptrace: Make user_mode() common to PPC32 and PPC64 Today we have: #ifdef __powerpc64__ #define user_mode(regs) ((((regs)->msr) >> MSR_PR_LG) & 0x1) #else #define user_mode(regs) (((regs)->msr & MSR_PR) != 0) #endif With ppc64_defconfig, we get: if (!user_mode(regs)) 14b4: e9 3e 01 08 ld r9,264(r30) 14b8: 71 29 40 00 andi. r9,r9,16384 14bc: 41 82 07 a4 beq 1c60 <.emulate_instruction+0x7d0> If taking the ppc32 definition of user_mode(), the exact same code is generated for ppc64_defconfig. So, only keep one version of user_mode(), preferably the one not using MSR_PR_LG which should be kept internal to reg.h. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/000a28c51808bbd802b505af42d2cb316c2be7d3.1629216000.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ptrace.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 3e5d470a6155..333979cf5d1e 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -181,11 +181,7 @@ static inline unsigned long frame_pointer(struct pt_regs *regs) return 0; } -#ifdef __powerpc64__ -#define user_mode(regs) ((((regs)->msr) >> MSR_PR_LG) & 0x1) -#else #define user_mode(regs) (((regs)->msr & MSR_PR) != 0) -#endif #define force_successful_syscall_return() \ do { \ -- cgit v1.2.3-59-g8ed1b From 9401f4e46cf6965e23738f70e149172344a01eef Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 2 Mar 2021 08:48:11 +0000 Subject: powerpc: Use lwarx/ldarx directly instead of PPC_LWARX/LDARX macros Force the eh flag at 0 on PPC32. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/1fc81f07cabebb875b963e295408cc3dd38c8d85.1614674882.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/asm-compat.h | 4 ++-- arch/powerpc/include/asm/atomic.h | 4 ++-- arch/powerpc/include/asm/bitops.h | 8 ++++---- arch/powerpc/include/asm/ppc-opcode.h | 2 -- arch/powerpc/include/asm/simple_spinlock.h | 6 +++--- 5 files changed, 11 insertions(+), 13 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/asm-compat.h b/arch/powerpc/include/asm/asm-compat.h index 19b70c5b5f18..2b736d9fbb1b 100644 --- a/arch/powerpc/include/asm/asm-compat.h +++ b/arch/powerpc/include/asm/asm-compat.h @@ -17,7 +17,7 @@ #define PPC_LONG stringify_in_c(.8byte) #define PPC_LONG_ALIGN stringify_in_c(.balign 8) #define PPC_TLNEI stringify_in_c(tdnei) -#define PPC_LLARX(t, a, b, eh) PPC_LDARX(t, a, b, eh) +#define PPC_LLARX stringify_in_c(ldarx) #define PPC_STLCX stringify_in_c(stdcx.) #define PPC_CNTLZL stringify_in_c(cntlzd) #define PPC_MTOCRF(FXM, RS) MTOCRF((FXM), RS) @@ -50,7 +50,7 @@ #define PPC_LONG stringify_in_c(.long) #define PPC_LONG_ALIGN stringify_in_c(.balign 4) #define PPC_TLNEI stringify_in_c(twnei) -#define PPC_LLARX(t, a, b, eh) PPC_LWARX(t, a, b, eh) +#define PPC_LLARX stringify_in_c(lwarx) #define PPC_STLCX stringify_in_c(stwcx.) #define PPC_CNTLZL stringify_in_c(cntlzw) #define PPC_MTOCRF stringify_in_c(mtcrf) diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h index a1732a79e92a..6a53ef178bfd 100644 --- a/arch/powerpc/include/asm/atomic.h +++ b/arch/powerpc/include/asm/atomic.h @@ -207,7 +207,7 @@ arch_atomic_try_cmpxchg_lock(atomic_t *v, int *old, int new) int r, o = *old; __asm__ __volatile__ ( -"1:\t" PPC_LWARX(%0,0,%2,1) " # atomic_try_cmpxchg_acquire \n" +"1: lwarx %0,0,%2,%5 # atomic_try_cmpxchg_acquire \n" " cmpw 0,%0,%3 \n" " bne- 2f \n" " stwcx. %4,0,%2 \n" @@ -215,7 +215,7 @@ arch_atomic_try_cmpxchg_lock(atomic_t *v, int *old, int new) "\t" PPC_ACQUIRE_BARRIER " \n" "2: \n" : "=&r" (r), "+m" (v->counter) - : "r" (&v->counter), "r" (o), "r" (new) + : "r" (&v->counter), "r" (o), "r" (new), "i" (IS_ENABLED(CONFIG_PPC64) ? 1 : 0) : "cr0", "memory"); if (unlikely(r != o)) diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h index 299ab33505a6..11847b6a244e 100644 --- a/arch/powerpc/include/asm/bitops.h +++ b/arch/powerpc/include/asm/bitops.h @@ -70,7 +70,7 @@ static inline void fn(unsigned long mask, \ unsigned long *p = (unsigned long *)_p; \ __asm__ __volatile__ ( \ prefix \ -"1:" PPC_LLARX(%0,0,%3,0) "\n" \ +"1:" PPC_LLARX "%0,0,%3,0\n" \ stringify_in_c(op) "%0,%0,%2\n" \ PPC_STLCX "%0,0,%3\n" \ "bne- 1b\n" \ @@ -115,13 +115,13 @@ static inline unsigned long fn( \ unsigned long *p = (unsigned long *)_p; \ __asm__ __volatile__ ( \ prefix \ -"1:" PPC_LLARX(%0,0,%3,eh) "\n" \ +"1:" PPC_LLARX "%0,0,%3,%4\n" \ stringify_in_c(op) "%1,%0,%2\n" \ PPC_STLCX "%1,0,%3\n" \ "bne- 1b\n" \ postfix \ : "=&r" (old), "=&r" (t) \ - : "r" (mask), "r" (p) \ + : "r" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \ : "cc", "memory"); \ return (old & mask); \ } @@ -170,7 +170,7 @@ clear_bit_unlock_return_word(int nr, volatile unsigned long *addr) __asm__ __volatile__ ( PPC_RELEASE_BARRIER -"1:" PPC_LLARX(%0,0,%3,0) "\n" +"1:" PPC_LLARX "%0,0,%3,0\n" "andc %1,%0,%2\n" PPC_STLCX "%1,0,%3\n" "bne- 1b\n" diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index bede76dd3db7..baea657bc868 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -576,8 +576,6 @@ #define PPC_DIVDE(t, a, b) stringify_in_c(.long PPC_RAW_DIVDE(t, a, b)) #define PPC_DIVDEU(t, a, b) stringify_in_c(.long PPC_RAW_DIVDEU(t, a, b)) #define PPC_LQARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LQARX(t, a, b, eh)) -#define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LDARX(t, a, b, eh)) -#define PPC_LWARX(t, a, b, eh) stringify_in_c(.long PPC_RAW_LWARX(t, a, b, eh)) #define PPC_STQCX(t, a, b) stringify_in_c(.long PPC_RAW_STQCX(t, a, b)) #define PPC_MADDHD(t, a, b, c) stringify_in_c(.long PPC_RAW_MADDHD(t, a, b, c)) #define PPC_MADDHDU(t, a, b, c) stringify_in_c(.long PPC_RAW_MADDHDU(t, a, b, c)) diff --git a/arch/powerpc/include/asm/simple_spinlock.h b/arch/powerpc/include/asm/simple_spinlock.h index 552f325412cc..8985791a2ba5 100644 --- a/arch/powerpc/include/asm/simple_spinlock.h +++ b/arch/powerpc/include/asm/simple_spinlock.h @@ -51,7 +51,7 @@ static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock) token = LOCK_TOKEN; __asm__ __volatile__( -"1: " PPC_LWARX(%0,0,%2,1) "\n\ +"1: lwarx %0,0,%2,1\n\ cmpwi 0,%0,0\n\ bne- 2f\n\ stwcx. %1,0,%2\n\ @@ -179,7 +179,7 @@ static inline long __arch_read_trylock(arch_rwlock_t *rw) long tmp; __asm__ __volatile__( -"1: " PPC_LWARX(%0,0,%1,1) "\n" +"1: lwarx %0,0,%1,1\n" __DO_SIGN_EXTEND " addic. %0,%0,1\n\ ble- 2f\n" @@ -203,7 +203,7 @@ static inline long __arch_write_trylock(arch_rwlock_t *rw) token = WRLOCK_TOKEN; __asm__ __volatile__( -"1: " PPC_LWARX(%0,0,%2,1) "\n\ +"1: lwarx %0,0,%2,1\n\ cmpwi 0,%0,0\n\ bne- 2f\n" " stwcx. %1,0,%2\n\ -- cgit v1.2.3-59-g8ed1b From fd42b7b09c602c904452c0c3e5955ca21d8e387a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:35 +1000 Subject: KVM: PPC: Book3S HV: Initialise vcpu MSR with MSR_ME It is possible to create a VCPU without setting the MSR before running it, which results in a warning in kvmhv_vcpu_entry_p9() that MSR_ME is not set. This is pretty harmless because the MSR_ME bit is added to HSRR1 before HRFID to guest, and a normal qemu guest doesn't hit it. Initialise the vcpu MSR with MSR_ME set. Reported-by: Alexey Kardashevskiy Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210811160134.904987-2-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 085fb8ecbf68..c402eb0276b0 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2684,6 +2684,7 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) spin_lock_init(&vcpu->arch.vpa_update_lock); spin_lock_init(&vcpu->arch.tbacct_lock); vcpu->arch.busy_preempt = TB_NIL; + vcpu->arch.shregs.msr = MSR_ME; vcpu->arch.intr_msr = MSR_SF | MSR_ME; /* -- cgit v1.2.3-59-g8ed1b From daac40e8d7a63ab8608132a7cfce411986feac32 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:36 +1000 Subject: KVM: PPC: Book3S HV: Remove TM emulation from POWER7/8 path TM fake-suspend emulation is only used by POWER9. Remove it from the old code path. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210811160134.904987-3-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 42 --------------------------------- 1 file changed, 42 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 8dd437d7a2c6..75079397c2a5 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1088,12 +1088,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE beq kvmppc_hisi -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - /* For softpatch interrupt, go off and do TM instruction emulation */ - cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH - beq kvmppc_tm_emul -#endif - /* See if this is a leftover HDEC interrupt */ cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER bne 2f @@ -1599,42 +1593,6 @@ maybe_reenter_guest: blt deliver_guest_interrupt b guest_exit_cont -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM -/* - * Softpatch interrupt for transactional memory emulation cases - * on POWER9 DD2.2. This is early in the guest exit path - we - * haven't saved registers or done a treclaim yet. - */ -kvmppc_tm_emul: - /* Save instruction image in HEIR */ - mfspr r3, SPRN_HEIR - stw r3, VCPU_HEIR(r9) - - /* - * The cases we want to handle here are those where the guest - * is in real suspend mode and is trying to transition to - * transactional mode. - */ - lbz r0, HSTATE_FAKE_SUSPEND(r13) - cmpwi r0, 0 /* keep exiting guest if in fake suspend */ - bne guest_exit_cont - rldicl r3, r11, 64 - MSR_TS_S_LG, 62 - cmpwi r3, 1 /* or if not in suspend state */ - bne guest_exit_cont - - /* Call C code to do the emulation */ - mr r3, r9 - bl kvmhv_p9_tm_emulation_early - nop - ld r9, HSTATE_KVM_VCPU(r13) - li r12, BOOK3S_INTERRUPT_HV_SOFTPATCH - cmpwi r3, 0 - beq guest_exit_cont /* continue exiting if not handled */ - ld r10, VCPU_PC(r9) - ld r11, VCPU_MSR(r9) - b fast_interrupt_c_return /* go back to guest if handled */ -#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ - /* * Check whether an HDSI is an HPTE not found fault or something else. * If it is an HPTE not found fault that is due to the guest accessing -- cgit v1.2.3-59-g8ed1b From 4782e0cd0d184d727ad3b0cfe20d1d44d9f98239 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:37 +1000 Subject: KVM: PPC: Book3S HV P9: Fixes for TM softpatch interrupt NIP The softpatch interrupt sets HSRR0 to the faulting instruction +4, so it should subtract 4 for the faulting instruction address in the case it is a TM softpatch interrupt (the instruction was not executed) and it was not emulated. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210811160134.904987-4-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_tm.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c index cc90b8b82329..e7c36f8bf205 100644 --- a/arch/powerpc/kvm/book3s_hv_tm.c +++ b/arch/powerpc/kvm/book3s_hv_tm.c @@ -46,6 +46,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) u64 newmsr, bescr; int ra, rs; + /* + * The TM softpatch interrupt sets NIP to the instruction following + * the faulting instruction, which is not executed. Rewind nip to the + * faulting instruction so it looks like a normal synchronous + * interrupt, then update nip in the places where the instruction is + * emulated. + */ + vcpu->arch.regs.nip -= 4; + /* * rfid, rfebb, and mtmsrd encode bit 31 = 0 since it's a reserved bit * in these instructions, so masking bit 31 out doesn't change these @@ -67,7 +76,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) (newmsr & MSR_TM))); newmsr = sanitize_msr(newmsr); vcpu->arch.shregs.msr = newmsr; - vcpu->arch.cfar = vcpu->arch.regs.nip - 4; + vcpu->arch.cfar = vcpu->arch.regs.nip; vcpu->arch.regs.nip = vcpu->arch.shregs.srr0; return RESUME_GUEST; @@ -100,7 +109,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) vcpu->arch.bescr = bescr; msr = (msr & ~MSR_TS_MASK) | MSR_TS_T; vcpu->arch.shregs.msr = msr; - vcpu->arch.cfar = vcpu->arch.regs.nip - 4; + vcpu->arch.cfar = vcpu->arch.regs.nip; vcpu->arch.regs.nip = vcpu->arch.ebbrr; return RESUME_GUEST; @@ -116,6 +125,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) newmsr = (newmsr & ~MSR_LE) | (msr & MSR_LE); newmsr = sanitize_msr(newmsr); vcpu->arch.shregs.msr = newmsr; + vcpu->arch.regs.nip += 4; return RESUME_GUEST; /* ignore bit 31, see comment above */ @@ -152,6 +162,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) msr = (msr & ~MSR_TS_MASK) | MSR_TS_S; } vcpu->arch.shregs.msr = msr; + vcpu->arch.regs.nip += 4; return RESUME_GUEST; /* ignore bit 31, see comment above */ @@ -189,6 +200,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29); vcpu->arch.shregs.msr &= ~MSR_TS_MASK; + vcpu->arch.regs.nip += 4; return RESUME_GUEST; /* ignore bit 31, see comment above */ @@ -220,6 +232,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29); vcpu->arch.shregs.msr = msr | MSR_TS_S; + vcpu->arch.regs.nip += 4; return RESUME_GUEST; } -- cgit v1.2.3-59-g8ed1b From d82b392d9b3556b63e3f9916cf057ea847e173a9 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:38 +1000 Subject: KVM: PPC: Book3S HV Nested: Fix TM softpatch HFAC interrupt emulation Have the TM softpatch emulation code set up the HFAC interrupt and return -1 in case an instruction was executed with HFSCR bits clear, and have the interrupt exit handler fall through to the HFAC handler. When the L0 is running a nested guest, this ensures the HFAC interrupt is correctly passed up to the L1. The "direct guest" exit handler will turn these into PROGILL program interrupts so functionality in practice will be unchanged. But it's possible an L1 would want to handle these in a different way. Also rearrange the FAC interrupt emulation code to match the HFAC format while here (mainly, adding the FSCR_INTR_CAUSE mask). Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210811160134.904987-5-npiggin@gmail.com --- arch/powerpc/include/asm/reg.h | 3 ++- arch/powerpc/kvm/book3s_hv.c | 35 ++++++++++++++++++++------------ arch/powerpc/kvm/book3s_hv_tm.c | 44 ++++++++++++++++++++++------------------- 3 files changed, 48 insertions(+), 34 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index be85cf156a1f..e9d27265253b 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -415,6 +415,7 @@ #define FSCR_TAR __MASK(FSCR_TAR_LG) #define FSCR_EBB __MASK(FSCR_EBB_LG) #define FSCR_DSCR __MASK(FSCR_DSCR_LG) +#define FSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */ #define SPRN_HFSCR 0xbe /* HV=1 Facility Status & Control Register */ #define HFSCR_PREFIX __MASK(FSCR_PREFIX_LG) #define HFSCR_MSGP __MASK(FSCR_MSGP_LG) @@ -426,7 +427,7 @@ #define HFSCR_DSCR __MASK(FSCR_DSCR_LG) #define HFSCR_VECVSX __MASK(FSCR_VECVSX_LG) #define HFSCR_FP __MASK(FSCR_FP_LG) -#define HFSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */ +#define HFSCR_INTR_CAUSE FSCR_INTR_CAUSE #define SPRN_TAR 0x32f /* Target Address Register */ #define SPRN_LPCR 0x13E /* LPAR Control Register */ #define LPCR_VPM0 ASM_CONST(0x8000000000000000) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c402eb0276b0..e7df8a3ca62c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1679,6 +1679,21 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, r = RESUME_GUEST; } break; + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case BOOK3S_INTERRUPT_HV_SOFTPATCH: + /* + * This occurs for various TM-related instructions that + * we need to emulate on POWER9 DD2.2. We have already + * handled the cases where the guest was in real-suspend + * mode and was transitioning to transactional state. + */ + r = kvmhv_p9_tm_emulation(vcpu); + if (r != -1) + break; + fallthrough; /* go to facility unavailable handler */ +#endif + /* * This occurs if the guest (kernel or userspace), does something that * is prohibited by HFSCR. @@ -1697,18 +1712,6 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, } break; -#ifdef CONFIG_PPC_TRANSACTIONAL_MEM - case BOOK3S_INTERRUPT_HV_SOFTPATCH: - /* - * This occurs for various TM-related instructions that - * we need to emulate on POWER9 DD2.2. We have already - * handled the cases where the guest was in real-suspend - * mode and was transitioning to transactional state. - */ - r = kvmhv_p9_tm_emulation(vcpu); - break; -#endif - case BOOK3S_INTERRUPT_HV_RM_HARD: r = RESUME_PASSTHROUGH; break; @@ -1811,9 +1814,15 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) * mode and was transitioning to transactional state. */ r = kvmhv_p9_tm_emulation(vcpu); - break; + if (r != -1) + break; + fallthrough; /* go to facility unavailable handler */ #endif + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: + r = RESUME_HOST; + break; + case BOOK3S_INTERRUPT_HV_RM_HARD: vcpu->arch.trap = 0; r = RESUME_GUEST; diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c index e7c36f8bf205..866cadd70094 100644 --- a/arch/powerpc/kvm/book3s_hv_tm.c +++ b/arch/powerpc/kvm/book3s_hv_tm.c @@ -88,14 +88,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) } /* check EBB facility is available */ if (!(vcpu->arch.hfscr & HFSCR_EBB)) { - /* generate an illegal instruction interrupt */ - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - return RESUME_GUEST; + vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE; + vcpu->arch.hfscr |= (u64)FSCR_EBB_LG << 56; + vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL; + return -1; /* rerun host interrupt handler */ } if ((msr & MSR_PR) && !(vcpu->arch.fscr & FSCR_EBB)) { /* generate a facility unavailable interrupt */ - vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) | - ((u64)FSCR_EBB_LG << 56); + vcpu->arch.fscr &= ~FSCR_INTR_CAUSE; + vcpu->arch.fscr |= (u64)FSCR_EBB_LG << 56; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); return RESUME_GUEST; } @@ -138,14 +139,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) } /* check for TM disabled in the HFSCR or MSR */ if (!(vcpu->arch.hfscr & HFSCR_TM)) { - /* generate an illegal instruction interrupt */ - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - return RESUME_GUEST; + vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE; + vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56; + vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL; + return -1; /* rerun host interrupt handler */ } if (!(msr & MSR_TM)) { /* generate a facility unavailable interrupt */ - vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) | - ((u64)FSCR_TM_LG << 56); + vcpu->arch.fscr &= ~FSCR_INTR_CAUSE; + vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); return RESUME_GUEST; @@ -169,14 +171,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) case (PPC_INST_TRECLAIM & PO_XOP_OPCODE_MASK): /* check for TM disabled in the HFSCR or MSR */ if (!(vcpu->arch.hfscr & HFSCR_TM)) { - /* generate an illegal instruction interrupt */ - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - return RESUME_GUEST; + vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE; + vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56; + vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL; + return -1; /* rerun host interrupt handler */ } if (!(msr & MSR_TM)) { /* generate a facility unavailable interrupt */ - vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) | - ((u64)FSCR_TM_LG << 56); + vcpu->arch.fscr &= ~FSCR_INTR_CAUSE; + vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); return RESUME_GUEST; @@ -208,14 +211,15 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) /* XXX do we need to check for PR=0 here? */ /* check for TM disabled in the HFSCR or MSR */ if (!(vcpu->arch.hfscr & HFSCR_TM)) { - /* generate an illegal instruction interrupt */ - kvmppc_core_queue_program(vcpu, SRR1_PROGILL); - return RESUME_GUEST; + vcpu->arch.hfscr &= ~HFSCR_INTR_CAUSE; + vcpu->arch.hfscr |= (u64)FSCR_TM_LG << 56; + vcpu->arch.trap = BOOK3S_INTERRUPT_H_FAC_UNAVAIL; + return -1; /* rerun host interrupt handler */ } if (!(msr & MSR_TM)) { /* generate a facility unavailable interrupt */ - vcpu->arch.fscr = (vcpu->arch.fscr & ~(0xffull << 56)) | - ((u64)FSCR_TM_LG << 56); + vcpu->arch.fscr &= ~FSCR_INTR_CAUSE; + vcpu->arch.fscr |= (u64)FSCR_TM_LG << 56; kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); return RESUME_GUEST; -- cgit v1.2.3-59-g8ed1b From 7487cabc7ed2f7716bf304e4e97c57fd995cf70e Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 12 Aug 2021 02:00:39 +1000 Subject: KVM: PPC: Book3S HV Nested: Sanitise vcpu registers As one of the arguments of the H_ENTER_NESTED hypercall, the nested hypervisor (L1) prepares a structure containing the values of various hypervisor-privileged registers with which it wants the nested guest (L2) to run. Since the nested HV runs in supervisor mode it needs the host to write to these registers. To stop a nested HV manipulating this mechanism and using a nested guest as a proxy to access a facility that has been made unavailable to it, we have a routine that sanitises the values of the HV registers before copying them into the nested guest's vcpu struct. However, when coming out of the guest the values are copied as they were back into L1 memory, which means that any sanitisation we did during guest entry will be exposed to L1 after H_ENTER_NESTED returns. This patch alters this sanitisation to have effect on the vcpu->arch registers directly before entering and after exiting the guest, leaving the structure that is copied back into L1 unchanged (except when we really want L1 to access the value, e.g the Cause bits of HFSCR). Signed-off-by: Fabiano Rosas Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Reviewed-by: Nicholas Piggin Link: https://lore.kernel.org/r/20210811160134.904987-6-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_nested.c | 94 ++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 48 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 898f942eb198..1eb4e989edc7 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -105,7 +105,6 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, struct kvmppc_vcore *vc = vcpu->arch.vcore; hr->dpdes = vc->dpdes; - hr->hfscr = vcpu->arch.hfscr; hr->purr = vcpu->arch.purr; hr->spurr = vcpu->arch.spurr; hr->ic = vcpu->arch.ic; @@ -128,55 +127,17 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, case BOOK3S_INTERRUPT_H_INST_STORAGE: hr->asdr = vcpu->arch.fault_gpa; break; + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: + hr->hfscr = ((~HFSCR_INTR_CAUSE & hr->hfscr) | + (HFSCR_INTR_CAUSE & vcpu->arch.hfscr)); + break; case BOOK3S_INTERRUPT_H_EMUL_ASSIST: hr->heir = vcpu->arch.emul_inst; break; } } -/* - * This can result in some L0 HV register state being leaked to an L1 - * hypervisor when the hv_guest_state is copied back to the guest after - * being modified here. - * - * There is no known problem with such a leak, and in many cases these - * register settings could be derived by the guest by observing behaviour - * and timing, interrupts, etc., but it is an issue to consider. - */ -static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) -{ - struct kvmppc_vcore *vc = vcpu->arch.vcore; - u64 mask; - - /* - * Don't let L1 change LPCR bits for the L2 except these: - */ - mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | - LPCR_LPES | LPCR_MER; - - /* - * Additional filtering is required depending on hardware - * and configuration. - */ - hr->lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm, - (vc->lpcr & ~mask) | (hr->lpcr & mask)); - - /* - * Don't let L1 enable features for L2 which we've disabled for L1, - * but preserve the interrupt cause field. - */ - hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr); - - /* Don't let data address watchpoint match in hypervisor state */ - hr->dawrx0 &= ~DAWRX_HYP; - hr->dawrx1 &= ~DAWRX_HYP; - - /* Don't let completed instruction address breakpt match in HV state */ - if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) - hr->ciabr &= ~CIABR_PRIV; -} - -static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) +static void restore_hv_regs(struct kvm_vcpu *vcpu, const struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -288,6 +249,43 @@ static int kvmhv_write_guest_state_and_regs(struct kvm_vcpu *vcpu, sizeof(struct pt_regs)); } +static void load_l2_hv_regs(struct kvm_vcpu *vcpu, + const struct hv_guest_state *l2_hv, + const struct hv_guest_state *l1_hv, u64 *lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 mask; + + restore_hv_regs(vcpu, l2_hv); + + /* + * Don't let L1 change LPCR bits for the L2 except these: + */ + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | + LPCR_LPES | LPCR_MER; + + /* + * Additional filtering is required depending on hardware + * and configuration. + */ + *lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm, + (vc->lpcr & ~mask) | (*lpcr & mask)); + + /* + * Don't let L1 enable features for L2 which we've disabled for L1, + * but preserve the interrupt cause field. + */ + vcpu->arch.hfscr = l2_hv->hfscr & (HFSCR_INTR_CAUSE | vcpu->arch.hfscr); + + /* Don't let data address watchpoint match in hypervisor state */ + vcpu->arch.dawrx0 = l2_hv->dawrx0 & ~DAWRX_HYP; + vcpu->arch.dawrx1 = l2_hv->dawrx1 & ~DAWRX_HYP; + + /* Don't let completed instruction address breakpt match in HV state */ + if ((l2_hv->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) + vcpu->arch.ciabr = l2_hv->ciabr & ~CIABR_PRIV; +} + long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) { long int err, r; @@ -296,7 +294,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) struct hv_guest_state l2_hv = {0}, saved_l1_hv; struct kvmppc_vcore *vc = vcpu->arch.vcore; u64 hv_ptr, regs_ptr; - u64 hdec_exp; + u64 hdec_exp, lpcr; s64 delta_purr, delta_spurr, delta_ic, delta_vtb; if (vcpu->kvm->arch.l1_ptcr == 0) @@ -369,8 +367,8 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) /* Guest must always run with ME enabled, HV disabled. */ vcpu->arch.shregs.msr = (vcpu->arch.regs.msr | MSR_ME) & ~MSR_HV; - sanitise_hv_regs(vcpu, &l2_hv); - restore_hv_regs(vcpu, &l2_hv); + lpcr = l2_hv.lpcr; + load_l2_hv_regs(vcpu, &l2_hv, &saved_l1_hv, &lpcr); vcpu->arch.ret = RESUME_GUEST; vcpu->arch.trap = 0; @@ -380,7 +378,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) r = RESUME_HOST; break; } - r = kvmhv_run_single_vcpu(vcpu, hdec_exp, l2_hv.lpcr); + r = kvmhv_run_single_vcpu(vcpu, hdec_exp, lpcr); } while (is_kvmppc_resume_guest(r)); /* save L2 state for return */ -- cgit v1.2.3-59-g8ed1b From 8b210a880b35ba75eb42b79dfd65e369c1feb119 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:40 +1000 Subject: KVM: PPC: Book3S HV Nested: Make nested HFSCR state accessible When the L0 runs a nested L2, there are several permutations of HFSCR that can be relevant. The HFSCR that the L1 vcpu L1 requested, the HFSCR that the L1 vcpu may use, and the HFSCR that is actually being used to run the L2. The L1 requested HFSCR is not accessible outside the nested hcall handler, so copy that into a new kvm_nested_guest.hfscr field. The permitted HFSCR is taken from the HFSCR that the L1 runs with, which is also not accessible while the hcall is being made. Move this into a new kvm_vcpu_arch.hfscr_permitted field. These will be used by the next patch to improve facility handling for nested guests, and later by facility demand faulting patches. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210811160134.904987-7-npiggin@gmail.com --- arch/powerpc/include/asm/kvm_book3s_64.h | 1 + arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/powerpc/kvm/book3s_hv.c | 2 ++ arch/powerpc/kvm/book3s_hv_nested.c | 5 +++-- 4 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index eaf3a562bf1e..19b6942c6969 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -39,6 +39,7 @@ struct kvm_nested_guest { pgd_t *shadow_pgtable; /* our page table for this guest */ u64 l1_gr_to_hr; /* L1's addr of part'n-scoped table */ u64 process_table; /* process table entry for this guest */ + u64 hfscr; /* HFSCR that the L1 requested for this nested guest */ long refcnt; /* number of pointers to this struct */ struct mutex tlb_lock; /* serialize page faults and tlbies */ struct kvm_nested_guest *next; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 9f52f282b1aa..a779f7849cfb 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -811,6 +811,8 @@ struct kvm_vcpu_arch { u32 online; + u64 hfscr_permitted; /* A mask of permitted HFSCR facilities */ + /* For support of nested guests */ struct kvm_nested_guest *nested; u32 nested_vcpu_id; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e7df8a3ca62c..9d31267d26b3 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -2715,6 +2715,8 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu) if (cpu_has_feature(CPU_FTR_TM_COMP)) vcpu->arch.hfscr |= HFSCR_TM; + vcpu->arch.hfscr_permitted = vcpu->arch.hfscr; + kvmppc_mmu_book3s_hv_init(vcpu); vcpu->arch.state = KVMPPC_VCPU_NOTREADY; diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 1eb4e989edc7..5ad5014c6f68 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -272,10 +272,10 @@ static void load_l2_hv_regs(struct kvm_vcpu *vcpu, (vc->lpcr & ~mask) | (*lpcr & mask)); /* - * Don't let L1 enable features for L2 which we've disabled for L1, + * Don't let L1 enable features for L2 which we don't allow for L1, * but preserve the interrupt cause field. */ - vcpu->arch.hfscr = l2_hv->hfscr & (HFSCR_INTR_CAUSE | vcpu->arch.hfscr); + vcpu->arch.hfscr = l2_hv->hfscr & (HFSCR_INTR_CAUSE | vcpu->arch.hfscr_permitted); /* Don't let data address watchpoint match in hypervisor state */ vcpu->arch.dawrx0 = l2_hv->dawrx0 & ~DAWRX_HYP; @@ -362,6 +362,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) /* set L1 state to L2 state */ vcpu->arch.nested = l2; vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token; + l2->hfscr = l2_hv.hfscr; vcpu->arch.regs = l2_regs; /* Guest must always run with ME enabled, HV disabled. */ -- cgit v1.2.3-59-g8ed1b From 7c3ded5735141ff4d049747c9f76672a8b737c49 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 12 Aug 2021 02:00:41 +1000 Subject: KVM: PPC: Book3S HV Nested: Stop forwarding all HFUs to L1 If the nested hypervisor has no access to a facility because it has been disabled by the host, it should also not be able to see the Hypervisor Facility Unavailable that arises from one of its guests trying to access the facility. This patch turns a HFU that happened in L2 into a Hypervisor Emulation Assistance interrupt and forwards it to L1 for handling. The ones that happened because L1 explicitly disabled the facility for L2 are still let through, along with the corresponding Cause bits in the HFSCR. Signed-off-by: Fabiano Rosas [np: move handling into kvmppc_handle_nested_exit] Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210811160134.904987-8-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9d31267d26b3..15e2ba93678e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1730,6 +1730,7 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) { + struct kvm_nested_guest *nested = vcpu->arch.nested; int r; int srcu_idx; @@ -1819,9 +1820,35 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu) fallthrough; /* go to facility unavailable handler */ #endif - case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: - r = RESUME_HOST; + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: { + u64 cause = vcpu->arch.hfscr >> 56; + + /* + * Only pass HFU interrupts to the L1 if the facility is + * permitted but disabled by the L1's HFSCR, otherwise + * the interrupt does not make sense to the L1 so turn + * it into a HEAI. + */ + if (!(vcpu->arch.hfscr_permitted & (1UL << cause)) || + (nested->hfscr & (1UL << cause))) { + vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST; + + /* + * If the fetch failed, return to guest and + * try executing it again. + */ + r = kvmppc_get_last_inst(vcpu, INST_GENERIC, + &vcpu->arch.emul_inst); + if (r != EMULATE_DONE) + r = RESUME_GUEST; + else + r = RESUME_HOST; + } else { + r = RESUME_HOST; + } + break; + } case BOOK3S_INTERRUPT_HV_RM_HARD: vcpu->arch.trap = 0; -- cgit v1.2.3-59-g8ed1b From f2e29db156523bf08a0524e0f4306a641912c6d9 Mon Sep 17 00:00:00 2001 From: Fabiano Rosas Date: Thu, 12 Aug 2021 02:00:42 +1000 Subject: KVM: PPC: Book3S HV Nested: save_hv_return_state does not require trap argument vcpu is already anargument so vcpu->arch.trap can be used directly. Signed-off-by: Fabiano Rosas Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210811160134.904987-9-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv_nested.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 5ad5014c6f68..ed8a2c9f5629 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -99,7 +99,7 @@ static void byteswap_hv_regs(struct hv_guest_state *hr) hr->dawrx1 = swab64(hr->dawrx1); } -static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, +static void save_hv_return_state(struct kvm_vcpu *vcpu, struct hv_guest_state *hr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -118,7 +118,7 @@ static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap, hr->pidr = vcpu->arch.pid; hr->cfar = vcpu->arch.cfar; hr->ppr = vcpu->arch.ppr; - switch (trap) { + switch (vcpu->arch.trap) { case BOOK3S_INTERRUPT_H_DATA_STORAGE: hr->hdar = vcpu->arch.fault_dar; hr->hdsisr = vcpu->arch.fault_dsisr; @@ -389,7 +389,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) delta_spurr = vcpu->arch.spurr - l2_hv.spurr; delta_ic = vcpu->arch.ic - l2_hv.ic; delta_vtb = vc->vtb - l2_hv.vtb; - save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv); + save_hv_return_state(vcpu, &l2_hv); /* restore L1 state */ vcpu->arch.nested = NULL; -- cgit v1.2.3-59-g8ed1b From 1782663897945a5cf28e564ba5eed730098e9aa4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:43 +1000 Subject: KVM: PPC: Book3S HV Nested: Reflect guest PMU in-use to L0 when guest SPRs are live After the L1 saves its PMU SPRs but before loading the L2's PMU SPRs, switch the pmcregs_in_use field in the L1 lppaca to the value advertised by the L2 in its VPA. On the way out of the L2, set it back after saving the L2 PMU registers (if they were in-use). This transfers the PMU liveness indication between the L1 and L2 at the points where the registers are not live. This fixes the nested HV bug for which a workaround was added to the L0 HV by commit 63279eeb7f93a ("KVM: PPC: Book3S HV: Always save guest pmu for guest capable of nesting"), which explains the problem in detail. That workaround is no longer required for guests that include this bug fix. Fixes: 360cae313702 ("KVM: PPC: Book3S HV: Nested guest entry via hypercall") Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20210811160134.904987-10-npiggin@gmail.com --- arch/powerpc/include/asm/pmc.h | 7 +++++++ arch/powerpc/kvm/book3s_hv.c | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/pmc.h b/arch/powerpc/include/asm/pmc.h index c6bbe9778d3c..3c09109e708e 100644 --- a/arch/powerpc/include/asm/pmc.h +++ b/arch/powerpc/include/asm/pmc.h @@ -34,6 +34,13 @@ static inline void ppc_set_pmu_inuse(int inuse) #endif } +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +static inline int ppc_get_pmu_inuse(void) +{ + return get_paca()->pmcregs_in_use; +} +#endif + extern void power4_enable_pmcs(void); #else /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 15e2ba93678e..938b0948478f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -3891,6 +3892,18 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true); +#ifdef CONFIG_PPC_PSERIES + if (kvmhv_on_pseries()) { + barrier(); + if (vcpu->arch.vpa.pinned_addr) { + struct lppaca *lp = vcpu->arch.vpa.pinned_addr; + get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use; + } else { + get_lppaca()->pmcregs_in_use = 1; + } + barrier(); + } +#endif kvmhv_load_guest_pmu(vcpu); msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX); @@ -4025,6 +4038,13 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, save_pmu |= nesting_enabled(vcpu->kvm); kvmhv_save_guest_pmu(vcpu, save_pmu); +#ifdef CONFIG_PPC_PSERIES + if (kvmhv_on_pseries()) { + barrier(); + get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse(); + barrier(); + } +#endif vc->entry_exit_map = 0x101; vc->in_guest = 0; -- cgit v1.2.3-59-g8ed1b From 0c8fb653d487d2873f5eefdcaf4cecff4e103828 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 12 Aug 2021 02:00:44 +1000 Subject: powerpc/64s: Remove WORT SPR from POWER9/10 This register is not architected and not implemented in POWER9 or 10, it just reads back zeroes for compatibility. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Reviewed-by: Fabiano Rosas Link: https://lore.kernel.org/r/20210811160134.904987-11-npiggin@gmail.com --- arch/powerpc/kvm/book3s_hv.c | 3 --- arch/powerpc/platforms/powernv/idle.c | 2 -- 2 files changed, 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 938b0948478f..dc8f18b0aa75 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3767,7 +3767,6 @@ static void load_spr_state(struct kvm_vcpu *vcpu) mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); mtspr(SPRN_BESCR, vcpu->arch.bescr); - mtspr(SPRN_WORT, vcpu->arch.wort); mtspr(SPRN_TIDR, vcpu->arch.tid); mtspr(SPRN_AMR, vcpu->arch.amr); mtspr(SPRN_UAMOR, vcpu->arch.uamor); @@ -3794,7 +3793,6 @@ static void store_spr_state(struct kvm_vcpu *vcpu) vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); vcpu->arch.bescr = mfspr(SPRN_BESCR); - vcpu->arch.wort = mfspr(SPRN_WORT); vcpu->arch.tid = mfspr(SPRN_TIDR); vcpu->arch.amr = mfspr(SPRN_AMR); vcpu->arch.uamor = mfspr(SPRN_UAMOR); @@ -3826,7 +3824,6 @@ static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu, struct p9_host_os_sprs *host_os_sprs) { mtspr(SPRN_PSPB, 0); - mtspr(SPRN_WORT, 0); mtspr(SPRN_UAMOR, 0); mtspr(SPRN_DSCR, host_os_sprs->dscr); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 528a7e0cf83a..180baecad914 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -667,7 +667,6 @@ static unsigned long power9_idle_stop(unsigned long psscr) sprs.purr = mfspr(SPRN_PURR); sprs.spurr = mfspr(SPRN_SPURR); sprs.dscr = mfspr(SPRN_DSCR); - sprs.wort = mfspr(SPRN_WORT); sprs.ciabr = mfspr(SPRN_CIABR); sprs.mmcra = mfspr(SPRN_MMCRA); @@ -785,7 +784,6 @@ core_woken: mtspr(SPRN_PURR, sprs.purr); mtspr(SPRN_SPURR, sprs.spurr); mtspr(SPRN_DSCR, sprs.dscr); - mtspr(SPRN_WORT, sprs.wort); mtspr(SPRN_CIABR, sprs.ciabr); mtspr(SPRN_MMCRA, sprs.mmcra); -- cgit v1.2.3-59-g8ed1b From b1643084d164cea0c107a39bcdf0119fc52619af Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Wed, 18 Aug 2021 22:45:54 +0530 Subject: powerpc/perf: Use stack siar instead of mfspr Minor optimization in the 'perf_instruction_pointer' function code by making use of stack siar instead of mfspr. Fixes: 75382aa72f06 ("powerpc/perf: Move code to select SIAR or pt_regs into perf_read_regs") Signed-off-by: Kajol Jain Tested-by: Nageswara R Sastry Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210818171556.36912-1-kjain@linux.ibm.com --- arch/powerpc/perf/core-book3s.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 91203ed9d0ff..3a782a35100d 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2269,7 +2269,7 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs) else return regs->nip; } else if (use_siar && siar_valid(regs)) - return mfspr(SPRN_SIAR) + perf_ip_adjust(regs); + return siar + perf_ip_adjust(regs); else if (use_siar) return 0; // no valid instruction pointer else -- cgit v1.2.3-59-g8ed1b From cc90c6742ef5b438f4cb86029d7a794bd0a44a06 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Wed, 18 Aug 2021 22:45:55 +0530 Subject: powerpc/perf: Drop the case of returning 0 as instruction pointer Drop the case of returning 0 as instruction pointer since kernel never executes at 0 and userspace almost never does either. Fixes: e6878835ac47 ("powerpc/perf: Sample only if SIAR-Valid bit is set in P7+") Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210818171556.36912-2-kjain@linux.ibm.com --- arch/powerpc/perf/core-book3s.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 3a782a35100d..9bb466d2d99e 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2270,8 +2270,6 @@ unsigned long perf_instruction_pointer(struct pt_regs *regs) return regs->nip; } else if (use_siar && siar_valid(regs)) return siar + perf_ip_adjust(regs); - else if (use_siar) - return 0; // no valid instruction pointer else return regs->nip; } -- cgit v1.2.3-59-g8ed1b From 3c69a5f22223fa3e312689ec218b5059784d49d7 Mon Sep 17 00:00:00 2001 From: Kajol Jain Date: Wed, 18 Aug 2021 22:45:56 +0530 Subject: powerpc/perf: Fix the check for SIAR value Incase of random sampling, there can be scenarios where Sample Instruction Address Register(SIAR) may not latch to the sampled instruction and could result in the value of 0. In these scenarios it is preferred to return regs->nip. These corner cases are seen in the previous generation (p9) also. Patch adds the check for SIAR value along with regs_use_siar and siar_valid checks so that the function will return regs->nip incase SIAR is zero. Patch drops the code under PPMU_P10_DD1 flag check which handles SIAR 0 case only for Power10 DD1. Fixes: 2ca13a4cc56c9 ("powerpc/perf: Use regs->nip when SIAR is zero") Signed-off-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210818171556.36912-3-kjain@linux.ibm.com --- arch/powerpc/perf/core-book3s.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 9bb466d2d99e..73e62e9b179b 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2260,15 +2260,9 @@ unsigned long perf_misc_flags(struct pt_regs *regs) */ unsigned long perf_instruction_pointer(struct pt_regs *regs) { - bool use_siar = regs_use_siar(regs); unsigned long siar = mfspr(SPRN_SIAR); - if (ppmu && (ppmu->flags & PPMU_P10_DD1)) { - if (siar) - return siar; - else - return regs->nip; - } else if (use_siar && siar_valid(regs)) + if (regs_use_siar(regs) && siar_valid(regs) && siar) return siar + perf_ip_adjust(regs); else return regs->nip; -- cgit v1.2.3-59-g8ed1b From 4f8e78c0757e3c5a65d9d8ac76e2434c71a78f5a Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Sat, 7 Aug 2021 09:02:36 +0800 Subject: powerpc: Add esr as a synonym for pt_regs.dsisr Create an anonymous union for dsisr and esr regsiters, we can reference esr to get the exception detail when CONFIG_4xx=y or CONFIG_BOOKE=y. Otherwise, reference dsisr. This makes code more clear. Signed-off-by: Xiongwei Song [mpe: Reword commit title] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210807010239.416055-2-sxwjean@me.com --- arch/powerpc/include/asm/ptrace.h | 5 ++++- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace/ptrace.c | 2 ++ arch/powerpc/kernel/traps.c | 2 +- arch/powerpc/platforms/44x/machine_check.c | 4 ++-- arch/powerpc/platforms/4xx/machine_check.c | 2 +- 6 files changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 333979cf5d1e..ab58939653db 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -44,7 +44,10 @@ struct pt_regs #endif unsigned long trap; unsigned long dar; - unsigned long dsisr; + union { + unsigned long dsisr; + unsigned long esr; + }; unsigned long result; }; }; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 185beb290580..f74af8f9133c 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) trap == INTERRUPT_DATA_STORAGE || trap == INTERRUPT_ALIGNMENT) { if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->dsisr); + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->esr); else pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index 0a0a33eb0d28..a222fd4d6334 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -375,6 +375,8 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, dar)); BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != offsetof(struct user_pt_regs, dsisr)); + BUILD_BUG_ON(offsetof(struct pt_regs, esr) != + offsetof(struct user_pt_regs, dsisr)); BUILD_BUG_ON(offsetof(struct pt_regs, result) != offsetof(struct user_pt_regs, result)); diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 51d4f5faf425..5463d201d465 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -562,7 +562,7 @@ static inline int check_io_access(struct pt_regs *regs) #ifdef CONFIG_PPC_ADV_DEBUG_REGS /* On 4xx, the reason for the machine check or program exception is in the ESR. */ -#define get_reason(regs) ((regs)->dsisr) +#define get_reason(regs) ((regs)->esr) #define REASON_FP ESR_FP #define REASON_ILLEGAL (ESR_PIL | ESR_PUO) #define REASON_PRIVILEGED ESR_PPR diff --git a/arch/powerpc/platforms/44x/machine_check.c b/arch/powerpc/platforms/44x/machine_check.c index a5c898bb9bab..5d19daacd78a 100644 --- a/arch/powerpc/platforms/44x/machine_check.c +++ b/arch/powerpc/platforms/44x/machine_check.c @@ -11,7 +11,7 @@ int machine_check_440A(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; printk("Machine check in kernel mode.\n"); if (reason & ESR_IMCP){ @@ -48,7 +48,7 @@ int machine_check_440A(struct pt_regs *regs) #ifdef CONFIG_PPC_47x int machine_check_47x(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; u32 mcsr; printk(KERN_ERR "Machine check in kernel mode.\n"); diff --git a/arch/powerpc/platforms/4xx/machine_check.c b/arch/powerpc/platforms/4xx/machine_check.c index a71c29892a91..a905da1d6f41 100644 --- a/arch/powerpc/platforms/4xx/machine_check.c +++ b/arch/powerpc/platforms/4xx/machine_check.c @@ -10,7 +10,7 @@ int machine_check_4xx(struct pt_regs *regs) { - unsigned long reason = regs->dsisr; + unsigned long reason = regs->esr; if (reason & ESR_IMCP) { printk("Instruction"); -- cgit v1.2.3-59-g8ed1b From cfa47772ca8d53d7a6c9b331a7f6e7c4c9827214 Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Sat, 7 Aug 2021 09:02:37 +0800 Subject: powerpc/64e: Get esr offset with _ESR macro Use _ESR to get the offset of esr register in pr_regs for 64e cpus. Signed-off-by: Xiongwei Song Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210807010239.416055-3-sxwjean@me.com --- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/kernel/exceptions-64e.S | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index a47eefa09bcb..f4ebc435fd78 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -287,6 +287,7 @@ int main(void) STACK_PT_REGS_OFFSET(_XER, xer); STACK_PT_REGS_OFFSET(_DAR, dar); STACK_PT_REGS_OFFSET(_DSISR, dsisr); + STACK_PT_REGS_OFFSET(_ESR, esr); STACK_PT_REGS_OFFSET(ORIG_GPR3, orig_gpr3); STACK_PT_REGS_OFFSET(RESULT, result); STACK_PT_REGS_OFFSET(_TRAP, trap); @@ -298,7 +299,6 @@ int main(void) * we use them to hold SRR0 and SRR1. */ STACK_PT_REGS_OFFSET(_DEAR, dar); - STACK_PT_REGS_OFFSET(_ESR, dsisr); #else /* CONFIG_PPC64 */ STACK_PT_REGS_OFFSET(SOFTE, softe); STACK_PT_REGS_OFFSET(_PPR, ppr); diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 7e0943d9f9b0..bf8822a5ae0e 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -546,7 +546,7 @@ __end_interrupts: mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR std r14,_DAR(r1) - std r15,_DSISR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x300) @@ -559,7 +559,7 @@ __end_interrupts: li r15,0 mr r14,r10 std r14,_DAR(r1) - std r15,_DSISR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x400) @@ -576,7 +576,7 @@ __end_interrupts: mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR std r14,_DAR(r1) - std r15,_DSISR(r1) + std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) EXCEPTION_COMMON(0x600) @@ -587,7 +587,7 @@ __end_interrupts: NORMAL_EXCEPTION_PROLOG(0x700, BOOKE_INTERRUPT_PROGRAM, PROLOG_ADDITION_1REG) mfspr r14,SPRN_ESR - std r14,_DSISR(r1) + std r14,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) EXCEPTION_COMMON(0x700) addi r3,r1,STACK_FRAME_OVERHEAD @@ -1058,7 +1058,7 @@ bad_stack_book3e: mfspr r10,SPRN_DEAR mfspr r11,SPRN_ESR std r10,_DAR(r1) - std r11,_DSISR(r1) + std r11,_ESR(r1) std r0,GPR0(r1); /* save r0 in stackframe */ \ std r2,GPR2(r1); /* save r2 in stackframe */ \ SAVE_4GPRS(3, r1); /* save r3 - r6 in stackframe */ \ -- cgit v1.2.3-59-g8ed1b From 4872cbd0ca35ca5b20d52e2539e7e1950f126e7b Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Sat, 7 Aug 2021 09:02:38 +0800 Subject: powerpc: Add dear as a synonym for pt_regs.dar register Create an anonymous union for dar and dear regsiters, we can reference dear to get the effective address when CONFIG_4xx=y or CONFIG_BOOKE=y. Otherwise, reference dar. This makes code more clear. Signed-off-by: Xiongwei Song [mpe: Reword commit title] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210807010239.416055-4-sxwjean@me.com --- arch/powerpc/include/asm/ptrace.h | 5 ++++- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/ptrace/ptrace.c | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index ab58939653db..5114ddba6f16 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -43,7 +43,10 @@ struct pt_regs unsigned long mq; #endif unsigned long trap; - unsigned long dar; + union { + unsigned long dar; + unsigned long dear; + }; union { unsigned long dsisr; unsigned long esr; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index f74af8f9133c..50436b52c213 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1499,7 +1499,7 @@ static void __show_regs(struct pt_regs *regs) trap == INTERRUPT_DATA_STORAGE || trap == INTERRUPT_ALIGNMENT) { if (IS_ENABLED(CONFIG_4xx) || IS_ENABLED(CONFIG_BOOKE)) - pr_cont("DEAR: "REG" ESR: "REG" ", regs->dar, regs->esr); + pr_cont("DEAR: "REG" ESR: "REG" ", regs->dear, regs->esr); else pr_cont("DAR: "REG" DSISR: %08lx ", regs->dar, regs->dsisr); } diff --git a/arch/powerpc/kernel/ptrace/ptrace.c b/arch/powerpc/kernel/ptrace/ptrace.c index a222fd4d6334..7c7093c17c45 100644 --- a/arch/powerpc/kernel/ptrace/ptrace.c +++ b/arch/powerpc/kernel/ptrace/ptrace.c @@ -373,6 +373,8 @@ void __init pt_regs_check(void) offsetof(struct user_pt_regs, trap)); BUILD_BUG_ON(offsetof(struct pt_regs, dar) != offsetof(struct user_pt_regs, dar)); + BUILD_BUG_ON(offsetof(struct pt_regs, dear) != + offsetof(struct user_pt_regs, dar)); BUILD_BUG_ON(offsetof(struct pt_regs, dsisr) != offsetof(struct user_pt_regs, dsisr)); BUILD_BUG_ON(offsetof(struct pt_regs, esr) != -- cgit v1.2.3-59-g8ed1b From d9db6e420268b2d561731468a31f0b15e2e9a145 Mon Sep 17 00:00:00 2001 From: Xiongwei Song Date: Sat, 7 Aug 2021 09:02:39 +0800 Subject: powerpc/64e: Get dear offset with _DEAR macro Use _DEAR to get the offset of dear register in pr_regs for 64e cpus. Signed-off-by: Xiongwei Song Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210807010239.416055-5-sxwjean@me.com --- arch/powerpc/kernel/asm-offsets.c | 13 +++---------- arch/powerpc/kernel/exceptions-64e.S | 8 ++++---- 2 files changed, 7 insertions(+), 14 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index f4ebc435fd78..8357d5fcd09e 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -286,23 +286,16 @@ int main(void) STACK_PT_REGS_OFFSET(_CCR, ccr); STACK_PT_REGS_OFFSET(_XER, xer); STACK_PT_REGS_OFFSET(_DAR, dar); + STACK_PT_REGS_OFFSET(_DEAR, dear); STACK_PT_REGS_OFFSET(_DSISR, dsisr); STACK_PT_REGS_OFFSET(_ESR, esr); STACK_PT_REGS_OFFSET(ORIG_GPR3, orig_gpr3); STACK_PT_REGS_OFFSET(RESULT, result); STACK_PT_REGS_OFFSET(_TRAP, trap); -#ifndef CONFIG_PPC64 - /* - * The PowerPC 400-class & Book-E processors have neither the DAR - * nor the DSISR SPRs. Hence, we overload them to hold the similar - * DEAR and ESR SPRs for such processors. For critical interrupts - * we use them to hold SRR0 and SRR1. - */ - STACK_PT_REGS_OFFSET(_DEAR, dar); -#else /* CONFIG_PPC64 */ +#ifdef CONFIG_PPC64 STACK_PT_REGS_OFFSET(SOFTE, softe); STACK_PT_REGS_OFFSET(_PPR, ppr); -#endif /* CONFIG_PPC64 */ +#endif #ifdef CONFIG_PPC_PKEY STACK_PT_REGS_OFFSET(STACK_REGS_AMR, amr); diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index bf8822a5ae0e..711c66b76df1 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -545,7 +545,7 @@ __end_interrupts: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR - std r14,_DAR(r1) + std r14,_DEAR(r1) std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) @@ -558,7 +558,7 @@ __end_interrupts: PROLOG_ADDITION_2REGS) li r15,0 mr r14,r10 - std r14,_DAR(r1) + std r14,_DEAR(r1) std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) @@ -575,7 +575,7 @@ __end_interrupts: PROLOG_ADDITION_2REGS) mfspr r14,SPRN_DEAR mfspr r15,SPRN_ESR - std r14,_DAR(r1) + std r14,_DEAR(r1) std r15,_ESR(r1) ld r14,PACA_EXGEN+EX_R14(r13) ld r15,PACA_EXGEN+EX_R15(r13) @@ -1057,7 +1057,7 @@ bad_stack_book3e: std r11,_CCR(r1) mfspr r10,SPRN_DEAR mfspr r11,SPRN_ESR - std r10,_DAR(r1) + std r10,_DEAR(r1) std r11,_ESR(r1) std r0,GPR0(r1); /* save r0 in stackframe */ \ std r2,GPR2(r1); /* save r2 in stackframe */ \ -- cgit v1.2.3-59-g8ed1b From 133c17a1788d68c9fff59d5f724a4ba14647a16d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 23 Aug 2021 08:24:20 +0000 Subject: powerpc: Remove MSR_PR check in interrupt_exit_{user/kernel}_prepare() In those hot functions that are called at every interrupt, any saved cycle is worth it. interrupt_exit_user_prepare() and interrupt_exit_kernel_prepare() are called from three places: - From entry_32.S - From interrupt_64.S - From interrupt_exit_user_restart() and interrupt_exit_kernel_restart() In entry_32.S, there are inambiguously called based on MSR_PR: interrupt_return: lwz r4,_MSR(r1) addi r3,r1,STACK_FRAME_OVERHEAD andi. r0,r4,MSR_PR beq .Lkernel_interrupt_return bl interrupt_exit_user_prepare ... .Lkernel_interrupt_return: bl interrupt_exit_kernel_prepare In interrupt_64.S, that's similar: interrupt_return_\srr\(): ld r4,_MSR(r1) andi. r0,r4,MSR_PR beq interrupt_return_\srr\()_kernel interrupt_return_\srr\()_user: /* make backtraces match the _kernel variant */ addi r3,r1,STACK_FRAME_OVERHEAD bl interrupt_exit_user_prepare ... interrupt_return_\srr\()_kernel: addi r3,r1,STACK_FRAME_OVERHEAD bl interrupt_exit_kernel_prepare In interrupt_exit_user_restart() and interrupt_exit_kernel_restart(), MSR_PR is verified respectively by BUG_ON(!user_mode(regs)) and BUG_ON(user_mode(regs)) prior to calling interrupt_exit_user_prepare() and interrupt_exit_kernel_prepare(). The verification in interrupt_exit_user_prepare() and interrupt_exit_kernel_prepare() are therefore useless and can be removed. Signed-off-by: Christophe Leroy Acked-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/385ead49ccb66a259b25fee3eebf0bd4094068f3.1629707037.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/interrupt.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index abc2b3dc3f20..531f0a1dbabc 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -464,7 +464,6 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs) if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x)) BUG_ON(!(regs->msr & MSR_RI)); - BUG_ON(!(regs->msr & MSR_PR)); BUG_ON(arch_irq_disabled_regs(regs)); CT_WARN_ON(ct_state() == CONTEXT_USER); @@ -498,7 +497,6 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) && unlikely(!(regs->msr & MSR_RI))) unrecoverable_exception(regs); - BUG_ON(regs->msr & MSR_PR); /* * CT_WARN_ON comes here via program_check_exception, * so avoid recursion. -- cgit v1.2.3-59-g8ed1b From 806c0e6e7e97adc17389c8dc1f52d4736f49299b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 23 Aug 2021 08:24:21 +0000 Subject: powerpc: Refactor verification of MSR_RI 40x and BOOKE don't have MSR_RI therefore all tests involving MSR_RI may be problematic on those plateforms. Create helpers to check or set MSR_RI in regs, and use them in common code. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c2fb93708196734f4176dda334aaa3055f213b89.1629707037.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/ptrace.h | 23 +++++++++++++++++++++++ arch/powerpc/kernel/interrupt.c | 9 +++------ arch/powerpc/kernel/traps.c | 8 ++++---- arch/powerpc/mm/book3s64/slb.c | 2 +- arch/powerpc/platforms/embedded6xx/holly.c | 2 +- arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c | 2 +- arch/powerpc/platforms/pasemi/idle.c | 2 +- arch/powerpc/platforms/powernv/opal.c | 2 +- arch/powerpc/platforms/pseries/ras.c | 2 +- arch/powerpc/sysdev/fsl_rio.c | 2 +- arch/powerpc/xmon/xmon.c | 16 +++------------- 11 files changed, 40 insertions(+), 30 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 5114ddba6f16..f9c235c1d6ce 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -22,6 +22,7 @@ #include #include #include +#include #ifndef __ASSEMBLY__ struct pt_regs @@ -272,6 +273,28 @@ static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) regs->gpr[3] = rc; } +static inline bool cpu_has_msr_ri(void) +{ + return !IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x); +} + +static inline bool regs_is_unrecoverable(struct pt_regs *regs) +{ + return unlikely(cpu_has_msr_ri() && !(regs->msr & MSR_RI)); +} + +static inline void regs_set_recoverable(struct pt_regs *regs) +{ + if (cpu_has_msr_ri()) + regs_set_return_msr(regs, regs->msr | MSR_RI); +} + +static inline void regs_set_unrecoverable(struct pt_regs *regs) +{ + if (cpu_has_msr_ri()) + regs_set_return_msr(regs, regs->msr & ~MSR_RI); +} + #define arch_has_single_step() (1) #define arch_has_block_step() (true) #define ARCH_HAS_USER_SINGLE_STEP_REPORT diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index 531f0a1dbabc..a73f3f70a657 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -92,8 +92,7 @@ notrace long system_call_exception(long r3, long r4, long r5, CT_WARN_ON(ct_state() == CONTEXT_KERNEL); user_exit_irqoff(); - if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x)) - BUG_ON(!(regs->msr & MSR_RI)); + BUG_ON(regs_is_unrecoverable(regs)); BUG_ON(!(regs->msr & MSR_PR)); BUG_ON(arch_irq_disabled_regs(regs)); @@ -462,8 +461,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs) { unsigned long ret; - if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x)) - BUG_ON(!(regs->msr & MSR_RI)); + BUG_ON(regs_is_unrecoverable(regs)); BUG_ON(arch_irq_disabled_regs(regs)); CT_WARN_ON(ct_state() == CONTEXT_USER); @@ -494,8 +492,7 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) bool stack_store = current_thread_info()->flags & _TIF_EMULATE_STACK_STORE; - if (!IS_ENABLED(CONFIG_BOOKE) && !IS_ENABLED(CONFIG_40x) && - unlikely(!(regs->msr & MSR_RI))) + if (regs_is_unrecoverable(regs)) unrecoverable_exception(regs); /* * CT_WARN_ON comes here via program_check_exception, diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 5463d201d465..3a344f5265c2 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -428,7 +428,7 @@ void hv_nmi_check_nonrecoverable(struct pt_regs *regs) return; nonrecoverable: - regs_set_return_msr(regs, regs->msr & ~MSR_RI); + regs_set_unrecoverable(regs); #endif } DEFINE_INTERRUPT_HANDLER_NMI(system_reset_exception) @@ -498,7 +498,7 @@ out: die("Unrecoverable nested System Reset", regs, SIGABRT); #endif /* Must die if the interrupt is not recoverable */ - if (!(regs->msr & MSR_RI)) { + if (regs_is_unrecoverable(regs)) { /* For the reason explained in die_mce, nmi_exit before die */ nmi_exit(); die("Unrecoverable System Reset", regs, SIGABRT); @@ -550,7 +550,7 @@ static inline int check_io_access(struct pt_regs *regs) printk(KERN_DEBUG "%s bad port %lx at %p\n", (*nip & 0x100)? "OUT to": "IN from", regs->gpr[rb] - _IO_BASE, nip); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } @@ -840,7 +840,7 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception) bail: /* Must die if the interrupt is not recoverable */ - if (!(regs->msr & MSR_RI)) + if (regs_is_unrecoverable(regs)) die_mce("Unrecoverable Machine check", regs, SIGBUS); #ifdef CONFIG_PPC_BOOK3S_64 diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index c91bd85eb90e..f0037bcc47a0 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -822,7 +822,7 @@ DEFINE_INTERRUPT_HANDLER_RAW(do_slb_fault) /* IRQs are not reconciled here, so can't check irqs_disabled */ VM_WARN_ON(mfmsr() & MSR_EE); - if (unlikely(!(regs->msr & MSR_RI))) + if (regs_is_unrecoverable(regs)) return -EINVAL; /* diff --git a/arch/powerpc/platforms/embedded6xx/holly.c b/arch/powerpc/platforms/embedded6xx/holly.c index 85521b3e7098..7a85b117f7a4 100644 --- a/arch/powerpc/platforms/embedded6xx/holly.c +++ b/arch/powerpc/platforms/embedded6xx/holly.c @@ -251,7 +251,7 @@ static int ppc750_machine_check_exception(struct pt_regs *regs) /* Are we prepared to handle this fault */ if ((entry = search_exception_tables(regs->nip)) != NULL) { tsi108_clear_pci_cfg_error(); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } diff --git a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c index d8da6a483e59..9eb9abb5bce2 100644 --- a/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c +++ b/arch/powerpc/platforms/embedded6xx/mpc7448_hpc2.c @@ -173,7 +173,7 @@ static int mpc7448_machine_check_exception(struct pt_regs *regs) /* Are we prepared to handle this fault */ if ((entry = search_exception_tables(regs->nip)) != NULL) { tsi108_clear_pci_cfg_error(); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } diff --git a/arch/powerpc/platforms/pasemi/idle.c b/arch/powerpc/platforms/pasemi/idle.c index 9b88e3cded7d..2a5b84268b77 100644 --- a/arch/powerpc/platforms/pasemi/idle.c +++ b/arch/powerpc/platforms/pasemi/idle.c @@ -58,7 +58,7 @@ static int pasemi_system_reset_exception(struct pt_regs *regs) restore_astate(hard_smp_processor_id()); /* everything handled */ - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); return 1; } diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 2835376e61a4..e9d18519e650 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -588,7 +588,7 @@ static int opal_recover_mce(struct pt_regs *regs, { int recovered = 0; - if (!(regs->msr & MSR_RI)) { + if (regs_is_unrecoverable(regs)) { /* If MSR_RI isn't set, we cannot recover */ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c index 167f2e1b8d39..56092dccfdb8 100644 --- a/arch/powerpc/platforms/pseries/ras.c +++ b/arch/powerpc/platforms/pseries/ras.c @@ -783,7 +783,7 @@ static int recover_mce(struct pt_regs *regs, struct machine_check_event *evt) { int recovered = 0; - if (!(regs->msr & MSR_RI)) { + if (regs_is_unrecoverable(regs)) { /* If MSR_RI isn't set, we cannot recover */ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; diff --git a/arch/powerpc/sysdev/fsl_rio.c b/arch/powerpc/sysdev/fsl_rio.c index 5a95b8ea23d8..ff7906b48ca1 100644 --- a/arch/powerpc/sysdev/fsl_rio.c +++ b/arch/powerpc/sysdev/fsl_rio.c @@ -108,7 +108,7 @@ int fsl_rio_mcheck_exception(struct pt_regs *regs) __func__); out_be32((u32 *)(rio_regs_win + RIO_LTLEDCSR), 0); - regs_set_return_msr(regs, regs->msr | MSR_RI); + regs_set_recoverable(regs); regs_set_return_ip(regs, extable_fixup(entry)); return 1; } diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index ead460b80905..dd8241c009e5 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -482,16 +482,6 @@ static inline void get_output_lock(void) {} static inline void release_output_lock(void) {} #endif -static inline int unrecoverable_excp(struct pt_regs *regs) -{ -#if defined(CONFIG_4xx) || defined(CONFIG_PPC_BOOK3E) - /* We have no MSR_RI bit on 4xx or Book3e, so we simply return false */ - return 0; -#else - return ((regs->msr & MSR_RI) == 0); -#endif -} - static void xmon_touch_watchdogs(void) { touch_softlockup_watchdog_sync(); @@ -565,7 +555,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi) bp = NULL; if ((regs->msr & (MSR_IR|MSR_PR|MSR_64BIT)) == (MSR_IR|MSR_64BIT)) bp = at_breakpoint(regs->nip); - if (bp || unrecoverable_excp(regs)) + if (bp || regs_is_unrecoverable(regs)) fromipi = 0; if (!fromipi) { @@ -577,7 +567,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi) cpu, BP_NUM(bp)); xmon_print_symbol(regs->nip, " ", ")\n"); } - if (unrecoverable_excp(regs)) + if (regs_is_unrecoverable(regs)) printf("WARNING: exception is not recoverable, " "can't continue\n"); release_output_lock(); @@ -693,7 +683,7 @@ static int xmon_core(struct pt_regs *regs, volatile int fromipi) printf("Stopped at breakpoint %tx (", BP_NUM(bp)); xmon_print_symbol(regs->nip, " ", ")\n"); } - if (unrecoverable_excp(regs)) + if (regs_is_unrecoverable(regs)) printf("WARNING: exception is not recoverable, " "can't continue\n"); remove_bpts(); -- cgit v1.2.3-59-g8ed1b From c12adb0678446b3284a6135dc5fa7aa3b6a968a5 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 7 Jan 2021 08:40:38 -0500 Subject: powerpc: retire sbc8548 board support The support was for this was mainlined 13 years ago, in v2.6.25 [0e0fffe88767] just around the ppc --> powerpc migration. I believe the board was introduced a year or two before that, so it is roughly a 15 year old platform - with the CPU speed and memory size that was typical for that era. I haven't had one of these boards for several years, and availability was discontinued several years before that. Given that, there is no point in adding a burden to testing coverage that builds all possible defconfigs, so it makes sense to remove it. Of course it will remain in the git history forever, for anyone who happens to find a functional board and wants to tinker with it. Acked-by: Scott Wood Signed-off-by: Paul Gortmaker Signed-off-by: Michael Ellerman --- arch/powerpc/boot/Makefile | 1 - arch/powerpc/boot/dts/sbc8548-altflash.dts | 111 ----------- arch/powerpc/boot/dts/sbc8548-post.dtsi | 289 ---------------------------- arch/powerpc/boot/dts/sbc8548-pre.dtsi | 48 ----- arch/powerpc/boot/dts/sbc8548.dts | 106 ---------- arch/powerpc/boot/wrapper | 2 +- arch/powerpc/configs/85xx/sbc8548_defconfig | 50 ----- arch/powerpc/configs/mpc85xx_base.config | 1 - arch/powerpc/platforms/85xx/Kconfig | 6 - arch/powerpc/platforms/85xx/Makefile | 1 - arch/powerpc/platforms/85xx/sbc8548.c | 134 ------------- 11 files changed, 1 insertion(+), 748 deletions(-) delete mode 100644 arch/powerpc/boot/dts/sbc8548-altflash.dts delete mode 100644 arch/powerpc/boot/dts/sbc8548-post.dtsi delete mode 100644 arch/powerpc/boot/dts/sbc8548-pre.dtsi delete mode 100644 arch/powerpc/boot/dts/sbc8548.dts delete mode 100644 arch/powerpc/configs/85xx/sbc8548_defconfig delete mode 100644 arch/powerpc/platforms/85xx/sbc8548.c (limited to 'arch/powerpc') diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index e312ea802aa6..3e30ea6f2187 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -341,7 +341,6 @@ image-$(CONFIG_TQM8541) += cuImage.tqm8541 image-$(CONFIG_TQM8548) += cuImage.tqm8548 image-$(CONFIG_TQM8555) += cuImage.tqm8555 image-$(CONFIG_TQM8560) += cuImage.tqm8560 -image-$(CONFIG_SBC8548) += cuImage.sbc8548 image-$(CONFIG_KSI8560) += cuImage.ksi8560 # Board ports in arch/powerpc/platform/86xx/Kconfig diff --git a/arch/powerpc/boot/dts/sbc8548-altflash.dts b/arch/powerpc/boot/dts/sbc8548-altflash.dts deleted file mode 100644 index bb7a1e712bb7..000000000000 --- a/arch/powerpc/boot/dts/sbc8548-altflash.dts +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8548 Device Tree Source - * - * Configured for booting off the alternate (64MB SODIMM) flash. - * Requires switching JP12 jumpers and changing SW2.8 setting. - * - * Copyright 2013 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - */ - - -/dts-v1/; - -/include/ "sbc8548-pre.dtsi" - -/{ - localbus@e0000000 { - #address-cells = <2>; - #size-cells = <1>; - compatible = "simple-bus"; - reg = <0xe0000000 0x5000>; - interrupt-parent = <&mpic>; - - ranges = <0x0 0x0 0xfc000000 0x04000000 /*64MB Flash*/ - 0x3 0x0 0xf0000000 0x04000000 /*64MB SDRAM*/ - 0x4 0x0 0xf4000000 0x04000000 /*64MB SDRAM*/ - 0x5 0x0 0xf8000000 0x00b10000 /* EPLD */ - 0x6 0x0 0xef800000 0x00800000>; /*8MB Flash*/ - - flash@0,0 { - #address-cells = <1>; - #size-cells = <1>; - reg = <0x0 0x0 0x04000000>; - compatible = "intel,JS28F128", "cfi-flash"; - bank-width = <4>; - device-width = <1>; - partition@0 { - label = "space"; - /* FC000000 -> FFEFFFFF */ - reg = <0x00000000 0x03f00000>; - }; - partition@3f00000 { - label = "bootloader"; - /* FFF00000 -> FFFFFFFF */ - reg = <0x03f00000 0x00100000>; - read-only; - }; - }; - - - epld@5,0 { - compatible = "wrs,epld-localbus"; - #address-cells = <2>; - #size-cells = <1>; - reg = <0x5 0x0 0x00b10000>; - ranges = < - 0x0 0x0 0x5 0x000000 0x1fff /* LED */ - 0x1 0x0 0x5 0x100000 0x1fff /* Switches */ - 0x3 0x0 0x5 0x300000 0x1fff /* HW Rev. */ - 0xb 0x0 0x5 0xb00000 0x1fff /* EEPROM */ - >; - - led@0,0 { - compatible = "led"; - reg = <0x0 0x0 0x1fff>; - }; - - switches@1,0 { - compatible = "switches"; - reg = <0x1 0x0 0x1fff>; - }; - - hw-rev@3,0 { - compatible = "hw-rev"; - reg = <0x3 0x0 0x1fff>; - }; - - eeprom@b,0 { - compatible = "eeprom"; - reg = <0xb 0 0x1fff>; - }; - - }; - - alt-flash@6,0 { - #address-cells = <1>; - #size-cells = <1>; - compatible = "intel,JS28F640", "cfi-flash"; - reg = <0x6 0x0 0x800000>; - bank-width = <1>; - device-width = <1>; - partition@0 { - label = "space"; - /* EF800000 -> EFF9FFFF */ - reg = <0x00000000 0x007a0000>; - }; - partition@7a0000 { - label = "bootloader"; - /* EFFA0000 -> EFFFFFFF */ - reg = <0x007a0000 0x00060000>; - read-only; - }; - }; - - - }; -}; - -/include/ "sbc8548-post.dtsi" diff --git a/arch/powerpc/boot/dts/sbc8548-post.dtsi b/arch/powerpc/boot/dts/sbc8548-post.dtsi deleted file mode 100644 index 9d848d409408..000000000000 --- a/arch/powerpc/boot/dts/sbc8548-post.dtsi +++ /dev/null @@ -1,289 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8548 Device Tree Source - * - * Copyright 2007 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - */ - -/{ - soc8548@e0000000 { - #address-cells = <1>; - #size-cells = <1>; - device_type = "soc"; - ranges = <0x00000000 0xe0000000 0x00100000>; - bus-frequency = <0>; - compatible = "simple-bus"; - - ecm-law@0 { - compatible = "fsl,ecm-law"; - reg = <0x0 0x1000>; - fsl,num-laws = <10>; - }; - - ecm@1000 { - compatible = "fsl,mpc8548-ecm", "fsl,ecm"; - reg = <0x1000 0x1000>; - interrupts = <17 2>; - interrupt-parent = <&mpic>; - }; - - memory-controller@2000 { - compatible = "fsl,mpc8548-memory-controller"; - reg = <0x2000 0x1000>; - interrupt-parent = <&mpic>; - interrupts = <0x12 0x2>; - }; - - L2: l2-cache-controller@20000 { - compatible = "fsl,mpc8548-l2-cache-controller"; - reg = <0x20000 0x1000>; - cache-line-size = <0x20>; // 32 bytes - cache-size = <0x80000>; // L2, 512K - interrupt-parent = <&mpic>; - interrupts = <0x10 0x2>; - }; - - i2c@3000 { - #address-cells = <1>; - #size-cells = <0>; - cell-index = <0>; - compatible = "fsl-i2c"; - reg = <0x3000 0x100>; - interrupts = <0x2b 0x2>; - interrupt-parent = <&mpic>; - dfsrr; - }; - - i2c@3100 { - #address-cells = <1>; - #size-cells = <0>; - cell-index = <1>; - compatible = "fsl-i2c"; - reg = <0x3100 0x100>; - interrupts = <0x2b 0x2>; - interrupt-parent = <&mpic>; - dfsrr; - }; - - dma@21300 { - #address-cells = <1>; - #size-cells = <1>; - compatible = "fsl,mpc8548-dma", "fsl,eloplus-dma"; - reg = <0x21300 0x4>; - ranges = <0x0 0x21100 0x200>; - cell-index = <0>; - dma-channel@0 { - compatible = "fsl,mpc8548-dma-channel", - "fsl,eloplus-dma-channel"; - reg = <0x0 0x80>; - cell-index = <0>; - interrupt-parent = <&mpic>; - interrupts = <20 2>; - }; - dma-channel@80 { - compatible = "fsl,mpc8548-dma-channel", - "fsl,eloplus-dma-channel"; - reg = <0x80 0x80>; - cell-index = <1>; - interrupt-parent = <&mpic>; - interrupts = <21 2>; - }; - dma-channel@100 { - compatible = "fsl,mpc8548-dma-channel", - "fsl,eloplus-dma-channel"; - reg = <0x100 0x80>; - cell-index = <2>; - interrupt-parent = <&mpic>; - interrupts = <22 2>; - }; - dma-channel@180 { - compatible = "fsl,mpc8548-dma-channel", - "fsl,eloplus-dma-channel"; - reg = <0x180 0x80>; - cell-index = <3>; - interrupt-parent = <&mpic>; - interrupts = <23 2>; - }; - }; - - enet0: ethernet@24000 { - #address-cells = <1>; - #size-cells = <1>; - cell-index = <0>; - device_type = "network"; - model = "eTSEC"; - compatible = "gianfar"; - reg = <0x24000 0x1000>; - ranges = <0x0 0x24000 0x1000>; - local-mac-address = [ 00 00 00 00 00 00 ]; - interrupts = <0x1d 0x2 0x1e 0x2 0x22 0x2>; - interrupt-parent = <&mpic>; - tbi-handle = <&tbi0>; - phy-handle = <&phy0>; - - mdio@520 { - #address-cells = <1>; - #size-cells = <0>; - compatible = "fsl,gianfar-mdio"; - reg = <0x520 0x20>; - - phy0: ethernet-phy@19 { - interrupt-parent = <&mpic>; - interrupts = <0x6 0x1>; - reg = <0x19>; - }; - phy1: ethernet-phy@1a { - interrupt-parent = <&mpic>; - interrupts = <0x7 0x1>; - reg = <0x1a>; - }; - tbi0: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - }; - - enet1: ethernet@25000 { - #address-cells = <1>; - #size-cells = <1>; - cell-index = <1>; - device_type = "network"; - model = "eTSEC"; - compatible = "gianfar"; - reg = <0x25000 0x1000>; - ranges = <0x0 0x25000 0x1000>; - local-mac-address = [ 00 00 00 00 00 00 ]; - interrupts = <0x23 0x2 0x24 0x2 0x28 0x2>; - interrupt-parent = <&mpic>; - tbi-handle = <&tbi1>; - phy-handle = <&phy1>; - - mdio@520 { - #address-cells = <1>; - #size-cells = <0>; - compatible = "fsl,gianfar-tbi"; - reg = <0x520 0x20>; - - tbi1: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - }; - - serial0: serial@4500 { - cell-index = <0>; - device_type = "serial"; - compatible = "fsl,ns16550", "ns16550"; - reg = <0x4500 0x100>; // reg base, size - clock-frequency = <0>; // should we fill in in uboot? - interrupts = <0x2a 0x2>; - interrupt-parent = <&mpic>; - }; - - serial1: serial@4600 { - cell-index = <1>; - device_type = "serial"; - compatible = "fsl,ns16550", "ns16550"; - reg = <0x4600 0x100>; // reg base, size - clock-frequency = <0>; // should we fill in in uboot? - interrupts = <0x2a 0x2>; - interrupt-parent = <&mpic>; - }; - - global-utilities@e0000 { //global utilities reg - compatible = "fsl,mpc8548-guts"; - reg = <0xe0000 0x1000>; - fsl,has-rstcr; - }; - - crypto@30000 { - compatible = "fsl,sec2.1", "fsl,sec2.0"; - reg = <0x30000 0x10000>; - interrupts = <45 2>; - interrupt-parent = <&mpic>; - fsl,num-channels = <4>; - fsl,channel-fifo-len = <24>; - fsl,exec-units-mask = <0xfe>; - fsl,descriptor-types-mask = <0x12b0ebf>; - }; - - mpic: pic@40000 { - interrupt-controller; - #address-cells = <0>; - #interrupt-cells = <2>; - reg = <0x40000 0x40000>; - compatible = "chrp,open-pic"; - device_type = "open-pic"; - }; - }; - - pci0: pci@e0008000 { - interrupt-map-mask = <0xf800 0x0 0x0 0x7>; - interrupt-map = < - /* IDSEL 0x01 (PCI-X slot) @66MHz */ - 0x0800 0x0 0x0 0x1 &mpic 0x2 0x1 - 0x0800 0x0 0x0 0x2 &mpic 0x3 0x1 - 0x0800 0x0 0x0 0x3 &mpic 0x4 0x1 - 0x0800 0x0 0x0 0x4 &mpic 0x1 0x1 - - /* IDSEL 0x11 (PCI, 3.3V 32bit) @33MHz */ - 0x8800 0x0 0x0 0x1 &mpic 0x2 0x1 - 0x8800 0x0 0x0 0x2 &mpic 0x3 0x1 - 0x8800 0x0 0x0 0x3 &mpic 0x4 0x1 - 0x8800 0x0 0x0 0x4 &mpic 0x1 0x1>; - - interrupt-parent = <&mpic>; - interrupts = <0x18 0x2>; - bus-range = <0 0>; - ranges = <0x02000000 0x0 0x80000000 0x80000000 0x0 0x10000000 - 0x01000000 0x0 0x00000000 0xe2000000 0x0 0x00800000>; - clock-frequency = <66000000>; - #interrupt-cells = <1>; - #size-cells = <2>; - #address-cells = <3>; - reg = <0xe0008000 0x1000>; - compatible = "fsl,mpc8540-pcix", "fsl,mpc8540-pci"; - device_type = "pci"; - }; - - pci1: pcie@e000a000 { - interrupt-map-mask = <0xf800 0x0 0x0 0x7>; - interrupt-map = < - - /* IDSEL 0x0 (PEX) */ - 0x0000 0x0 0x0 0x1 &mpic 0x0 0x1 - 0x0000 0x0 0x0 0x2 &mpic 0x1 0x1 - 0x0000 0x0 0x0 0x3 &mpic 0x2 0x1 - 0x0000 0x0 0x0 0x4 &mpic 0x3 0x1>; - - interrupt-parent = <&mpic>; - interrupts = <0x1a 0x2>; - bus-range = <0x0 0xff>; - ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x10000000 - 0x01000000 0x0 0x00000000 0xe2800000 0x0 0x08000000>; - clock-frequency = <33000000>; - #interrupt-cells = <1>; - #size-cells = <2>; - #address-cells = <3>; - reg = <0xe000a000 0x1000>; - compatible = "fsl,mpc8548-pcie"; - device_type = "pci"; - pcie@0 { - reg = <0x0 0x0 0x0 0x0 0x0>; - #size-cells = <2>; - #address-cells = <3>; - device_type = "pci"; - ranges = <0x02000000 0x0 0xa0000000 - 0x02000000 0x0 0xa0000000 - 0x0 0x10000000 - - 0x01000000 0x0 0x00000000 - 0x01000000 0x0 0x00000000 - 0x0 0x00800000>; - }; - }; -}; diff --git a/arch/powerpc/boot/dts/sbc8548-pre.dtsi b/arch/powerpc/boot/dts/sbc8548-pre.dtsi deleted file mode 100644 index 0e3665fd15d0..000000000000 --- a/arch/powerpc/boot/dts/sbc8548-pre.dtsi +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8548 Device Tree Source - * - * Copyright 2007 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - */ - -/{ - model = "SBC8548"; - compatible = "SBC8548"; - #address-cells = <1>; - #size-cells = <1>; - - aliases { - ethernet0 = &enet0; - ethernet1 = &enet1; - serial0 = &serial0; - serial1 = &serial1; - pci0 = &pci0; - pci1 = &pci1; - }; - - cpus { - #address-cells = <1>; - #size-cells = <0>; - - PowerPC,8548@0 { - device_type = "cpu"; - reg = <0>; - d-cache-line-size = <0x20>; // 32 bytes - i-cache-line-size = <0x20>; // 32 bytes - d-cache-size = <0x8000>; // L1, 32K - i-cache-size = <0x8000>; // L1, 32K - timebase-frequency = <0>; // From uboot - bus-frequency = <0>; - clock-frequency = <0>; - next-level-cache = <&L2>; - }; - }; - - memory { - device_type = "memory"; - reg = <0x00000000 0x10000000>; - }; - -}; diff --git a/arch/powerpc/boot/dts/sbc8548.dts b/arch/powerpc/boot/dts/sbc8548.dts deleted file mode 100644 index ce0a119f496e..000000000000 --- a/arch/powerpc/boot/dts/sbc8548.dts +++ /dev/null @@ -1,106 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8548 Device Tree Source - * - * Copyright 2007 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - */ - - -/dts-v1/; - -/include/ "sbc8548-pre.dtsi" - -/{ - localbus@e0000000 { - #address-cells = <2>; - #size-cells = <1>; - compatible = "simple-bus"; - reg = <0xe0000000 0x5000>; - interrupt-parent = <&mpic>; - - ranges = <0x0 0x0 0xff800000 0x00800000 /*8MB Flash*/ - 0x3 0x0 0xf0000000 0x04000000 /*64MB SDRAM*/ - 0x4 0x0 0xf4000000 0x04000000 /*64MB SDRAM*/ - 0x5 0x0 0xf8000000 0x00b10000 /* EPLD */ - 0x6 0x0 0xec000000 0x04000000>; /*64MB Flash*/ - - - flash@0,0 { - #address-cells = <1>; - #size-cells = <1>; - compatible = "intel,JS28F640", "cfi-flash"; - reg = <0x0 0x0 0x800000>; - bank-width = <1>; - device-width = <1>; - partition@0 { - label = "space"; - /* FF800000 -> FFF9FFFF */ - reg = <0x00000000 0x007a0000>; - }; - partition@7a0000 { - label = "bootloader"; - /* FFFA0000 -> FFFFFFFF */ - reg = <0x007a0000 0x00060000>; - read-only; - }; - }; - - epld@5,0 { - compatible = "wrs,epld-localbus"; - #address-cells = <2>; - #size-cells = <1>; - reg = <0x5 0x0 0x00b10000>; - ranges = < - 0x0 0x0 0x5 0x000000 0x1fff /* LED */ - 0x1 0x0 0x5 0x100000 0x1fff /* Switches */ - 0x3 0x0 0x5 0x300000 0x1fff /* HW Rev. */ - 0xb 0x0 0x5 0xb00000 0x1fff /* EEPROM */ - >; - - led@0,0 { - compatible = "led"; - reg = <0x0 0x0 0x1fff>; - }; - - switches@1,0 { - compatible = "switches"; - reg = <0x1 0x0 0x1fff>; - }; - - hw-rev@3,0 { - compatible = "hw-rev"; - reg = <0x3 0x0 0x1fff>; - }; - - eeprom@b,0 { - compatible = "eeprom"; - reg = <0xb 0 0x1fff>; - }; - - }; - - alt-flash@6,0 { - #address-cells = <1>; - #size-cells = <1>; - reg = <0x6 0x0 0x04000000>; - compatible = "intel,JS28F128", "cfi-flash"; - bank-width = <4>; - device-width = <1>; - partition@0 { - label = "space"; - /* EC000000 -> EFEFFFFF */ - reg = <0x00000000 0x03f00000>; - }; - partition@3f00000 { - label = "bootloader"; - /* EFF00000 -> EFFFFFFF */ - reg = <0x03f00000 0x00100000>; - read-only; - }; - }; - }; -}; - -/include/ "sbc8548-post.dtsi" diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index 1f4676bab786..1cd82564c996 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -298,7 +298,7 @@ cuboot*) *-tqm8541|*-mpc8560*|*-tqm8560|*-tqm8555|*-ksi8560*) platformo=$object/cuboot-85xx-cpm2.o ;; - *-mpc85*|*-tqm85*|*-sbc85*) + *-mpc85*|*-tqm85*) platformo=$object/cuboot-85xx.o ;; *-amigaone) diff --git a/arch/powerpc/configs/85xx/sbc8548_defconfig b/arch/powerpc/configs/85xx/sbc8548_defconfig deleted file mode 100644 index 258881727119..000000000000 --- a/arch/powerpc/configs/85xx/sbc8548_defconfig +++ /dev/null @@ -1,50 +0,0 @@ -CONFIG_PPC_85xx=y -CONFIG_SYSVIPC=y -CONFIG_LOG_BUF_SHIFT=14 -CONFIG_BLK_DEV_INITRD=y -CONFIG_EXPERT=y -CONFIG_SLAB=y -# CONFIG_BLK_DEV_BSG is not set -CONFIG_SBC8548=y -CONFIG_GEN_RTC=y -CONFIG_BINFMT_MISC=y -CONFIG_MATH_EMULATION=y -# CONFIG_SECCOMP is not set -CONFIG_PCI=y -CONFIG_NET=y -CONFIG_PACKET=y -CONFIG_UNIX=y -CONFIG_XFRM_USER=y -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_PNP=y -CONFIG_IP_PNP_DHCP=y -CONFIG_IP_PNP_BOOTP=y -CONFIG_SYN_COOKIES=y -# CONFIG_IPV6 is not set -# CONFIG_FW_LOADER is not set -CONFIG_MTD=y -CONFIG_MTD_BLOCK=y -CONFIG_MTD_CFI=y -CONFIG_MTD_CFI_ADV_OPTIONS=y -CONFIG_MTD_CFI_GEOMETRY=y -CONFIG_MTD_CFI_I4=y -CONFIG_MTD_CFI_INTELEXT=y -CONFIG_MTD_PHYSMAP_OF=y -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_NETDEVICES=y -CONFIG_GIANFAR=y -CONFIG_BROADCOM_PHY=y -# CONFIG_INPUT_KEYBOARD is not set -# CONFIG_INPUT_MOUSE is not set -# CONFIG_SERIO is not set -# CONFIG_VT is not set -CONFIG_SERIAL_8250=y -CONFIG_SERIAL_8250_CONSOLE=y -# CONFIG_HW_RANDOM is not set -# CONFIG_USB_SUPPORT is not set -CONFIG_PROC_KCORE=y -CONFIG_TMPFS=y -CONFIG_NFS_FS=y -CONFIG_ROOT_NFS=y diff --git a/arch/powerpc/configs/mpc85xx_base.config b/arch/powerpc/configs/mpc85xx_base.config index b1593fe6f70b..85907b776908 100644 --- a/arch/powerpc/configs/mpc85xx_base.config +++ b/arch/powerpc/configs/mpc85xx_base.config @@ -13,7 +13,6 @@ CONFIG_P1022_DS=y CONFIG_P1022_RDK=y CONFIG_P1023_RDB=y CONFIG_TWR_P102x=y -CONFIG_SBC8548=y CONFIG_SOCRATES=y CONFIG_STX_GP3=y CONFIG_TQM8540=y diff --git a/arch/powerpc/platforms/85xx/Kconfig b/arch/powerpc/platforms/85xx/Kconfig index b77cbb0a50e1..4142ebf01382 100644 --- a/arch/powerpc/platforms/85xx/Kconfig +++ b/arch/powerpc/platforms/85xx/Kconfig @@ -208,12 +208,6 @@ config TQM8560 select TQM85xx select CPM2 -config SBC8548 - bool "Wind River SBC8548" - select DEFAULT_UIMAGE - help - This option enables support for the Wind River SBC8548 board - config PPA8548 bool "Prodrive PPA8548" help diff --git a/arch/powerpc/platforms/85xx/Makefile b/arch/powerpc/platforms/85xx/Makefile index d1dd0dca5ebf..60e4e97a929d 100644 --- a/arch/powerpc/platforms/85xx/Makefile +++ b/arch/powerpc/platforms/85xx/Makefile @@ -26,7 +26,6 @@ obj-$(CONFIG_CORENET_GENERIC) += corenet_generic.o obj-$(CONFIG_FB_FSL_DIU) += t1042rdb_diu.o obj-$(CONFIG_STX_GP3) += stx_gp3.o obj-$(CONFIG_TQM85xx) += tqm85xx.o -obj-$(CONFIG_SBC8548) += sbc8548.o obj-$(CONFIG_PPA8548) += ppa8548.o obj-$(CONFIG_SOCRATES) += socrates.o socrates_fpga_pic.o obj-$(CONFIG_KSI8560) += ksi8560.o diff --git a/arch/powerpc/platforms/85xx/sbc8548.c b/arch/powerpc/platforms/85xx/sbc8548.c deleted file mode 100644 index e4acf5ce6b07..000000000000 --- a/arch/powerpc/platforms/85xx/sbc8548.c +++ /dev/null @@ -1,134 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Wind River SBC8548 setup and early boot code. - * - * Copyright 2007 Wind River Systems Inc. - * - * By Paul Gortmaker (see MAINTAINERS for contact information) - * - * Based largely on the MPC8548CDS support - Copyright 2005 Freescale Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "mpc85xx.h" - -static int sbc_rev; - -static void __init sbc8548_pic_init(void) -{ - struct mpic *mpic = mpic_alloc(NULL, 0, MPIC_BIG_ENDIAN, - 0, 256, " OpenPIC "); - BUG_ON(mpic == NULL); - mpic_init(mpic); -} - -/* Extract the HW Rev from the EPLD on the board */ -static int __init sbc8548_hw_rev(void) -{ - struct device_node *np; - struct resource res; - unsigned int *rev; - int board_rev = 0; - - np = of_find_compatible_node(NULL, NULL, "hw-rev"); - if (np == NULL) { - printk("No HW-REV found in DTB.\n"); - return -ENODEV; - } - - of_address_to_resource(np, 0, &res); - of_node_put(np); - - rev = ioremap(res.start,sizeof(unsigned int)); - board_rev = (*rev) >> 28; - iounmap(rev); - - return board_rev; -} - -/* - * Setup the architecture - */ -static void __init sbc8548_setup_arch(void) -{ - if (ppc_md.progress) - ppc_md.progress("sbc8548_setup_arch()", 0); - - fsl_pci_assign_primary(); - - sbc_rev = sbc8548_hw_rev(); -} - -static void sbc8548_show_cpuinfo(struct seq_file *m) -{ - uint pvid, svid, phid1; - - pvid = mfspr(SPRN_PVR); - svid = mfspr(SPRN_SVR); - - seq_printf(m, "Vendor\t\t: Wind River\n"); - seq_printf(m, "Machine\t\t: SBC8548 v%d\n", sbc_rev); - seq_printf(m, "PVR\t\t: 0x%x\n", pvid); - seq_printf(m, "SVR\t\t: 0x%x\n", svid); - - /* Display cpu Pll setting */ - phid1 = mfspr(SPRN_HID1); - seq_printf(m, "PLL setting\t: 0x%x\n", ((phid1 >> 24) & 0x3f)); -} - -machine_arch_initcall(sbc8548, mpc85xx_common_publish_devices); - -/* - * Called very early, device-tree isn't unflattened - */ -static int __init sbc8548_probe(void) -{ - return of_machine_is_compatible("SBC8548"); -} - -define_machine(sbc8548) { - .name = "SBC8548", - .probe = sbc8548_probe, - .setup_arch = sbc8548_setup_arch, - .init_IRQ = sbc8548_pic_init, - .show_cpuinfo = sbc8548_show_cpuinfo, - .get_irq = mpic_get_irq, -#ifdef CONFIG_PCI - .pcibios_fixup_bus = fsl_pcibios_fixup_bus, - .pcibios_fixup_phb = fsl_pcibios_fixup_phb, -#endif - .calibrate_decr = generic_calibrate_decr, - .progress = udbg_progress, -}; -- cgit v1.2.3-59-g8ed1b From d7c1814f2f4f812d7a39447f975abdc095dbf7aa Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Thu, 7 Jan 2021 13:45:32 -0500 Subject: powerpc: retire sbc8641d board support The support was for this was added to mainline over 12 years ago, in v2.6.26 [4e8aae89a35d] just around the ppc --> powerpc migration. I believe the board was introduced shortly after the sbc8548 board, making it roughly a 14 year old platform - with the CPU speed and memory size typical for that era. I haven't had one of these boards for several years, and availability was discontinued several years before that. Given that, there is no point in adding a burden to testing coverage that builds all possible defconfigs, so it makes sense to remove it. Of course it will remain in the git history forever, for anyone who happens to find a functional board and wants to tinker with it. Acked-by: Scott Wood Signed-off-by: Paul Gortmaker Signed-off-by: Michael Ellerman --- arch/powerpc/boot/dts/fsl/sbc8641d.dts | 176 ------------------------------- arch/powerpc/configs/mpc86xx_base.config | 1 - arch/powerpc/configs/ppc6xx_defconfig | 1 - arch/powerpc/platforms/86xx/Kconfig | 8 +- arch/powerpc/platforms/86xx/Makefile | 1 - arch/powerpc/platforms/86xx/sbc8641d.c | 87 --------------- 6 files changed, 1 insertion(+), 273 deletions(-) delete mode 100644 arch/powerpc/boot/dts/fsl/sbc8641d.dts delete mode 100644 arch/powerpc/platforms/86xx/sbc8641d.c (limited to 'arch/powerpc') diff --git a/arch/powerpc/boot/dts/fsl/sbc8641d.dts b/arch/powerpc/boot/dts/fsl/sbc8641d.dts deleted file mode 100644 index 3dca10acc161..000000000000 --- a/arch/powerpc/boot/dts/fsl/sbc8641d.dts +++ /dev/null @@ -1,176 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8641D Device Tree Source - * - * Copyright 2008 Wind River Systems Inc. - * - * Paul Gortmaker (see MAINTAINERS for contact information) - * - * Based largely on the mpc8641_hpcn.dts by Freescale Semiconductor Inc. - */ - -/include/ "mpc8641si-pre.dtsi" - -/ { - model = "SBC8641D"; - compatible = "wind,sbc8641"; - - memory { - device_type = "memory"; - reg = <0x00000000 0x20000000>; // 512M at 0x0 - }; - - lbc: localbus@f8005000 { - reg = <0xf8005000 0x1000>; - - ranges = <0 0 0xff000000 0x01000000 // 16MB Boot flash - 1 0 0xf0000000 0x00010000 // 64KB EEPROM - 2 0 0xf1000000 0x00100000 // EPLD (1MB) - 3 0 0xe0000000 0x04000000 // 64MB LB SDRAM (CS3) - 4 0 0xe4000000 0x04000000 // 64MB LB SDRAM (CS4) - 6 0 0xf4000000 0x00100000 // LCD display (1MB) - 7 0 0xe8000000 0x04000000>; // 64MB OneNAND - - flash@0,0 { - compatible = "cfi-flash"; - reg = <0 0 0x01000000>; - bank-width = <2>; - device-width = <2>; - #address-cells = <1>; - #size-cells = <1>; - partition@0 { - label = "dtb"; - reg = <0x00000000 0x00100000>; - read-only; - }; - partition@300000 { - label = "kernel"; - reg = <0x00100000 0x00400000>; - read-only; - }; - partition@400000 { - label = "fs"; - reg = <0x00500000 0x00a00000>; - }; - partition@700000 { - label = "firmware"; - reg = <0x00f00000 0x00100000>; - read-only; - }; - }; - - epld@2,0 { - compatible = "wrs,epld-localbus"; - #address-cells = <2>; - #size-cells = <1>; - reg = <2 0 0x100000>; - ranges = <0 0 5 0 1 // User switches - 1 0 5 1 1 // Board ID/Rev - 3 0 5 3 1>; // LEDs - }; - }; - - soc: soc@f8000000 { - ranges = <0x00000000 0xf8000000 0x00100000>; - - enet0: ethernet@24000 { - tbi-handle = <&tbi0>; - phy-handle = <&phy0>; - phy-connection-type = "rgmii-id"; - }; - - mdio@24520 { - phy0: ethernet-phy@1f { - reg = <0x1f>; - }; - phy1: ethernet-phy@0 { - reg = <0>; - }; - phy2: ethernet-phy@1 { - reg = <1>; - }; - phy3: ethernet-phy@2 { - reg = <2>; - }; - tbi0: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - - enet1: ethernet@25000 { - tbi-handle = <&tbi1>; - phy-handle = <&phy1>; - phy-connection-type = "rgmii-id"; - }; - - mdio@25520 { - tbi1: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - - enet2: ethernet@26000 { - tbi-handle = <&tbi2>; - phy-handle = <&phy2>; - phy-connection-type = "rgmii-id"; - }; - - mdio@26520 { - tbi2: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - - enet3: ethernet@27000 { - tbi-handle = <&tbi3>; - phy-handle = <&phy3>; - phy-connection-type = "rgmii-id"; - }; - - mdio@27520 { - tbi3: tbi-phy@11 { - reg = <0x11>; - device_type = "tbi-phy"; - }; - }; - }; - - pci0: pcie@f8008000 { - reg = <0xf8008000 0x1000>; - ranges = <0x02000000 0x0 0x80000000 0x80000000 0x0 0x20000000 - 0x01000000 0x0 0x00000000 0xe2000000 0x0 0x00100000>; - interrupt-map-mask = <0xff00 0 0 7>; - - pcie@0 { - ranges = <0x02000000 0x0 0x80000000 - 0x02000000 0x0 0x80000000 - 0x0 0x20000000 - - 0x01000000 0x0 0x00000000 - 0x01000000 0x0 0x00000000 - 0x0 0x00100000>; - }; - - }; - - pci1: pcie@f8009000 { - reg = <0xf8009000 0x1000>; - ranges = <0x02000000 0x0 0xa0000000 0xa0000000 0x0 0x20000000 - 0x01000000 0x0 0x00000000 0xe3000000 0x0 0x00100000>; - - pcie@0 { - ranges = <0x02000000 0x0 0xa0000000 - 0x02000000 0x0 0xa0000000 - 0x0 0x20000000 - - 0x01000000 0x0 0x00000000 - 0x01000000 0x0 0x00000000 - 0x0 0x00100000>; - }; - }; -}; - -/include/ "mpc8641si-post.dtsi" diff --git a/arch/powerpc/configs/mpc86xx_base.config b/arch/powerpc/configs/mpc86xx_base.config index 67bd1fa036ee..588870e6af3b 100644 --- a/arch/powerpc/configs/mpc86xx_base.config +++ b/arch/powerpc/configs/mpc86xx_base.config @@ -1,6 +1,5 @@ CONFIG_PPC_86xx=y CONFIG_MPC8641_HPCN=y -CONFIG_SBC8641D=y CONFIG_MPC8610_HPCD=y CONFIG_GEF_PPC9A=y CONFIG_GEF_SBC310=y diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index ee09f7bb6ea9..6697c5e6682f 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -55,7 +55,6 @@ CONFIG_MPC837x_RDB=y CONFIG_ASP834x=y CONFIG_PPC_86xx=y CONFIG_MPC8641_HPCN=y -CONFIG_SBC8641D=y CONFIG_MPC8610_HPCD=y CONFIG_GEF_SBC610=y CONFIG_CPU_FREQ=y diff --git a/arch/powerpc/platforms/86xx/Kconfig b/arch/powerpc/platforms/86xx/Kconfig index 07a9d60c618a..be867abebc83 100644 --- a/arch/powerpc/platforms/86xx/Kconfig +++ b/arch/powerpc/platforms/86xx/Kconfig @@ -20,12 +20,6 @@ config MPC8641_HPCN help This option enables support for the MPC8641 HPCN board. -config SBC8641D - bool "Wind River SBC8641D" - select DEFAULT_UIMAGE - help - This option enables support for the WRS SBC8641D board. - config MPC8610_HPCD bool "Freescale MPC8610 HPCD" select DEFAULT_UIMAGE @@ -74,7 +68,7 @@ config MPC8641 select FSL_PCI if PCI select PPC_UDBG_16550 select MPIC - default y if MPC8641_HPCN || SBC8641D || GEF_SBC610 || GEF_SBC310 || GEF_PPC9A \ + default y if MPC8641_HPCN || GEF_SBC610 || GEF_SBC310 || GEF_PPC9A \ || MVME7100 config MPC8610 diff --git a/arch/powerpc/platforms/86xx/Makefile b/arch/powerpc/platforms/86xx/Makefile index 2c04449be107..5bbe1475bf26 100644 --- a/arch/powerpc/platforms/86xx/Makefile +++ b/arch/powerpc/platforms/86xx/Makefile @@ -6,7 +6,6 @@ obj-y := pic.o common.o obj-$(CONFIG_SMP) += mpc86xx_smp.o obj-$(CONFIG_MPC8641_HPCN) += mpc86xx_hpcn.o -obj-$(CONFIG_SBC8641D) += sbc8641d.o obj-$(CONFIG_MPC8610_HPCD) += mpc8610_hpcd.o obj-$(CONFIG_GEF_SBC610) += gef_sbc610.o obj-$(CONFIG_GEF_SBC310) += gef_sbc310.o diff --git a/arch/powerpc/platforms/86xx/sbc8641d.c b/arch/powerpc/platforms/86xx/sbc8641d.c deleted file mode 100644 index dc23dd383d6e..000000000000 --- a/arch/powerpc/platforms/86xx/sbc8641d.c +++ /dev/null @@ -1,87 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * SBC8641D board specific routines - * - * Copyright 2008 Wind River Systems Inc. - * - * By Paul Gortmaker (see MAINTAINERS for contact information) - * - * Based largely on the 8641 HPCN support by Freescale Semiconductor Inc. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include - -#include -#include - -#include "mpc86xx.h" - -static void __init -sbc8641_setup_arch(void) -{ - if (ppc_md.progress) - ppc_md.progress("sbc8641_setup_arch()", 0); - - printk("SBC8641 board from Wind River\n"); - -#ifdef CONFIG_SMP - mpc86xx_smp_init(); -#endif - - fsl_pci_assign_primary(); -} - - -static void -sbc8641_show_cpuinfo(struct seq_file *m) -{ - uint svid = mfspr(SPRN_SVR); - - seq_printf(m, "Vendor\t\t: Wind River Systems\n"); - - seq_printf(m, "SVR\t\t: 0x%x\n", svid); -} - - -/* - * Called very early, device-tree isn't unflattened - */ -static int __init sbc8641_probe(void) -{ - if (of_machine_is_compatible("wind,sbc8641")) - return 1; /* Looks good */ - - return 0; -} - -machine_arch_initcall(sbc8641, mpc86xx_common_publish_devices); - -define_machine(sbc8641) { - .name = "SBC8641D", - .probe = sbc8641_probe, - .setup_arch = sbc8641_setup_arch, - .init_IRQ = mpc86xx_init_irq, - .show_cpuinfo = sbc8641_show_cpuinfo, - .get_irq = mpic_get_irq, - .time_init = mpc86xx_time_init, - .calibrate_decr = generic_calibrate_decr, - .progress = udbg_progress, -#ifdef CONFIG_PCI - .pcibios_fixup_bus = fsl_pcibios_fixup_bus, -#endif -}; -- cgit v1.2.3-59-g8ed1b From 8149238ffd210875f5a77e3c654bb59b58da35e3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Wed, 25 Aug 2021 13:34:45 +0000 Subject: powerpc: Redefine HMT_xxx macros as empty on PPC32 HMT_xxx macros are macros for adjusting thread priority (hardware multi-threading) are macros inherited from PPC64 via commit 5f7c690728ac ("[PATCH] powerpc: Merged ppc_asm.h") Those instructions are pointless on PPC32, but some common fonctions like arch_cpu_idle() use them. So make them empty on PPC32 to avoid those instructions. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/c5a07fadea33d640ad10cecf0ac8faaec1c524e0.1629898474.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/vdso/processor.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/vdso/processor.h b/arch/powerpc/include/asm/vdso/processor.h index e072577bc7c0..8d79f994b4aa 100644 --- a/arch/powerpc/include/asm/vdso/processor.h +++ b/arch/powerpc/include/asm/vdso/processor.h @@ -5,12 +5,21 @@ #ifndef __ASSEMBLY__ /* Macros for adjusting thread priority (hardware multi-threading) */ +#ifdef CONFIG_PPC64 #define HMT_very_low() asm volatile("or 31, 31, 31 # very low priority") #define HMT_low() asm volatile("or 1, 1, 1 # low priority") #define HMT_medium_low() asm volatile("or 6, 6, 6 # medium low priority") #define HMT_medium() asm volatile("or 2, 2, 2 # medium priority") #define HMT_medium_high() asm volatile("or 5, 5, 5 # medium high priority") #define HMT_high() asm volatile("or 3, 3, 3 # high priority") +#else +#define HMT_very_low() +#define HMT_low() +#define HMT_medium_low() +#define HMT_medium() +#define HMT_medium_high() +#define HMT_high() +#endif #ifdef CONFIG_PPC64 #define cpu_relax() do { HMT_low(); HMT_medium(); barrier(); } while (0) -- cgit v1.2.3-59-g8ed1b From 602d0f96563c2e0b8e1ddb22ac46bf7f58480d64 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 26 Aug 2021 21:56:51 +0930 Subject: powerpc/microwatt: Add Ethernet to device tree The liteeth network device is used in the Microwatt soc. Signed-off-by: Joel Stanley Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826122653.3236867-2-joel@jms.id.au --- arch/powerpc/boot/dts/microwatt.dts | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/boot/dts/microwatt.dts b/arch/powerpc/boot/dts/microwatt.dts index 974abbdda249..65b270a90f94 100644 --- a/arch/powerpc/boot/dts/microwatt.dts +++ b/arch/powerpc/boot/dts/microwatt.dts @@ -127,6 +127,18 @@ fifo-size = <16>; interrupts = <0x10 0x1>; }; + + ethernet@8020000 { + compatible = "litex,liteeth"; + reg = <0x8021000 0x100 + 0x8020800 0x100 + 0x8030000 0x2000>; + reg-names = "mac", "mido", "buffer"; + litex,rx-slots = <2>; + litex,tx-slots = <2>; + litex,slot-size = <0x800>; + interrupts = <0x11 0x1>; + }; }; chosen { -- cgit v1.2.3-59-g8ed1b From ef4fcaf99cd27eb48790f2adc4eff456dbe1dec4 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 26 Aug 2021 21:56:52 +0930 Subject: powerpc/configs/microwattt: Enable Liteeth Liteeth is the network device used by Microwatt. Signed-off-by: Joel Stanley Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826122653.3236867-3-joel@jms.id.au --- arch/powerpc/configs/microwatt_defconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/configs/microwatt_defconfig b/arch/powerpc/configs/microwatt_defconfig index ebc90aefbc0c..6b76232094b2 100644 --- a/arch/powerpc/configs/microwatt_defconfig +++ b/arch/powerpc/configs/microwatt_defconfig @@ -53,6 +53,7 @@ CONFIG_MTD_SPI_NOR=y CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_NETDEVICES=y +CONFIG_LITEX_LITEETH=y # CONFIG_WLAN is not set # CONFIG_INPUT is not set # CONFIG_SERIO is not set -- cgit v1.2.3-59-g8ed1b From 3e18e271182206c996a3a7efbbe70c66307ef137 Mon Sep 17 00:00:00 2001 From: Joel Stanley Date: Thu, 26 Aug 2021 21:56:53 +0930 Subject: powerpc/configs/microwatt: Enable options for systemd When booting with systemd these options are required. This increases the image by about 50KB, or 2%. Signed-off-by: Joel Stanley Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826122653.3236867-4-joel@jms.id.au --- arch/powerpc/configs/microwatt_defconfig | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/configs/microwatt_defconfig b/arch/powerpc/configs/microwatt_defconfig index 6b76232094b2..9465209b8c5b 100644 --- a/arch/powerpc/configs/microwatt_defconfig +++ b/arch/powerpc/configs/microwatt_defconfig @@ -5,6 +5,7 @@ CONFIG_PREEMPT_VOLUNTARY=y CONFIG_TICK_CPU_ACCOUNTING=y CONFIG_LOG_BUF_SHIFT=16 CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=12 +CONFIG_CGROUPS=y CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_KALLSYMS_ALL=y @@ -78,8 +79,10 @@ CONFIG_SPI_SPIDEV=y CONFIG_EXT4_FS=y # CONFIG_FILE_LOCKING is not set # CONFIG_DNOTIFY is not set -# CONFIG_INOTIFY_USER is not set +CONFIG_AUTOFS_FS=y +CONFIG_TMPFS=y # CONFIG_MISC_FILESYSTEMS is not set +CONFIG_CRYPTO_SHA256=y # CONFIG_CRYPTO_HW is not set # CONFIG_XZ_DEC_X86 is not set # CONFIG_XZ_DEC_IA64 is not set -- cgit v1.2.3-59-g8ed1b From 8efd249babea2fec268cff90b9f5ca723dbb7499 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:33:59 +0530 Subject: powerpc/smp: Fix a crash while booting kvm guest with nr_cpus=2 Aneesh reported a crash with a fairly recent upstream kernel when booting kernel whose commandline was appended with nr_cpus=2 1:mon> e cpu 0x1: Vector: 300 (Data Access) at [c000000008a67bd0] pc: c00000000002557c: cpu_to_chip_id+0x3c/0x100 lr: c000000000058380: start_secondary+0x460/0xb00 sp: c000000008a67e70 msr: 8000000000001033 dar: 10 dsisr: 80000 current = 0xc00000000891bb00 paca = 0xc0000018ff981f80 irqmask: 0x03 irq_happened: 0x01 pid = 0, comm = swapper/1 Linux version 5.13.0-rc3-15704-ga050a6d2b7e8 (kvaneesh@ltc-boston8) (gcc (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0, GNU ld (GNU Binutils for Ubuntu) 2.34) #433 SMP Tue May 25 02:38:49 CDT 2021 1:mon> t [link register ] c000000000058380 start_secondary+0x460/0xb00 [c000000008a67e70] c000000008a67eb0 (unreliable) [c000000008a67eb0] c0000000000589d4 start_secondary+0xab4/0xb00 [c000000008a67f90] c00000000000c654 start_secondary_prolog+0x10/0x14 Current code assumes that num_possible_cpus() is always greater than threads_per_core. However this may not be true when using nr_cpus=2 or similar options. Handle the case where num_possible_cpus() is not an exact multiple of threads_per_core. Fixes: c1e53367dab1 ("powerpc/smp: Cache CPU to chip lookup") Reported-by: Aneesh Kumar K.V Signed-off-by: Srikar Dronamraju Debugged-by: Michael Ellerman Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100401.412519-2-srikar@linux.vnet.ibm.com --- arch/powerpc/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index f2abd88e0c25..a3eab89ee572 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1109,7 +1109,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } if (cpu_to_chip_id(boot_cpuid) != -1) { - int idx = num_possible_cpus() / threads_per_core; + int idx = DIV_ROUND_UP(num_possible_cpus(), threads_per_core); /* * All threads of a core will all belong to the same core, -- cgit v1.2.3-59-g8ed1b From b8b928030332a0ca16d42433eb2c3085600d8704 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:34:00 +0530 Subject: powerpc/smp: Update cpu_core_map on all PowerPc systems lscpu() uses core_siblings to list the number of sockets in the system. core_siblings is set using topology_core_cpumask. While optimizing the powerpc bootup path, Commit 4ca234a9cbd7 ("powerpc/smp: Stop updating cpu_core_mask"). it was found that updating cpu_core_mask() ended up taking a lot of time. It was thought that on Powerpc, cpu_core_mask() would always be same as cpu_cpu_mask() i.e number of sockets will always be equal to number of nodes. As an optimization, cpu_core_mask() was made a snapshot of cpu_cpu_mask(). However that was found to be false with PowerPc KVM guests, where each node could have more than one socket. So with Commit c47f892d7aa6 ("powerpc/smp: Reintroduce cpu_core_mask"), cpu_core_mask was updated based on chip_id but in an optimized way using some mask manipulations and chip_id caching. However on non-PowerNV and non-pseries KVM guests (i.e not implementing cpu_to_chip_id(), continued to use a copy of cpu_cpu_mask(). There are two issues that were noticed on such systems 1. lscpu would report one extra socket. On a IBM,9009-42A (aka zz system) which has only 2 chips/ sockets/ nodes, lscpu would report Architecture: ppc64le Byte Order: Little Endian CPU(s): 160 On-line CPU(s) list: 0-159 Thread(s) per core: 8 Core(s) per socket: 6 Socket(s): 3 <-------------- NUMA node(s): 2 Model: 2.2 (pvr 004e 0202) Model name: POWER9 (architected), altivec supported Hypervisor vendor: pHyp Virtualization type: para L1d cache: 32K L1i cache: 32K L2 cache: 512K L3 cache: 10240K NUMA node0 CPU(s): 0-79 NUMA node1 CPU(s): 80-159 2. Currently cpu_cpu_mask is updated when a core is added/removed. However its not updated when smt mode switching or on CPUs are explicitly offlined. However all other percpu masks are updated to ensure only active/online CPUs are in the masks. This results in build_sched_domain traces since there will be CPUs in cpu_cpu_mask() but those CPUs are not present in SMT / CACHE / MC / NUMA domains. A loop of threads running smt mode switching and core add/remove will soon show this trace. Hence cpu_cpu_mask has to be update at smt mode switch. This will have impact on cpu_core_mask(). cpu_core_mask() is a snapshot of cpu_cpu_mask. Different CPUs within the same socket will end up having different cpu_core_masks since they are snapshots at different points of time. This means when lscpu will start reporting many more sockets than the actual number of sockets/ nodes / chips. Different ways to handle this problem: A. Update the snapshot aka cpu_core_mask for all CPUs whenever cpu_cpu_mask is updated. This would a non-optimal solution. B. Instead of a cpumask_var_t, make cpu_core_map a cpumask pointer pointing to cpu_cpu_mask. However percpu cpumask pointer is frowned upon and we need a clean way to handle PowerPc KVM guest which is not a snapshot. C. Update cpu_core_masks all PowerPc systems like in PowerPc KVM guests using mask manipulations. This approach is relatively simple and unifies with the existing code. D. On top of 3, we could also resurrect get_physical_package_id which could return a nid for the said CPU. However this is not needed at this time. Option C is the preferred approach for now. While this is somewhat a revert of Commit 4ca234a9cbd7 ("powerpc/smp: Stop updating cpu_core_mask"). 1. Plain revert has some conflicts 2. For chip_id == -1, the cpu_core_mask is made identical to cpu_cpu_mask, unlike previously where cpu_core_mask was set to a core if chip_id doesn't exist. This goes by the principle that if chip_id is not exposed, then sockets / chip / node share the same set of CPUs. With the fix, lscpu o/p would be Architecture: ppc64le Byte Order: Little Endian CPU(s): 160 On-line CPU(s) list: 0-159 Thread(s) per core: 8 Core(s) per socket: 6 Socket(s): 2 <-------------- NUMA node(s): 2 Model: 2.2 (pvr 004e 0202) Model name: POWER9 (architected), altivec supported Hypervisor vendor: pHyp Virtualization type: para L1d cache: 32K L1i cache: 32K L2 cache: 512K L3 cache: 10240K NUMA node0 CPU(s): 0-79 NUMA node1 CPU(s): 80-159 Fixes: 4ca234a9cbd7 ("powerpc/smp: Stop updating cpu_core_mask") Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100401.412519-3-srikar@linux.vnet.ibm.com --- arch/powerpc/kernel/smp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index a3eab89ee572..4bdc339fff3c 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1527,6 +1527,7 @@ static void add_cpu_to_masks(int cpu) * add it to it's own thread sibling mask. */ cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); + cpumask_set_cpu(cpu, cpu_core_mask(cpu)); for (i = first_thread; i < first_thread + threads_per_core; i++) if (cpu_online(i)) @@ -1544,11 +1545,6 @@ static void add_cpu_to_masks(int cpu) if (chip_id_lookup_table && ret) chip_id = cpu_to_chip_id(cpu); - if (chip_id == -1) { - cpumask_copy(per_cpu(cpu_core_map, cpu), cpu_cpu_mask(cpu)); - goto out; - } - if (shared_caches) submask_fn = cpu_l2_cache_mask; @@ -1558,6 +1554,10 @@ static void add_cpu_to_masks(int cpu) /* Skip all CPUs already part of current CPU core mask */ cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu)); + /* If chip_id is -1; limit the cpu_core_mask to within DIE*/ + if (chip_id == -1) + cpumask_and(mask, mask, cpu_cpu_mask(cpu)); + for_each_cpu(i, mask) { if (chip_id == cpu_to_chip_id(i)) { or_cpumasks_related(cpu, i, submask_fn, cpu_core_mask); @@ -1567,7 +1567,6 @@ static void add_cpu_to_masks(int cpu) } } -out: free_cpumask_var(mask); } -- cgit v1.2.3-59-g8ed1b From 5bf63497b8ddf53d8973f68076119e482639b2bb Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:34:01 +0530 Subject: powerpc/smp: Enable CACHE domain for shared processor Currently CACHE domain is not enabled on shared processor mode PowerVM LPARS. On PowerVM systems, 'ibm,thread-group' device-tree property 2 under cpu-device-node indicates which all CPUs share L2-cache. However 'ibm,thread-group' device-tree property 2 is a relatively new property. In absence of 'ibm,thread-group' property 2, 'l2-cache' device property under cpu-device-node could help system to identify CPUs sharing L2-cache. However this property is not exposed by PhyP in shared processor mode configurations. In absence of properties that inform OS about which CPUs share L2-cache, fallback on core boundary. Here are some stats from Power9 shared LPAR with the changes. $ lscpu Architecture: ppc64le Byte Order: Little Endian CPU(s): 32 On-line CPU(s) list: 0-31 Thread(s) per core: 8 Core(s) per socket: 1 Socket(s): 3 NUMA node(s): 2 Model: 2.2 (pvr 004e 0202) Model name: POWER9 (architected), altivec supported Hypervisor vendor: pHyp Virtualization type: para L1d cache: 32K L1i cache: 32K NUMA node0 CPU(s): 16-23 NUMA node1 CPU(s): 0-15,24-31 Physical sockets: 2 Physical chips: 1 Physical cores/chip: 10 Before patch $ grep -r . /sys/kernel/debug/sched/domains/cpu0/domain*/name Before /sys/kernel/debug/sched/domains/cpu0/domain0/name:SMT /sys/kernel/debug/sched/domains/cpu0/domain1/name:DIE /sys/kernel/debug/sched/domains/cpu0/domain2/name:NUMA After /sys/kernel/debug/sched/domains/cpu0/domain0/name:SMT /sys/kernel/debug/sched/domains/cpu0/domain1/name:CACHE /sys/kernel/debug/sched/domains/cpu0/domain2/name:DIE /sys/kernel/debug/sched/domains/cpu0/domain3/name:NUMA $ awk '/domain/{print $1, $2}' /proc/schedstat | sort -u | sed -e 's/00000000,//g' Before domain0 00000055 domain0 000000aa domain0 00005500 domain0 0000aa00 domain0 00550000 domain0 00aa0000 domain0 55000000 domain0 aa000000 domain1 00ff0000 domain1 ff00ffff domain2 ffffffff After domain0 00000055 domain0 000000aa domain0 00005500 domain0 0000aa00 domain0 00550000 domain0 00aa0000 domain0 55000000 domain0 aa000000 domain1 000000ff domain1 0000ff00 domain1 00ff0000 domain1 ff000000 domain2 ff00ffff domain2 ffffffff domain3 ffffffff (Lower is better) perf stat -a -r 5 -n perf bench sched pipe | tail -n 2 Before 153.798 +- 0.142 seconds time elapsed ( +- 0.09% ) After 111.545 +- 0.652 seconds time elapsed ( +- 0.58% ) which is an improvement of 27.47% Signed-off-by: Srikar Dronamraju Reviewed-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100401.412519-4-srikar@linux.vnet.ibm.com --- arch/powerpc/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 4bdc339fff3c..548aa58d25f9 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1400,7 +1400,7 @@ static bool update_mask_by_l2(int cpu, cpumask_var_t *mask) l2_cache = cpu_to_l2cache(cpu); if (!l2_cache || !*mask) { /* Assume only core siblings share cache with this CPU */ - for_each_cpu(i, submask_fn(cpu)) + for_each_cpu(i, cpu_sibling_mask(cpu)) set_cpus_related(cpu, i, cpu_l2_cache_mask); return false; -- cgit v1.2.3-59-g8ed1b From 544af6429777cefae2f8af9a9866df5e8cb21763 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:35:17 +0530 Subject: powerpc/numa: Drop dbg in favour of pr_debug powerpc supported numa=debug which is not documented. This option was used to print early debug output. However something more flexible can be achieved by using CONFIG_DYNAMIC_DEBUG. Hence drop dbg (and numa=debug) in favour of pr_debug Suggested-by: Michael Ellerman Signed-off-by: Srikar Dronamraju [mpe: Rebase on to powerpc/next form2 affinity changes] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100521.412639-2-srikar@linux.vnet.ibm.com --- arch/powerpc/mm/numa.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index fe9c6ced40b2..73ba6b072274 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -40,9 +40,6 @@ static int numa_enabled = 1; static char *cmdline __initdata; -static int numa_debug; -#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); } - int numa_cpu_lookup_table[NR_CPUS]; cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; struct pglist_data *node_data[MAX_NUMNODES]; @@ -87,7 +84,7 @@ static void __init setup_node_to_cpumask_map(void) alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]); /* cpumask_of_node() will now work */ - dbg("Node to cpumask map for %u nodes\n", nr_node_ids); + pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids); } static int __init fake_numa_create_new_node(unsigned long end_pfn, @@ -131,7 +128,7 @@ static int __init fake_numa_create_new_node(unsigned long end_pfn, cmdline = p; fake_nid++; *nid = fake_nid; - dbg("created new fake_node with id %d\n", fake_nid); + pr_debug("created new fake_node with id %d\n", fake_nid); return 1; } return 0; @@ -149,7 +146,7 @@ static void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); - dbg("adding cpu %d to node %d\n", cpu, node); + pr_debug("adding cpu %d to node %d\n", cpu, node); if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) cpumask_set_cpu(cpu, node_to_cpumask_map[node]); @@ -160,7 +157,7 @@ static void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; - dbg("removing cpu %lu from node %d\n", cpu, node); + pr_debug("removing cpu %lu from node %d\n", cpu, node); if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); @@ -450,10 +447,10 @@ static int __init find_primary_domain_index(void) if (firmware_has_feature(FW_FEATURE_OPAL)) { affinity_form = FORM1_AFFINITY; } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) { - dbg("Using form 2 affinity\n"); + pr_debug("Using form 2 affinity\n"); affinity_form = FORM2_AFFINITY; } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) { - dbg("Using form 1 affinity\n"); + pr_debug("Using form 1 affinity\n"); affinity_form = FORM1_AFFINITY; } else affinity_form = FORM0_AFFINITY; @@ -482,7 +479,7 @@ static int __init find_primary_domain_index(void) &distance_ref_points_depth); if (!distance_ref_points) { - dbg("NUMA: ibm,associativity-reference-points not found.\n"); + pr_debug("NUMA: ibm,associativity-reference-points not found.\n"); goto err; } @@ -928,7 +925,7 @@ static int __init parse_numa_properties(void) return primary_domain_index; } - dbg("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); + pr_debug("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); /* * If it is FORM2 initialize the distance table here. @@ -1251,9 +1248,6 @@ static int __init early_numa(char *p) if (strstr(p, "off")) numa_enabled = 0; - if (strstr(p, "debug")) - numa_debug = 1; - p = strstr(p, "fake="); if (p) cmdline = p + strlen("fake="); @@ -1416,7 +1410,7 @@ static long vphn_get_associativity(unsigned long cpu, switch (rc) { case H_SUCCESS: - dbg("VPHN hcall succeeded. Reset polling...\n"); + pr_debug("VPHN hcall succeeded. Reset polling...\n"); goto out; case H_FUNCTION: -- cgit v1.2.3-59-g8ed1b From 506c2075ffd8db352c53201ef166948a272e3bce Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:35:18 +0530 Subject: powerpc/numa: convert printk to pr_xxx Convert the remaining printk to pr_xxx One advantage would be all prints will now have prefix "numa:" from pr_fmt(). [ convert printk(KERN_ERR) to pr_warn : Suggested by Laurent Dufour ] Suggested-by: Michael Ellerman Signed-off-by: Srikar Dronamraju [mpe: Rebase onto powerpc/next, s/WARNING/Warning/] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100521.412639-3-srikar@linux.vnet.ibm.com --- arch/powerpc/mm/numa.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 73ba6b072274..f4e52581713e 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -162,8 +162,7 @@ static void unmap_cpu_from_node(unsigned long cpu) if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); } else { - printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n", - cpu, node); + pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node); } } #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ @@ -479,15 +478,14 @@ static int __init find_primary_domain_index(void) &distance_ref_points_depth); if (!distance_ref_points) { - pr_debug("NUMA: ibm,associativity-reference-points not found.\n"); + pr_debug("ibm,associativity-reference-points not found.\n"); goto err; } distance_ref_points_depth /= sizeof(int); if (affinity_form == FORM0_AFFINITY) { if (distance_ref_points_depth < 2) { - printk(KERN_WARNING "NUMA: " - "short ibm,associativity-reference-points\n"); + pr_warn("short ibm,associativity-reference-points\n"); goto err; } @@ -504,8 +502,8 @@ static int __init find_primary_domain_index(void) * MAX_DISTANCE_REF_POINTS domains. */ if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { - printk(KERN_WARNING "NUMA: distance array capped at " - "%d entries\n", MAX_DISTANCE_REF_POINTS); + pr_warn("distance array capped at %d entries\n", + MAX_DISTANCE_REF_POINTS); distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; } @@ -910,7 +908,7 @@ static int __init parse_numa_properties(void) const __be32 *associativity; if (numa_enabled == 0) { - printk(KERN_WARNING "NUMA disabled by user\n"); + pr_warn("disabled by user\n"); return -1; } @@ -925,7 +923,7 @@ static int __init parse_numa_properties(void) return primary_domain_index; } - pr_debug("NUMA associativity depth for CPU/Memory: %d\n", primary_domain_index); + pr_debug("associativity depth for CPU/Memory: %d\n", primary_domain_index); /* * If it is FORM2 initialize the distance table here. @@ -1038,10 +1036,8 @@ static void __init setup_nonnuma(void) unsigned int nid = 0; int i; - printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n", - top_of_ram, total_ram); - printk(KERN_DEBUG "Memory hole size: %ldMB\n", - (top_of_ram - total_ram) >> 20); + pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram); + pr_debug("Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20); for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { fake_numa_create_new_node(end_pfn, &nid); -- cgit v1.2.3-59-g8ed1b From 544a09ee7434b949552266d20ece538d35535bd5 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:35:19 +0530 Subject: powerpc/numa: Print debug statements only when required Currently, a debug message gets printed every time an attempt to add(remove) a CPU. However this is redundant if the CPU is already added (removed) from the node. Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100521.412639-4-srikar@linux.vnet.ibm.com --- arch/powerpc/mm/numa.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index f4e52581713e..52bc63ee6a6a 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -146,10 +146,10 @@ static void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); - pr_debug("adding cpu %d to node %d\n", cpu, node); - - if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) + if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) { + pr_debug("adding cpu %d to node %d\n", cpu, node); cpumask_set_cpu(cpu, node_to_cpumask_map[node]); + } } #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) @@ -157,10 +157,9 @@ static void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; - pr_debug("removing cpu %lu from node %d\n", cpu, node); - if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) { cpumask_clear_cpu(cpu, node_to_cpumask_map[node]); + pr_debug("removing cpu %lu from node %d\n", cpu, node); } else { pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node); } -- cgit v1.2.3-59-g8ed1b From 9a245d0e1f006bc7ccf0285d0d520ed304d00c4a Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Thu, 26 Aug 2021 15:35:20 +0530 Subject: powerpc/numa: Update cpu_cpu_map on CPU online/offline cpu_cpu_map holds all the CPUs in the DIE. However in PowerPC, when onlining/offlining of CPUs, this mask doesn't get updated. This mask is however updated when CPUs are added/removed. So when both operations like online/offline of CPUs and adding/removing of CPUs are done simultaneously, then cpumaps end up broken. WARNING: CPU: 13 PID: 1142 at kernel/sched/topology.c:898 build_sched_domains+0xd48/0x1720 Modules linked in: rpadlpar_io rpaphp mptcp_diag xsk_diag tcp_diag udp_diag raw_diag inet_diag unix_diag af_packet_diag netlink_diag bonding tls nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set rfkill nf_tables nfnetlink pseries_rng xts vmx_crypto uio_pdrv_genirq uio binfmt_misc ip_tables xfs libcrc32c dm_service_time sd_mod t10_pi sg ibmvfc scsi_transport_fc ibmveth dm_multipath dm_mirror dm_region_hash dm_log dm_mod fuse CPU: 13 PID: 1142 Comm: kworker/13:2 Not tainted 5.13.0-rc6+ #28 Workqueue: events cpuset_hotplug_workfn NIP: c0000000001caac8 LR: c0000000001caac4 CTR: 00000000007088ec REGS: c00000005596f220 TRAP: 0700 Not tainted (5.13.0-rc6+) MSR: 8000000000029033 CR: 48828222 XER: 00000009 CFAR: c0000000001ea698 IRQMASK: 0 GPR00: c0000000001caac4 c00000005596f4c0 c000000001c4a400 0000000000000036 GPR04: 00000000fffdffff c00000005596f1d0 0000000000000027 c0000018cfd07f90 GPR08: 0000000000000023 0000000000000001 0000000000000027 c0000018fe68ffe8 GPR12: 0000000000008000 c00000001e9d1880 c00000013a047200 0000000000000800 GPR16: c000000001d3c7d0 0000000000000240 0000000000000048 c000000010aacd18 GPR20: 0000000000000001 c000000010aacc18 c00000013a047c00 c000000139ec2400 GPR24: 0000000000000280 c000000139ec2520 c000000136c1b400 c000000001c93060 GPR28: c00000013a047c20 c000000001d3c6c0 c000000001c978a0 000000000000000d NIP [c0000000001caac8] build_sched_domains+0xd48/0x1720 LR [c0000000001caac4] build_sched_domains+0xd44/0x1720 Call Trace: [c00000005596f4c0] [c0000000001caac4] build_sched_domains+0xd44/0x1720 (unreliable) [c00000005596f670] [c0000000001cc5ec] partition_sched_domains_locked+0x3ac/0x4b0 [c00000005596f710] [c0000000002804e4] rebuild_sched_domains_locked+0x404/0x9e0 [c00000005596f810] [c000000000283e60] rebuild_sched_domains+0x40/0x70 [c00000005596f840] [c000000000284124] cpuset_hotplug_workfn+0x294/0xf10 [c00000005596fc60] [c000000000175040] process_one_work+0x290/0x590 [c00000005596fd00] [c0000000001753c8] worker_thread+0x88/0x620 [c00000005596fda0] [c000000000181704] kthread+0x194/0x1a0 [c00000005596fe10] [c00000000000ccec] ret_from_kernel_thread+0x5c/0x70 Instruction dump: 485af049 60000000 2fa30800 409e0028 80fe0000 e89a00f8 e86100e8 38da0120 7f88e378 7ce53b78 4801fb91 60000000 <0fe00000> 39000000 38e00000 38c00000 Fix this by updating cpu_cpu_map aka cpumask_of_node() on every CPU online/offline. Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210826100521.412639-5-srikar@linux.vnet.ibm.com --- arch/powerpc/include/asm/topology.h | 13 +++++++++++++ arch/powerpc/kernel/smp.c | 3 +++ arch/powerpc/mm/numa.c | 7 ++----- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index a4712ecad3e9..36fcafb1fd6d 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -66,6 +66,11 @@ static inline int early_cpu_to_node(int cpu) int of_drconf_to_nid_single(struct drmem_lmb *lmb); void update_numa_distance(struct device_node *node); +extern void map_cpu_to_node(int cpu, int node); +#ifdef CONFIG_HOTPLUG_CPU +extern void unmap_cpu_from_node(unsigned long cpu); +#endif /* CONFIG_HOTPLUG_CPU */ + #else static inline int early_cpu_to_node(int cpu) { return 0; } @@ -95,6 +100,14 @@ static inline int of_drconf_to_nid_single(struct drmem_lmb *lmb) } static inline void update_numa_distance(struct device_node *node) {} + +#ifdef CONFIG_SMP +static inline void map_cpu_to_node(int cpu, int node) {} +#ifdef CONFIG_HOTPLUG_CPU +static inline void unmap_cpu_from_node(unsigned long cpu) {} +#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_SMP */ + #endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 548aa58d25f9..9cc7d3dbf439 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -1442,6 +1442,8 @@ static void remove_cpu_from_masks(int cpu) struct cpumask *(*mask_fn)(int) = cpu_sibling_mask; int i; + unmap_cpu_from_node(cpu); + if (shared_caches) mask_fn = cpu_l2_cache_mask; @@ -1526,6 +1528,7 @@ static void add_cpu_to_masks(int cpu) * This CPU will not be in the online mask yet so we need to manually * add it to it's own thread sibling mask. */ + map_cpu_to_node(cpu, cpu_to_node(cpu)); cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); cpumask_set_cpu(cpu, cpu_core_mask(cpu)); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 52bc63ee6a6a..6f14c8fb6359 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -142,7 +142,7 @@ static void reset_numa_cpu_lookup_table(void) numa_cpu_lookup_table[cpu] = -1; } -static void map_cpu_to_node(int cpu, int node) +void map_cpu_to_node(int cpu, int node) { update_numa_cpu_lookup_table(cpu, node); @@ -153,7 +153,7 @@ static void map_cpu_to_node(int cpu, int node) } #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) -static void unmap_cpu_from_node(unsigned long cpu) +void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; @@ -800,9 +800,6 @@ static int ppc_numa_cpu_prepare(unsigned int cpu) static int ppc_numa_cpu_dead(unsigned int cpu) { -#ifdef CONFIG_HOTPLUG_CPU - unmap_cpu_from_node(cpu); -#endif return 0; } -- cgit v1.2.3-59-g8ed1b From 0c634bafe3bbee7a36dca7f1277057e05bf14d91 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:19 -0300 Subject: powerpc/pseries/iommu: Replace hard-coded page shift Some functions assume IOMMU page size can only be 4K (pageshift == 12). Update them to accept any page size passed, so we can use 64K pages. In the process, some defines like TCE_SHIFT were made obsolete, and then removed. IODA3 Revision 3.0_prd1 (OpenPowerFoundation), Figures 3.4 and 3.5 show a RPN of 52-bit, and considers a 12-bit pageshift, so there should be no need of using TCE_RPN_MASK, which masks out any bit after 40 in rpn. It's usage removed from tce_build_pSeries(), tce_build_pSeriesLP(), and tce_buildmulti_pSeriesLP(). Most places had a tbl struct, so using tbl->it_page_shift was simple. tce_free_pSeriesLP() was a special case, since callers not always have a tbl struct, so adding a tceshift parameter seems the right thing to do. Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-2-leobras.c@gmail.com --- arch/powerpc/include/asm/tce.h | 8 ------- arch/powerpc/platforms/pseries/iommu.c | 39 ++++++++++++++++++++-------------- 2 files changed, 23 insertions(+), 24 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/tce.h b/arch/powerpc/include/asm/tce.h index db5fc2f2262d..0c34d2756d92 100644 --- a/arch/powerpc/include/asm/tce.h +++ b/arch/powerpc/include/asm/tce.h @@ -19,15 +19,7 @@ #define TCE_VB 0 #define TCE_PCI 1 -/* TCE page size is 4096 bytes (1 << 12) */ - -#define TCE_SHIFT 12 -#define TCE_PAGE_SIZE (1 << TCE_SHIFT) - #define TCE_ENTRY_SIZE 8 /* each TCE is 64 bits */ - -#define TCE_RPN_MASK 0xfffffffffful /* 40-bit RPN (4K pages) */ -#define TCE_RPN_SHIFT 12 #define TCE_VALID 0x800 /* TCE valid */ #define TCE_ALLIO 0x400 /* TCE valid for all lpars */ #define TCE_PCI_WRITE 0x2 /* write from PCI allowed */ diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 0c55b991f665..b1b8d12bab39 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -107,6 +107,8 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, u64 proto_tce; __be64 *tcep; u64 rpn; + const unsigned long tceshift = tbl->it_page_shift; + const unsigned long pagesize = IOMMU_PAGE_SIZE(tbl); proto_tce = TCE_PCI_READ; // Read allowed @@ -117,10 +119,10 @@ static int tce_build_pSeries(struct iommu_table *tbl, long index, while (npages--) { /* can't move this out since we might cross MEMBLOCK boundary */ - rpn = __pa(uaddr) >> TCE_SHIFT; - *tcep = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); + rpn = __pa(uaddr) >> tceshift; + *tcep = cpu_to_be64(proto_tce | rpn << tceshift); - uaddr += TCE_PAGE_SIZE; + uaddr += pagesize; tcep++; } return 0; @@ -146,7 +148,7 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) return be64_to_cpu(*tcep); } -static void tce_free_pSeriesLP(unsigned long liobn, long, long); +static void tce_free_pSeriesLP(unsigned long liobn, long, long, long); static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, @@ -166,12 +168,12 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, proto_tce |= TCE_PCI_WRITE; while (npages--) { - tce = proto_tce | (rpn & TCE_RPN_MASK) << tceshift; + tce = proto_tce | rpn << tceshift; rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, tce); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; - tce_free_pSeriesLP(liobn, tcenum_start, + tce_free_pSeriesLP(liobn, tcenum_start, tceshift, (npages_start - (npages + 1))); break; } @@ -205,10 +207,11 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long tcenum_start = tcenum, npages_start = npages; int ret = 0; unsigned long flags; + const unsigned long tceshift = tbl->it_page_shift; if ((npages == 1) || !firmware_has_feature(FW_FEATURE_PUT_TCE_IND)) { return tce_build_pSeriesLP(tbl->it_index, tcenum, - tbl->it_page_shift, npages, uaddr, + tceshift, npages, uaddr, direction, attrs); } @@ -225,13 +228,13 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, if (!tcep) { local_irq_restore(flags); return tce_build_pSeriesLP(tbl->it_index, tcenum, - tbl->it_page_shift, + tceshift, npages, uaddr, direction, attrs); } __this_cpu_write(tce_page, tcep); } - rpn = __pa(uaddr) >> TCE_SHIFT; + rpn = __pa(uaddr) >> tceshift; proto_tce = TCE_PCI_READ; if (direction != DMA_TO_DEVICE) proto_tce |= TCE_PCI_WRITE; @@ -245,12 +248,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE); for (l = 0; l < limit; l++) { - tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT); + tcep[l] = cpu_to_be64(proto_tce | rpn << tceshift); rpn++; } rc = plpar_tce_put_indirect((u64)tbl->it_index, - (u64)tcenum << 12, + (u64)tcenum << tceshift, (u64)__pa(tcep), limit); @@ -277,12 +280,13 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, return ret; } -static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long npages) +static void tce_free_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, + long npages) { u64 rc; while (npages--) { - rc = plpar_tce_put((u64)liobn, (u64)tcenum << 12, 0); + rc = plpar_tce_put((u64)liobn, (u64)tcenum << tceshift, 0); if (rc && printk_ratelimit()) { printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); @@ -301,9 +305,11 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n u64 rc; if (!firmware_has_feature(FW_FEATURE_STUFF_TCE)) - return tce_free_pSeriesLP(tbl->it_index, tcenum, npages); + return tce_free_pSeriesLP(tbl->it_index, tcenum, + tbl->it_page_shift, npages); - rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages); + rc = plpar_tce_stuff((u64)tbl->it_index, + (u64)tcenum << tbl->it_page_shift, 0, npages); if (rc && printk_ratelimit()) { printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n"); @@ -319,7 +325,8 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum) u64 rc; unsigned long tce_ret; - rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret); + rc = plpar_tce_get((u64)tbl->it_index, + (u64)tcenum << tbl->it_page_shift, &tce_ret); if (rc && printk_ratelimit()) { printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc); -- cgit v1.2.3-59-g8ed1b From 3c33066a21903076722a2881556a92aa3cd7d359 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:20 -0300 Subject: powerpc/kernel/iommu: Add new iommu_table_in_use() helper Having a function to check if the iommu table has any allocation helps deciding if a tbl can be reset for using a new DMA window. It should be enough to replace all instances of !bitmap_empty(tbl...). iommu_table_in_use() skips reserved memory, so we don't need to worry about releasing it before testing. This causes iommu_table_release_pages() to become unnecessary, given it is only used to remove reserved memory for testing. Also, only allow storing reserved memory values in tbl if they are valid in the table, so there is no need to check it in the new helper. Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-3-leobras.c@gmail.com --- arch/powerpc/include/asm/iommu.h | 1 + arch/powerpc/kernel/iommu.c | 61 ++++++++++++++++++++-------------------- 2 files changed, 32 insertions(+), 30 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index deef7c94d7b6..bf3b84128525 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -154,6 +154,7 @@ extern int iommu_tce_table_put(struct iommu_table *tbl); */ extern struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, unsigned long res_start, unsigned long res_end); +bool iommu_table_in_use(struct iommu_table *tbl); #define IOMMU_TABLE_GROUP_MAX_TABLES 2 diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 2af89a5e379f..ed98ad63633e 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -690,32 +690,24 @@ static void iommu_table_reserve_pages(struct iommu_table *tbl, if (tbl->it_offset == 0) set_bit(0, tbl->it_map); - tbl->it_reserved_start = res_start; - tbl->it_reserved_end = res_end; - - /* Check if res_start..res_end isn't empty and overlaps the table */ - if (res_start && res_end && - (tbl->it_offset + tbl->it_size < res_start || - res_end < tbl->it_offset)) - return; + if (res_start < tbl->it_offset) + res_start = tbl->it_offset; - for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) - set_bit(i - tbl->it_offset, tbl->it_map); -} + if (res_end > (tbl->it_offset + tbl->it_size)) + res_end = tbl->it_offset + tbl->it_size; -static void iommu_table_release_pages(struct iommu_table *tbl) -{ - int i; + /* Check if res_start..res_end is a valid range in the table */ + if (res_start >= res_end) { + tbl->it_reserved_start = tbl->it_offset; + tbl->it_reserved_end = tbl->it_offset; + return; + } - /* - * In case we have reserved the first bit, we should not emit - * the warning below. - */ - if (tbl->it_offset == 0) - clear_bit(0, tbl->it_map); + tbl->it_reserved_start = res_start; + tbl->it_reserved_end = res_end; for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i) - clear_bit(i - tbl->it_offset, tbl->it_map); + set_bit(i - tbl->it_offset, tbl->it_map); } /* @@ -779,6 +771,22 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid, return tbl; } +bool iommu_table_in_use(struct iommu_table *tbl) +{ + unsigned long start = 0, end; + + /* ignore reserved bit0 */ + if (tbl->it_offset == 0) + start = 1; + end = tbl->it_reserved_start - tbl->it_offset; + if (find_next_bit(tbl->it_map, end, start) != end) + return true; + + start = tbl->it_reserved_end - tbl->it_offset; + end = tbl->it_size; + return find_next_bit(tbl->it_map, end, start) != end; +} + static void iommu_table_free(struct kref *kref) { struct iommu_table *tbl; @@ -795,10 +803,8 @@ static void iommu_table_free(struct kref *kref) iommu_debugfs_del(tbl); - iommu_table_release_pages(tbl); - /* verify that table contains no entries */ - if (!bitmap_empty(tbl->it_map, tbl->it_size)) + if (iommu_table_in_use(tbl)) pr_warn("%s: Unexpected TCEs\n", __func__); /* free bitmap */ @@ -1099,14 +1105,9 @@ int iommu_take_ownership(struct iommu_table *tbl) for (i = 0; i < tbl->nr_pools; i++) spin_lock_nest_lock(&tbl->pools[i].lock, &tbl->large_pool.lock); - iommu_table_release_pages(tbl); - - if (!bitmap_empty(tbl->it_map, tbl->it_size)) { + if (iommu_table_in_use(tbl)) { pr_err("iommu_tce: it_map is not empty"); ret = -EBUSY; - /* Undo iommu_table_release_pages, i.e. restore bit#0, etc */ - iommu_table_reserve_pages(tbl, tbl->it_reserved_start, - tbl->it_reserved_end); } else { memset(tbl->it_map, 0xff, sz); } -- cgit v1.2.3-59-g8ed1b From 4ff8677a0b192a58d998d1d34fc5168203041a24 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:21 -0300 Subject: powerpc/pseries/iommu: Add iommu_pseries_alloc_table() helper Creates a helper to allow allocating a new iommu_table without the need to reallocate the iommu_group. This will be helpful for replacing the iommu_table for the new DMA window, after we remove the old one with iommu_tce_table_put(). Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-4-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index b1b8d12bab39..33d82865d6e6 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -53,28 +53,31 @@ enum { DDW_EXT_QUERY_OUT_SIZE = 2 }; -static struct iommu_table_group *iommu_pseries_alloc_group(int node) +static struct iommu_table *iommu_pseries_alloc_table(int node) { - struct iommu_table_group *table_group; struct iommu_table *tbl; - table_group = kzalloc_node(sizeof(struct iommu_table_group), GFP_KERNEL, - node); - if (!table_group) - return NULL; - tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, node); if (!tbl) - goto free_group; + return NULL; INIT_LIST_HEAD_RCU(&tbl->it_group_list); kref_init(&tbl->it_kref); + return tbl; +} - table_group->tables[0] = tbl; +static struct iommu_table_group *iommu_pseries_alloc_group(int node) +{ + struct iommu_table_group *table_group; + + table_group = kzalloc_node(sizeof(*table_group), GFP_KERNEL, node); + if (!table_group) + return NULL; - return table_group; + table_group->tables[0] = iommu_pseries_alloc_table(node); + if (table_group->tables[0]) + return table_group; -free_group: kfree(table_group); return NULL; } -- cgit v1.2.3-59-g8ed1b From 92a23219299cedde52e3298788484f4875d5ce0f Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:22 -0300 Subject: powerpc/pseries/iommu: Add ddw_list_new_entry() helper There are two functions creating direct_window_list entries in a similar way, so create a ddw_list_new_entry() to avoid duplicity and simplify those functions. Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-5-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 33d82865d6e6..712d1667144a 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -874,6 +874,21 @@ static u64 find_existing_ddw(struct device_node *pdn, int *window_shift) return dma_addr; } +static struct direct_window *ddw_list_new_entry(struct device_node *pdn, + const struct dynamic_dma_window_prop *dma64) +{ + struct direct_window *window; + + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) + return NULL; + + window->device = pdn; + window->prop = dma64; + + return window; +} + static int find_existing_ddw_windows(void) { int len; @@ -886,18 +901,15 @@ static int find_existing_ddw_windows(void) for_each_node_with_property(pdn, DIRECT64_PROPNAME) { direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len); - if (!direct64) - continue; - - window = kzalloc(sizeof(*window), GFP_KERNEL); - if (!window || len < sizeof(struct dynamic_dma_window_prop)) { - kfree(window); + if (!direct64 || len < sizeof(*direct64)) { remove_ddw(pdn, true); continue; } - window->device = pdn; - window->prop = direct64; + window = ddw_list_new_entry(pdn, direct64); + if (!window) + break; + spin_lock(&direct_window_list_lock); list_add(&window->list, &direct_window_list); spin_unlock(&direct_window_list_lock); @@ -1307,7 +1319,7 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n", create.liobn, dn); - window = kzalloc(sizeof(*window), GFP_KERNEL); + window = ddw_list_new_entry(pdn, ddwprop); if (!window) goto out_clear_window; @@ -1326,8 +1338,6 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) goto out_free_window; } - window->device = pdn; - window->prop = ddwprop; spin_lock(&direct_window_list_lock); list_add(&window->list, &direct_window_list); spin_unlock(&direct_window_list_lock); -- cgit v1.2.3-59-g8ed1b From 2ca73c54ce24489518a56d816331b774044c2445 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:23 -0300 Subject: powerpc/pseries/iommu: Allow DDW windows starting at 0x00 enable_ddw() currently returns the address of the DMA window, which is considered invalid if has the value 0x00. Also, it only considers valid an address returned from find_existing_ddw if it's not 0x00. Changing this behavior makes sense, given the users of enable_ddw() only need to know if direct mapping is possible. It can also allow a DMA window starting at 0x00 to be used. This will be helpful for using a DDW with indirect mapping, as the window address will be different than 0x00, but it will not map the whole partition. Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-6-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 36 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 712d1667144a..b34b473bbdc1 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -853,25 +853,26 @@ static void remove_ddw(struct device_node *np, bool remove_prop) np, ret); } -static u64 find_existing_ddw(struct device_node *pdn, int *window_shift) +static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift) { struct direct_window *window; const struct dynamic_dma_window_prop *direct64; - u64 dma_addr = 0; + bool found = false; spin_lock(&direct_window_list_lock); /* check if we already created a window and dupe that config if so */ list_for_each_entry(window, &direct_window_list, list) { if (window->device == pdn) { direct64 = window->prop; - dma_addr = be64_to_cpu(direct64->dma_base); + *dma_addr = be64_to_cpu(direct64->dma_base); *window_shift = be32_to_cpu(direct64->window_shift); + found = true; break; } } spin_unlock(&direct_window_list_lock); - return dma_addr; + return found; } static struct direct_window *ddw_list_new_entry(struct device_node *pdn, @@ -1161,20 +1162,20 @@ static int iommu_get_page_shift(u32 query_page_size) * pdn: the parent pe node with the ibm,dma_window property * Future: also check if we can remap the base window for our base page size * - * returns the dma offset for use by the direct mapped DMA code. + * returns true if can map all pages (direct mapping), false otherwise.. */ -static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) +static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) { int len = 0, ret; int max_ram_len = order_base_2(ddw_memory_hotplug_max()); struct ddw_query_response query; struct ddw_create_response create; int page_shift; - u64 dma_addr; struct device_node *dn; u32 ddw_avail[DDW_APPLICABLE_SIZE]; struct direct_window *window; struct property *win64; + bool ddw_enabled = false; struct dynamic_dma_window_prop *ddwprop; struct failed_ddw_pdn *fpdn; bool default_win_removed = false; @@ -1186,9 +1187,10 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) mutex_lock(&direct_window_init_mutex); - dma_addr = find_existing_ddw(pdn, &len); - if (dma_addr != 0) + if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) { + ddw_enabled = true; goto out_unlock; + } /* * If we already went through this for a previous function of @@ -1342,7 +1344,8 @@ static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) list_add(&window->list, &direct_window_list); spin_unlock(&direct_window_list_lock); - dma_addr = be64_to_cpu(ddwprop->dma_base); + dev->dev.archdata.dma_offset = be64_to_cpu(ddwprop->dma_base); + ddw_enabled = true; goto out_unlock; out_free_window: @@ -1374,10 +1377,10 @@ out_unlock: * as RAM, then we failed to create a window to cover persistent * memory and need to set the DMA limit. */ - if (pmem_present && dma_addr && (len == max_ram_len)) - dev->dev.bus_dma_limit = dma_addr + (1ULL << len); + if (pmem_present && ddw_enabled && (len == max_ram_len)) + dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len); - return dma_addr; + return ddw_enabled; } static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) @@ -1456,11 +1459,8 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) break; } - if (pdn && PCI_DN(pdn)) { - pdev->dev.archdata.dma_offset = enable_ddw(pdev, pdn); - if (pdev->dev.archdata.dma_offset) - return true; - } + if (pdn && PCI_DN(pdn)) + return enable_ddw(pdev, pdn); return false; } -- cgit v1.2.3-59-g8ed1b From 7ed2ed2db2685a285cb09ab330dc4efea0b64022 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:24 -0300 Subject: powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw() Code used to create a ddw property that was previously scattered in enable_ddw() is now gathered in ddw_property_create(), which deals with allocation and filling the property, letting it ready for of_property_add(), which now occurs in sequence. This created an opportunity to reorganize the second part of enable_ddw(): Without this patch enable_ddw() does, in order: kzalloc() property & members, create_ddw(), fill ddwprop inside property, ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory, of_add_property(), and list_add(). With this patch enable_ddw() does, in order: create_ddw(), ddw_property_create(), of_add_property(), ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory, and list_add(). This change requires of_remove_property() in case anything fails after of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk in all memory, which looks the most expensive operation, only if everything else succeeds. Also, the error path got remove_ddw() replaced by a new helper __remove_dma_window(), which only removes the new DDW with an rtas-call. For this, a new helper clean_dma_window() was needed to clean anything that could left if walk_system_ram_range() fails. Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-7-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 129 +++++++++++++++++++++------------ 1 file changed, 84 insertions(+), 45 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index b34b473bbdc1..00392582fe10 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -795,17 +795,10 @@ static int __init disable_ddw_setup(char *str) early_param("disable_ddw", disable_ddw_setup); -static void remove_dma_window(struct device_node *np, u32 *ddw_avail, - struct property *win) +static void clean_dma_window(struct device_node *np, struct dynamic_dma_window_prop *dwp) { - struct dynamic_dma_window_prop *dwp; - u64 liobn; int ret; - dwp = win->value; - liobn = (u64)be32_to_cpu(dwp->liobn); - - /* clear the whole window, note the arg is in kernel pages */ ret = tce_clearrange_multi_pSeriesLP(0, 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp); if (ret) @@ -814,18 +807,39 @@ static void remove_dma_window(struct device_node *np, u32 *ddw_avail, else pr_debug("%pOF successfully cleared tces in window.\n", np); +} + +/* + * Call only if DMA window is clean. + */ +static void __remove_dma_window(struct device_node *np, u32 *ddw_avail, u64 liobn) +{ + int ret; ret = rtas_call(ddw_avail[DDW_REMOVE_PE_DMA_WIN], 1, 1, NULL, liobn); if (ret) - pr_warn("%pOF: failed to remove direct window: rtas returned " + pr_warn("%pOF: failed to remove DMA window: rtas returned " "%d to ibm,remove-pe-dma-window(%x) %llx\n", np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn); else - pr_debug("%pOF: successfully removed direct window: rtas returned " + pr_debug("%pOF: successfully removed DMA window: rtas returned " "%d to ibm,remove-pe-dma-window(%x) %llx\n", np, ret, ddw_avail[DDW_REMOVE_PE_DMA_WIN], liobn); } +static void remove_dma_window(struct device_node *np, u32 *ddw_avail, + struct property *win) +{ + struct dynamic_dma_window_prop *dwp; + u64 liobn; + + dwp = win->value; + liobn = (u64)be32_to_cpu(dwp->liobn); + + clean_dma_window(np, dwp); + __remove_dma_window(np, ddw_avail, liobn); +} + static void remove_ddw(struct device_node *np, bool remove_prop) { struct property *win; @@ -1153,6 +1167,35 @@ static int iommu_get_page_shift(u32 query_page_size) return 0; } +static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr, + u32 page_shift, u32 window_shift) +{ + struct dynamic_dma_window_prop *ddwprop; + struct property *win64; + + win64 = kzalloc(sizeof(*win64), GFP_KERNEL); + if (!win64) + return NULL; + + win64->name = kstrdup(propname, GFP_KERNEL); + ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL); + win64->value = ddwprop; + win64->length = sizeof(*ddwprop); + if (!win64->name || !win64->value) { + kfree(win64->name); + kfree(win64->value); + kfree(win64); + return NULL; + } + + ddwprop->liobn = cpu_to_be32(liobn); + ddwprop->dma_base = cpu_to_be64(dma_addr); + ddwprop->tce_shift = cpu_to_be32(page_shift); + ddwprop->window_shift = cpu_to_be32(window_shift); + + return win64; +} + /* * If the PE supports dynamic dma windows, and there is space for a table * that can map all pages in a linear offset, then setup such a table, @@ -1171,12 +1214,12 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) struct ddw_query_response query; struct ddw_create_response create; int page_shift; + u64 win_addr; struct device_node *dn; u32 ddw_avail[DDW_APPLICABLE_SIZE]; struct direct_window *window; struct property *win64; bool ddw_enabled = false; - struct dynamic_dma_window_prop *ddwprop; struct failed_ddw_pdn *fpdn; bool default_win_removed = false; bool pmem_present; @@ -1293,72 +1336,68 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) 1ULL << page_shift); goto out_failed; } - win64 = kzalloc(sizeof(struct property), GFP_KERNEL); - if (!win64) { - dev_info(&dev->dev, - "couldn't allocate property for 64bit dma window\n"); - goto out_failed; - } - win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL); - win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL); - win64->length = sizeof(*ddwprop); - if (!win64->name || !win64->value) { - dev_info(&dev->dev, - "couldn't allocate property name and value\n"); - goto out_free_prop; - } ret = create_ddw(dev, ddw_avail, &create, page_shift, len); if (ret != 0) - goto out_free_prop; - - ddwprop->liobn = cpu_to_be32(create.liobn); - ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) | - create.addr_lo); - ddwprop->tce_shift = cpu_to_be32(page_shift); - ddwprop->window_shift = cpu_to_be32(len); + goto out_failed; dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n", create.liobn, dn); - window = ddw_list_new_entry(pdn, ddwprop); + win_addr = ((u64)create.addr_hi << 32) | create.addr_lo; + win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, win_addr, + page_shift, len); + if (!win64) { + dev_info(&dev->dev, + "couldn't allocate property, property name, or value\n"); + goto out_remove_win; + } + + ret = of_add_property(pdn, win64); + if (ret) { + dev_err(&dev->dev, "unable to add dma window property for %pOF: %d", + pdn, ret); + goto out_free_prop; + } + + window = ddw_list_new_entry(pdn, win64->value); if (!window) - goto out_clear_window; + goto out_del_prop; ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, win64->value, tce_setrange_multi_pSeriesLP_walk); if (ret) { dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n", dn, ret); - goto out_free_window; - } - ret = of_add_property(pdn, win64); - if (ret) { - dev_err(&dev->dev, "unable to add dma window property for %pOF: %d", - pdn, ret); - goto out_free_window; + /* Make sure to clean DDW if any TCE was set*/ + clean_dma_window(pdn, win64->value); + goto out_del_list; } spin_lock(&direct_window_list_lock); list_add(&window->list, &direct_window_list); spin_unlock(&direct_window_list_lock); - dev->dev.archdata.dma_offset = be64_to_cpu(ddwprop->dma_base); + dev->dev.archdata.dma_offset = win_addr; ddw_enabled = true; goto out_unlock; -out_free_window: +out_del_list: kfree(window); -out_clear_window: - remove_ddw(pdn, true); +out_del_prop: + of_remove_property(pdn, win64); out_free_prop: kfree(win64->name); kfree(win64->value); kfree(win64); +out_remove_win: + /* DDW is clean, so it's ok to call this directly. */ + __remove_dma_window(pdn, ddw_avail, create.liobn); + out_failed: if (default_win_removed) reset_dma_window(dev, pdn); -- cgit v1.2.3-59-g8ed1b From fc8cba8f989fb98e496b33a78476861e246c42a0 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:25 -0300 Subject: powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new helper Add a new helper _iommu_table_setparms(), and use it in iommu_table_setparms() and iommu_table_setparms_lpar() to avoid duplicated code. Also, setting tbl->it_ops was happening outsite iommu_table_setparms*(), so move it to the new helper. Since we need the iommu_table_ops to be declared before used, declare iommu_table_lpar_multi_ops and iommu_table_pseries_ops to before their respective iommu_table_setparms*(). Signed-off-by: Leonardo Bras Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-8-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 72 ++++++++++++++++++---------------- 1 file changed, 38 insertions(+), 34 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 00392582fe10..a47f59a8f107 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -501,6 +501,24 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn, return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg); } +static void iommu_table_setparms_common(struct iommu_table *tbl, unsigned long busno, + unsigned long liobn, unsigned long win_addr, + unsigned long window_size, unsigned long page_shift, + void *base, struct iommu_table_ops *table_ops) +{ + tbl->it_busno = busno; + tbl->it_index = liobn; + tbl->it_offset = win_addr >> page_shift; + tbl->it_size = window_size >> page_shift; + tbl->it_page_shift = page_shift; + tbl->it_base = (unsigned long)base; + tbl->it_blocksize = 16; + tbl->it_type = TCE_PCI; + tbl->it_ops = table_ops; +} + +struct iommu_table_ops iommu_table_pseries_ops; + static void iommu_table_setparms(struct pci_controller *phb, struct device_node *dn, struct iommu_table *tbl) @@ -509,8 +527,13 @@ static void iommu_table_setparms(struct pci_controller *phb, const unsigned long *basep; const u32 *sizep; - node = phb->dn; + /* Test if we are going over 2GB of DMA space */ + if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) { + udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); + panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); + } + node = phb->dn; basep = of_get_property(node, "linux,tce-base", NULL); sizep = of_get_property(node, "linux,tce-size", NULL); if (basep == NULL || sizep == NULL) { @@ -519,33 +542,18 @@ static void iommu_table_setparms(struct pci_controller *phb, return; } - tbl->it_base = (unsigned long)__va(*basep); + iommu_table_setparms_common(tbl, phb->bus->number, 0, phb->dma_window_base_cur, + phb->dma_window_size, IOMMU_PAGE_SHIFT_4K, + __va(*basep), &iommu_table_pseries_ops); if (!is_kdump_kernel()) memset((void *)tbl->it_base, 0, *sizep); - tbl->it_busno = phb->bus->number; - tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; - - /* Units of tce entries */ - tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift; - - /* Test if we are going over 2GB of DMA space */ - if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) { - udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); - panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); - } - phb->dma_window_base_cur += phb->dma_window_size; - - /* Set the tce table size - measured in entries */ - tbl->it_size = phb->dma_window_size >> tbl->it_page_shift; - - tbl->it_index = 0; - tbl->it_blocksize = 16; - tbl->it_type = TCE_PCI; } +struct iommu_table_ops iommu_table_lpar_multi_ops; + /* * iommu_table_setparms_lpar * @@ -557,17 +565,13 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb, struct iommu_table_group *table_group, const __be32 *dma_window) { - unsigned long offset, size; + unsigned long offset, size, liobn; - of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size); + of_parse_dma_window(dn, dma_window, &liobn, &offset, &size); + + iommu_table_setparms_common(tbl, phb->bus->number, liobn, offset, size, IOMMU_PAGE_SHIFT_4K, NULL, + &iommu_table_lpar_multi_ops); - tbl->it_busno = phb->bus->number; - tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K; - tbl->it_base = 0; - tbl->it_blocksize = 16; - tbl->it_type = TCE_PCI; - tbl->it_offset = offset >> tbl->it_page_shift; - tbl->it_size = size >> tbl->it_page_shift; table_group->tce32_start = offset; table_group->tce32_size = size; @@ -647,7 +651,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) tbl = pci->table_group->tables[0]; iommu_table_setparms(pci->phb, dn, tbl); - tbl->it_ops = &iommu_table_pseries_ops; + if (!iommu_init_table(tbl, pci->phb->node, 0, 0)) panic("Failed to initialize iommu table"); @@ -730,7 +734,7 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) tbl = ppci->table_group->tables[0]; iommu_table_setparms_lpar(ppci->phb, pdn, tbl, ppci->table_group, dma_window); - tbl->it_ops = &iommu_table_lpar_multi_ops; + if (!iommu_init_table(tbl, ppci->phb->node, 0, 0)) panic("Failed to initialize iommu table"); iommu_register_group(ppci->table_group, @@ -760,7 +764,7 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) PCI_DN(dn)->table_group = iommu_pseries_alloc_group(phb->node); tbl = PCI_DN(dn)->table_group->tables[0]; iommu_table_setparms(phb, dn, tbl); - tbl->it_ops = &iommu_table_pseries_ops; + if (!iommu_init_table(tbl, phb->node, 0, 0)) panic("Failed to initialize iommu table"); @@ -1461,7 +1465,7 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) tbl = pci->table_group->tables[0]; iommu_table_setparms_lpar(pci->phb, pdn, tbl, pci->table_group, dma_window); - tbl->it_ops = &iommu_table_lpar_multi_ops; + iommu_init_table(tbl, pci->phb->node, 0, 0); iommu_register_group(pci->table_group, pci_domain_nr(pci->phb->bus), 0); -- cgit v1.2.3-59-g8ed1b From a5fd95120c653962a9e75e260a35436b96d2c991 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:26 -0300 Subject: powerpc/pseries/iommu: Update remove_dma_window() to accept property name Update remove_dma_window() so it can be used to remove DDW with a given property name. This enables the creation of new property names for DDW, so we can have different usage for it, like indirect mapping. Also, add return values to it so we can check if the property was found while removing the active DDW. This allows skipping the remaining property names while reducing the impact of multiple property names. Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-9-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index a47f59a8f107..901f290999d0 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -844,31 +844,33 @@ static void remove_dma_window(struct device_node *np, u32 *ddw_avail, __remove_dma_window(np, ddw_avail, liobn); } -static void remove_ddw(struct device_node *np, bool remove_prop) +static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_name) { struct property *win; u32 ddw_avail[DDW_APPLICABLE_SIZE]; int ret = 0; + win = of_find_property(np, win_name, NULL); + if (!win) + return -EINVAL; + ret = of_property_read_u32_array(np, "ibm,ddw-applicable", &ddw_avail[0], DDW_APPLICABLE_SIZE); if (ret) - return; + return 0; - win = of_find_property(np, DIRECT64_PROPNAME, NULL); - if (!win) - return; if (win->length >= sizeof(struct dynamic_dma_window_prop)) remove_dma_window(np, ddw_avail, win); if (!remove_prop) - return; + return 0; ret = of_remove_property(np, win); if (ret) pr_warn("%pOF: failed to remove direct window property: %d\n", np, ret); + return 0; } static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift) @@ -921,7 +923,7 @@ static int find_existing_ddw_windows(void) for_each_node_with_property(pdn, DIRECT64_PROPNAME) { direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len); if (!direct64 || len < sizeof(*direct64)) { - remove_ddw(pdn, true); + remove_ddw(pdn, true, DIRECT64_PROPNAME); continue; } @@ -1565,7 +1567,7 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti * we have to remove the property when releasing * the device node. */ - remove_ddw(np, false); + remove_ddw(np, false, DIRECT64_PROPNAME); if (pci && pci->table_group) iommu_pseries_free_group(pci->table_group, np->full_name); -- cgit v1.2.3-59-g8ed1b From 8599395d34f2dd7b77bef42da1d99798e7a3d58f Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:27 -0300 Subject: powerpc/pseries/iommu: Find existing DDW with given property name At the moment pseries stores information about created directly mapped DDW window in DIRECT64_PROPNAME. With the objective of implementing indirect DMA mapping with DDW, it's necessary to have another propriety name to make sure kexec'ing into older kernels does not break, as it would if we reuse DIRECT64_PROPNAME. In order to have this, find_existing_ddw_windows() needs to be able to look for different property names. Extract find_existing_ddw_windows() into find_existing_ddw_windows_named() and calls it with current property name. Signed-off-by: Leonardo Bras Reviewed-by: Alexey Kardashevskiy Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-10-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 901f290999d0..e11c00b2dc1e 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -910,24 +910,21 @@ static struct direct_window *ddw_list_new_entry(struct device_node *pdn, return window; } -static int find_existing_ddw_windows(void) +static void find_existing_ddw_windows_named(const char *name) { int len; struct device_node *pdn; struct direct_window *window; - const struct dynamic_dma_window_prop *direct64; - - if (!firmware_has_feature(FW_FEATURE_LPAR)) - return 0; + const struct dynamic_dma_window_prop *dma64; - for_each_node_with_property(pdn, DIRECT64_PROPNAME) { - direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len); - if (!direct64 || len < sizeof(*direct64)) { - remove_ddw(pdn, true, DIRECT64_PROPNAME); + for_each_node_with_property(pdn, name) { + dma64 = of_get_property(pdn, name, &len); + if (!dma64 || len < sizeof(*dma64)) { + remove_ddw(pdn, true, name); continue; } - window = ddw_list_new_entry(pdn, direct64); + window = ddw_list_new_entry(pdn, dma64); if (!window) break; @@ -935,6 +932,14 @@ static int find_existing_ddw_windows(void) list_add(&window->list, &direct_window_list); spin_unlock(&direct_window_list_lock); } +} + +static int find_existing_ddw_windows(void) +{ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + return 0; + + find_existing_ddw_windows_named(DIRECT64_PROPNAME); return 0; } -- cgit v1.2.3-59-g8ed1b From 381ceda88c4c4c8345cad1cffa6328892f15dca6 Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:28 -0300 Subject: powerpc/pseries/iommu: Make use of DDW for indirect mapping So far it's assumed possible to map the guest RAM 1:1 to the bus, which works with a small number of devices. SRIOV changes it as the user can configure hundreds VFs and since phyp preallocates TCEs and does not allow IOMMU pages bigger than 64K, it has to limit the number of TCEs per a PE to limit waste of physical pages. As of today, if the assumed direct mapping is not possible, DDW creation is skipped and the default DMA window "ibm,dma-window" is used instead. By using DDW, indirect mapping can get more TCEs than available for the default DMA window, and also get access to using much larger pagesizes (16MB as implemented in qemu vs 4k from default DMA window), causing a significant increase on the maximum amount of memory that can be IOMMU mapped at the same time. Indirect mapping will only be used if direct mapping is not a possibility. For indirect mapping, it's necessary to re-create the iommu_table with the new DMA window parameters, so iommu_alloc() can use it. Removing the default DMA window for using DDW with indirect mapping is only allowed if there is no current IOMMU memory allocated in the iommu_table. enable_ddw() is aborted otherwise. Even though there won't be both direct and indirect mappings at the same time, we can't reuse the DIRECT64_PROPNAME property name, or else an older kexec()ed kernel can assume direct mapping, and skip iommu_alloc(), causing undesirable behavior. So a new property name DMA64_PROPNAME "linux,dma64-ddr-window-info" was created to represent a DDW that does not allow direct mapping. Signed-off-by: Leonardo Bras Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-11-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 89 ++++++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 15 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index e11c00b2dc1e..0eccc29f5573 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -375,6 +375,7 @@ static DEFINE_SPINLOCK(direct_window_list_lock); /* protects initializing window twice for same device */ static DEFINE_MUTEX(direct_window_init_mutex); #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info" +#define DMA64_PROPNAME "linux,dma64-ddr-window-info" static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn, unsigned long num_pfn, const void *arg) @@ -940,6 +941,7 @@ static int find_existing_ddw_windows(void) return 0; find_existing_ddw_windows_named(DIRECT64_PROPNAME); + find_existing_ddw_windows_named(DMA64_PROPNAME); return 0; } @@ -1226,14 +1228,17 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) struct ddw_create_response create; int page_shift; u64 win_addr; + const char *win_name; struct device_node *dn; u32 ddw_avail[DDW_APPLICABLE_SIZE]; struct direct_window *window; struct property *win64; bool ddw_enabled = false; struct failed_ddw_pdn *fpdn; - bool default_win_removed = false; + bool default_win_removed = false, direct_mapping = false; bool pmem_present; + struct pci_dn *pci = PCI_DN(pdn); + struct iommu_table *tbl = pci->table_group->tables[0]; dn = of_find_node_by_type(NULL, "ibm,pmemory"); pmem_present = dn != NULL; @@ -1242,6 +1247,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) mutex_lock(&direct_window_init_mutex); if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) { + direct_mapping = (len >= max_ram_len); ddw_enabled = true; goto out_unlock; } @@ -1322,8 +1328,8 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) query.page_size); goto out_failed; } - /* verify the window * number of ptes will map the partition */ - /* check largest block * page size > max memory hotplug addr */ + + /* * The "ibm,pmemory" can appear anywhere in the address space. * Assuming it is still backed by page structs, try MAX_PHYSMEM_BITS @@ -1339,13 +1345,25 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) dev_info(&dev->dev, "Skipping ibm,pmemory"); } + /* check if the available block * number of ptes will map everything */ if (query.largest_available_block < (1ULL << (len - page_shift))) { dev_dbg(&dev->dev, "can't map partition max 0x%llx with %llu %llu-sized pages\n", 1ULL << len, query.largest_available_block, 1ULL << page_shift); - goto out_failed; + + /* DDW + IOMMU on single window may fail if there is any allocation */ + if (default_win_removed && iommu_table_in_use(tbl)) { + dev_dbg(&dev->dev, "current IOMMU table in use, can't be replaced.\n"); + goto out_failed; + } + + len = order_base_2(query.largest_available_block << page_shift); + win_name = DMA64_PROPNAME; + } else { + direct_mapping = true; + win_name = DIRECT64_PROPNAME; } ret = create_ddw(dev, ddw_avail, &create, page_shift, len); @@ -1356,8 +1374,8 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) create.liobn, dn); win_addr = ((u64)create.addr_hi << 32) | create.addr_lo; - win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, win_addr, - page_shift, len); + win64 = ddw_property_create(win_name, create.liobn, win_addr, page_shift, len); + if (!win64) { dev_info(&dev->dev, "couldn't allocate property, property name, or value\n"); @@ -1375,15 +1393,54 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) if (!window) goto out_del_prop; - ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, - win64->value, tce_setrange_multi_pSeriesLP_walk); - if (ret) { - dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n", - dn, ret); + if (direct_mapping) { + /* DDW maps the whole partition, so enable direct DMA mapping */ + ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, + win64->value, tce_setrange_multi_pSeriesLP_walk); + if (ret) { + dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n", + dn, ret); /* Make sure to clean DDW if any TCE was set*/ clean_dma_window(pdn, win64->value); - goto out_del_list; + goto out_del_list; + } + } else { + struct iommu_table *newtbl; + int i; + + for (i = 0; i < ARRAY_SIZE(pci->phb->mem_resources); i++) { + const unsigned long mask = IORESOURCE_MEM_64 | IORESOURCE_MEM; + + /* Look for MMIO32 */ + if ((pci->phb->mem_resources[i].flags & mask) == IORESOURCE_MEM) + break; + } + + if (i == ARRAY_SIZE(pci->phb->mem_resources)) + goto out_del_list; + + /* New table for using DDW instead of the default DMA window */ + newtbl = iommu_pseries_alloc_table(pci->phb->node); + if (!newtbl) { + dev_dbg(&dev->dev, "couldn't create new IOMMU table\n"); + goto out_del_list; + } + + iommu_table_setparms_common(newtbl, pci->phb->bus->number, create.liobn, win_addr, + 1UL << len, page_shift, NULL, &iommu_table_lpar_multi_ops); + iommu_init_table(newtbl, pci->phb->node, pci->phb->mem_resources[i].start, + pci->phb->mem_resources[i].end); + + pci->table_group->tables[1] = newtbl; + + /* Keep default DMA window stuct if removed */ + if (default_win_removed) { + tbl->it_size = 0; + kfree(tbl->it_map); + } + + set_iommu_table_base(&dev->dev, newtbl); } spin_lock(&direct_window_list_lock); @@ -1427,10 +1484,10 @@ out_unlock: * as RAM, then we failed to create a window to cover persistent * memory and need to set the DMA limit. */ - if (pmem_present && ddw_enabled && (len == max_ram_len)) + if (pmem_present && ddw_enabled && direct_mapping && len == max_ram_len) dev->dev.bus_dma_limit = dev->dev.archdata.dma_offset + (1ULL << len); - return ddw_enabled; + return ddw_enabled && direct_mapping; } static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) @@ -1572,7 +1629,9 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti * we have to remove the property when releasing * the device node. */ - remove_ddw(np, false, DIRECT64_PROPNAME); + if (remove_ddw(np, false, DIRECT64_PROPNAME)) + remove_ddw(np, false, DMA64_PROPNAME); + if (pci && pci->table_group) iommu_pseries_free_group(pci->table_group, np->full_name); -- cgit v1.2.3-59-g8ed1b From 57dbbe590f152e5e8a3ff8bf5ba163df34eeae0b Mon Sep 17 00:00:00 2001 From: Leonardo Bras Date: Tue, 17 Aug 2021 03:39:29 -0300 Subject: powerpc/pseries/iommu: Rename "direct window" to "dma window" A previous change introduced the usage of DDW as a bigger indirect DMA mapping when the DDW available size does not map the whole partition. As most of the code that manipulates direct mappings was reused for indirect mappings, it's necessary to rename all names and debug/info messages to reflect that it can be used for both kinds of mapping. This should cause no behavioural change, just adjust naming. Signed-off-by: Leonardo Bras Reviewed-by: Frederic Barrat Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210817063929.38701-12-leobras.c@gmail.com --- arch/powerpc/platforms/pseries/iommu.c | 87 ++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 42 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 0eccc29f5573..dab5c56ffd0e 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -349,7 +349,7 @@ struct dynamic_dma_window_prop { __be32 window_shift; /* ilog2(tce_window_size) */ }; -struct direct_window { +struct dma_win { struct device_node *device; const struct dynamic_dma_window_prop *prop; struct list_head list; @@ -369,11 +369,11 @@ struct ddw_create_response { u32 addr_lo; }; -static LIST_HEAD(direct_window_list); +static LIST_HEAD(dma_win_list); /* prevents races between memory on/offline and window creation */ -static DEFINE_SPINLOCK(direct_window_list_lock); +static DEFINE_SPINLOCK(dma_win_list_lock); /* protects initializing window twice for same device */ -static DEFINE_MUTEX(direct_window_init_mutex); +static DEFINE_MUTEX(dma_win_init_mutex); #define DIRECT64_PROPNAME "linux,direct64-ddr-window-info" #define DMA64_PROPNAME "linux,dma64-ddr-window-info" @@ -713,7 +713,10 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %pOF\n", dn); - /* Find nearest ibm,dma-window, walking up the device tree */ + /* + * Find nearest ibm,dma-window (default DMA window), walking up the + * device tree + */ for (pdn = dn; pdn != NULL; pdn = pdn->parent) { dma_window = of_get_property(pdn, "ibm,dma-window", NULL); if (dma_window != NULL) @@ -869,37 +872,37 @@ static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_ ret = of_remove_property(np, win); if (ret) - pr_warn("%pOF: failed to remove direct window property: %d\n", + pr_warn("%pOF: failed to remove DMA window property: %d\n", np, ret); return 0; } static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift) { - struct direct_window *window; - const struct dynamic_dma_window_prop *direct64; + struct dma_win *window; + const struct dynamic_dma_window_prop *dma64; bool found = false; - spin_lock(&direct_window_list_lock); + spin_lock(&dma_win_list_lock); /* check if we already created a window and dupe that config if so */ - list_for_each_entry(window, &direct_window_list, list) { + list_for_each_entry(window, &dma_win_list, list) { if (window->device == pdn) { - direct64 = window->prop; - *dma_addr = be64_to_cpu(direct64->dma_base); - *window_shift = be32_to_cpu(direct64->window_shift); + dma64 = window->prop; + *dma_addr = be64_to_cpu(dma64->dma_base); + *window_shift = be32_to_cpu(dma64->window_shift); found = true; break; } } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); return found; } -static struct direct_window *ddw_list_new_entry(struct device_node *pdn, - const struct dynamic_dma_window_prop *dma64) +static struct dma_win *ddw_list_new_entry(struct device_node *pdn, + const struct dynamic_dma_window_prop *dma64) { - struct direct_window *window; + struct dma_win *window; window = kzalloc(sizeof(*window), GFP_KERNEL); if (!window) @@ -915,7 +918,7 @@ static void find_existing_ddw_windows_named(const char *name) { int len; struct device_node *pdn; - struct direct_window *window; + struct dma_win *window; const struct dynamic_dma_window_prop *dma64; for_each_node_with_property(pdn, name) { @@ -929,9 +932,9 @@ static void find_existing_ddw_windows_named(const char *name) if (!window) break; - spin_lock(&direct_window_list_lock); - list_add(&window->list, &direct_window_list); - spin_unlock(&direct_window_list_lock); + spin_lock(&dma_win_list_lock); + list_add(&window->list, &dma_win_list); + spin_unlock(&dma_win_list_lock); } } @@ -1231,7 +1234,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) const char *win_name; struct device_node *dn; u32 ddw_avail[DDW_APPLICABLE_SIZE]; - struct direct_window *window; + struct dma_win *window; struct property *win64; bool ddw_enabled = false; struct failed_ddw_pdn *fpdn; @@ -1244,7 +1247,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) pmem_present = dn != NULL; of_node_put(dn); - mutex_lock(&direct_window_init_mutex); + mutex_lock(&dma_win_init_mutex); if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) { direct_mapping = (len >= max_ram_len); @@ -1324,8 +1327,8 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) page_shift = iommu_get_page_shift(query.page_size); if (!page_shift) { - dev_dbg(&dev->dev, "no supported direct page size in mask %x", - query.page_size); + dev_dbg(&dev->dev, "no supported page size in mask %x", + query.page_size); goto out_failed; } @@ -1384,7 +1387,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) ret = of_add_property(pdn, win64); if (ret) { - dev_err(&dev->dev, "unable to add dma window property for %pOF: %d", + dev_err(&dev->dev, "unable to add DMA window property for %pOF: %d", pdn, ret); goto out_free_prop; } @@ -1398,7 +1401,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, win64->value, tce_setrange_multi_pSeriesLP_walk); if (ret) { - dev_info(&dev->dev, "failed to map direct window for %pOF: %d\n", + dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n", dn, ret); /* Make sure to clean DDW if any TCE was set*/ @@ -1443,9 +1446,9 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn) set_iommu_table_base(&dev->dev, newtbl); } - spin_lock(&direct_window_list_lock); - list_add(&window->list, &direct_window_list); - spin_unlock(&direct_window_list_lock); + spin_lock(&dma_win_list_lock); + list_add(&window->list, &dma_win_list); + spin_unlock(&dma_win_list_lock); dev->dev.archdata.dma_offset = win_addr; ddw_enabled = true; @@ -1477,7 +1480,7 @@ out_failed: list_add(&fpdn->list, &failed_ddw_pdn_list); out_unlock: - mutex_unlock(&direct_window_init_mutex); + mutex_unlock(&dma_win_init_mutex); /* * If we have persistent memory and the window size is only as big @@ -1575,29 +1578,29 @@ static bool iommu_bypass_supported_pSeriesLP(struct pci_dev *pdev, u64 dma_mask) static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, void *data) { - struct direct_window *window; + struct dma_win *window; struct memory_notify *arg = data; int ret = 0; switch (action) { case MEM_GOING_ONLINE: - spin_lock(&direct_window_list_lock); - list_for_each_entry(window, &direct_window_list, list) { + spin_lock(&dma_win_list_lock); + list_for_each_entry(window, &dma_win_list, list) { ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, arg->nr_pages, window->prop); /* XXX log error */ } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); break; case MEM_CANCEL_ONLINE: case MEM_OFFLINE: - spin_lock(&direct_window_list_lock); - list_for_each_entry(window, &direct_window_list, list) { + spin_lock(&dma_win_list_lock); + list_for_each_entry(window, &dma_win_list, list) { ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, arg->nr_pages, window->prop); /* XXX log error */ } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); break; default: break; @@ -1618,7 +1621,7 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti struct of_reconfig_data *rd = data; struct device_node *np = rd->dn; struct pci_dn *pci = PCI_DN(np); - struct direct_window *window; + struct dma_win *window; switch (action) { case OF_RECONFIG_DETACH_NODE: @@ -1636,15 +1639,15 @@ static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long acti iommu_pseries_free_group(pci->table_group, np->full_name); - spin_lock(&direct_window_list_lock); - list_for_each_entry(window, &direct_window_list, list) { + spin_lock(&dma_win_list_lock); + list_for_each_entry(window, &dma_win_list, list) { if (window->device == np) { list_del(&window->list); kfree(window); break; } } - spin_unlock(&direct_window_list_lock); + spin_unlock(&dma_win_list_lock); break; default: err = NOTIFY_DONE; -- cgit v1.2.3-59-g8ed1b From 1d78dfde33a02da1d816279c2e3452978b7abd39 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Fri, 27 Aug 2021 14:07:06 +1000 Subject: KVM: PPC: Fix clearing never mapped TCEs in realmode Since commit e1a1ef84cd07 ("KVM: PPC: Book3S: Allocate guest TCEs on demand too"), pages for TCE tables for KVM guests are allocated only when needed. This allows skipping any update when clearing TCEs. This works mostly fine as TCE updates are handled when the MMU is enabled. The realmode handlers fail with H_TOO_HARD when pages are not yet allocated, except when clearing a TCE in which case KVM prints a warning and proceeds to dereference a NULL pointer, which crashes the host OS. This has not been caught so far as the change in commit e1a1ef84cd07 is reasonably new, and POWER9 runs mostly radix which does not use realmode handlers. With hash, the default TCE table is memset() by QEMU when the machine is reset which triggers page faults and the KVM TCE device's kvm_spapr_tce_fault() handles those with MMU on. And the huge DMA windows are not cleared by VMs which instead successfully create a DMA window big enough to map the VM memory 1:1 and then VMs just map everything without clearing. This started crashing now as commit 381ceda88c4c ("powerpc/pseries/iommu: Make use of DDW for indirect mapping") added a mode when a dymanic DMA window not big enough to map the VM memory 1:1 but it is used anyway, and the VM now is the first (i.e. not QEMU) to clear a just created table. Note that upstream QEMU needs to be modified to trigger the VM to trigger the host OS crash. This replaces WARN_ON_ONCE_RM() with a check and return, and adds another warning if TCE is not being cleared. Fixes: e1a1ef84cd07 ("KVM: PPC: Book3S: Allocate guest TCEs on demand too") Signed-off-by: Alexey Kardashevskiy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210827040706.517652-1-aik@ozlabs.ru --- arch/powerpc/kvm/book3s_64_vio_hv.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index dc6591548f0c..636c6ae0939b 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -173,10 +173,13 @@ static void kvmppc_rm_tce_put(struct kvmppc_spapr_tce_table *stt, idx -= stt->offset; page = stt->pages[idx / TCES_PER_PAGE]; /* - * page must not be NULL in real mode, - * kvmppc_rm_ioba_validate() must have taken care of this. + * kvmppc_rm_ioba_validate() allows pages not be allocated if TCE is + * being cleared, otherwise it returns H_TOO_HARD and we skip this. */ - WARN_ON_ONCE_RM(!page); + if (!page) { + WARN_ON_ONCE_RM(tce != 0); + return; + } tbl = kvmppc_page_address(page); tbl[idx % TCES_PER_PAGE] = tce; -- cgit v1.2.3-59-g8ed1b From b14b8b1ed0e15b8f43fba9c25654278a31ee3c2f Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 31 Aug 2021 23:51:51 +1000 Subject: powerpc/ptdump: Fix generic ptdump for 64-bit Since the conversion to generic ptdump we see crashes on 64-bit: BUG: Unable to handle kernel data access on read at 0xc0eeff7f00000000 Faulting instruction address: 0xc00000000045e5fc Oops: Kernel access of bad area, sig: 11 [#1] ... NIP __walk_page_range+0x2bc/0xce0 LR __walk_page_range+0x240/0xce0 Call Trace: __walk_page_range+0x240/0xce0 (unreliable) walk_page_range_novma+0x74/0xb0 ptdump_walk_pgd+0x98/0x170 ptdump_check_wx+0x88/0xd0 mark_rodata_ro+0x48/0x80 kernel_init+0x74/0x1a0 ret_from_kernel_thread+0x5c/0x64 What's happening is that have walked off the end of the kernel page tables, and started dereferencing junk values. That happens because we initialised the ptdump_range to span all the way up to 0xffffffffffffffff: static struct ptdump_range ptdump_range[] __ro_after_init = { {TASK_SIZE_MAX, ~0UL}, But the kernel page tables don't span that far. So on 64-bit set the end of the range to be the address immediately past the end of the kernel page tables, to limit the page table walk to valid addresses. Fixes: e084728393a5 ("powerpc/ptdump: Convert powerpc to GENERIC_PTDUMP") Reported-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210831135151.886620-1-mpe@ellerman.id.au --- arch/powerpc/mm/ptdump/ptdump.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/powerpc') diff --git a/arch/powerpc/mm/ptdump/ptdump.c b/arch/powerpc/mm/ptdump/ptdump.c index 2d80d775d15e..bf251191e78d 100644 --- a/arch/powerpc/mm/ptdump/ptdump.c +++ b/arch/powerpc/mm/ptdump/ptdump.c @@ -359,6 +359,8 @@ static int __init ptdump_init(void) ptdump_range[0].start = KERN_VIRT_START; else ptdump_range[0].start = PAGE_OFFSET; + + ptdump_range[0].end = PAGE_OFFSET + (PGDIR_SIZE * PTRS_PER_PGD); #endif populate_markers(); -- cgit v1.2.3-59-g8ed1b From e432fe97f3e5de325b40021e505cce53877586c5 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 27 Aug 2021 22:51:06 +1000 Subject: powerpc/bug: Cast to unsigned long before passing to inline asm In commit 1e688dd2a3d6 ("powerpc/bug: Provide better flexibility to WARN_ON/__WARN_FLAGS() with asm goto") we changed WARN_ON(). Previously it would take the warning condition, x, and double negate it before converting the result to int, and passing that int to the underlying inline asm. ie: #define WARN_ON(x) ({ int __ret_warn_on = !!(x); if (__builtin_constant_p(__ret_warn_on)) { ... } else { BUG_ENTRY(PPC_TLNEI " %4, 0", BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), "r" (__ret_warn_on)); The asm then does a full register width comparison with zero and traps if it is non-zero (PPC_TLNEI). The new code instead passes the full expression, x, with some arbitrary type, to the inline asm: #define WARN_ON(x) ({ ... do { if (__builtin_constant_p((x))) { ... } else { ... WARN_ENTRY(PPC_TLNEI " %4, 0", BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), __label_warn_on, "r" (x)); As reported[1] by Nathan, when building with clang this can cause spurious warnings to fire repeatedly at boot: WARNING: CPU: 0 PID: 1 at lib/klist.c:62 .klist_add_tail+0x3c/0x110 Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 5.14.0-rc7-next-20210825 #1 NIP: c0000000007ff81c LR: c00000000090a038 CTR: 0000000000000000 REGS: c0000000073c32a0 TRAP: 0700 Tainted: G W (5.14.0-rc7-next-20210825) MSR: 8000000002029032 CR: 22000a40 XER: 00000000 CFAR: c00000000090a034 IRQMASK: 0 GPR00: c00000000090a038 c0000000073c3540 c000000001be3200 0000000000000001 GPR04: c0000000072d65c0 0000000000000000 c0000000091ba798 c0000000091bb0a0 GPR08: 0000000000000001 0000000000000000 c000000008581918 fffffffffffffc00 GPR12: 0000000044000240 c000000001dd0000 c000000000012300 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR24: 0000000000000000 c0000000017e3200 0000000000000000 c000000001a0e778 GPR28: c0000000072d65b0 c0000000072d65a8 c000000007de72c8 c0000000073c35d0 NIP .klist_add_tail+0x3c/0x110 LR .bus_add_driver+0x148/0x290 Call Trace: 0xc0000000073c35d0 (unreliable) .bus_add_driver+0x148/0x290 .driver_register+0xb8/0x190 .__hid_register_driver+0x70/0xd0 .redragon_driver_init+0x34/0x58 .do_one_initcall+0x130/0x3b0 .do_initcall_level+0xd8/0x188 .do_initcalls+0x7c/0xdc .kernel_init_freeable+0x178/0x21c .kernel_init+0x34/0x220 .ret_from_kernel_thread+0x58/0x60 Instruction dump: fba10078 7c7d1b78 38600001 fb810070 3b9d0008 fbc10080 7c9e2378 389d0018 fb9d0008 fb9d0010 90640000 fbdd0000 <0b1e0000> e87e0018 28230000 41820024 The instruction dump shows that we are trapping because r30 is not zero: tdnei r30,0 Where r30 = c000000007de72c8 The WARN_ON() comes from: static void knode_set_klist(struct klist_node *knode, struct klist *klist) { knode->n_klist = klist; /* no knode deserves to start its life dead */ WARN_ON(knode_dead(knode)); ^^^^^^^^^^^^^^^^^ Where: #define KNODE_DEAD 1LU static bool knode_dead(struct klist_node *knode) { return (unsigned long)knode->n_klist & KNODE_DEAD; } The full disassembly shows that clang has not generated any code to apply the "& KNODE_DEAD" to the n_klist pointer, which is surprising. Nathan filed an LLVM bug [2], in which Eli Friedman explained that clang believes it is only passing a single bit to the asm (ie. a bool) and so the mask of bit 0 with 1 can be omitted, and suggested that if we want the full 64-bit value passed to the inline asm we should cast to a 64-bit type (or 32-bit on 32-bits). In fact we already do that for BUG_ENTRY(), which was added to fix a possibly similar bug in 2005 in commit 32818c2eb6b8 ("[PATCH] ppc64: Fix issue with gcc 4.0 compiled kernels"). So cast the value we pass to the inline asm to long. For GCC this appears to have no effect on code generation, other than causing sign extension in some cases. [1]: http://lore.kernel.org/r/YSa1O4fcX1nNKqN/@Ryzen-9-3900X.localdomain [2]: https://bugs.llvm.org/show_bug.cgi?id=51634 Fixes: 1e688dd2a3d6 ("powerpc/bug: Provide better flexibility to WARN_ON/__WARN_FLAGS() with asm goto") Reported-by: Nathan Chancellor Reviewed-by: Nathan Chancellor Tested-by: Nathan Chancellor Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210901112522.1085134-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/bug.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/powerpc') diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 1ee0f22313ee..02c08d1492f8 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -119,7 +119,8 @@ __label_warn_on: \ \ WARN_ENTRY(PPC_TLNEI " %4, 0", \ BUGFLAG_WARNING | BUGFLAG_TAINT(TAINT_WARN), \ - __label_warn_on, "r" (x)); \ + __label_warn_on, \ + "r" ((__force long)(x))); \ break; \ __label_warn_on: \ __ret_warn_on = true; \ -- cgit v1.2.3-59-g8ed1b