diff options
Diffstat (limited to 'arch/powerpc/kernel/smp.c')
-rw-r--r-- | arch/powerpc/kernel/smp.c | 255 |
1 files changed, 140 insertions, 115 deletions
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index bcefab484ea6..a60e4139214b 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -35,6 +35,7 @@ #include <linux/stackprotector.h> #include <linux/pgtable.h> #include <linux/clockchips.h> +#include <linux/kexec.h> #include <asm/ptrace.h> #include <linux/atomic.h> @@ -46,6 +47,7 @@ #include <asm/smp.h> #include <asm/time.h> #include <asm/machdep.h> +#include <asm/mmu_context.h> #include <asm/cputhreads.h> #include <asm/cputable.h> #include <asm/mpic.h> @@ -55,12 +57,13 @@ #endif #include <asm/vdso.h> #include <asm/debug.h> -#include <asm/kexec.h> #include <asm/cpu_has_feature.h> #include <asm/ftrace.h> #include <asm/kup.h> #include <asm/fadump.h> +#include <trace/events/ipi.h> + #ifdef DEBUG #include <asm/udbg.h> #define DBG(fmt...) udbg_printf(fmt) @@ -74,10 +77,10 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; #endif struct task_struct *secondary_current; -bool has_big_cores; -bool coregroup_enabled; -bool thread_group_shares_l2; -bool thread_group_shares_l3; +bool has_big_cores __ro_after_init; +bool coregroup_enabled __ro_after_init; +bool thread_group_shares_l2 __ro_after_init; +bool thread_group_shares_l3 __ro_after_init; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); @@ -90,15 +93,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); EXPORT_SYMBOL_GPL(has_big_cores); -enum { -#ifdef CONFIG_SCHED_SMT - smt_idx, -#endif - cache_idx, - mc_idx, - die_idx, -}; - #define MAX_THREAD_LIST_SIZE 8 #define THREAD_GROUP_SHARE_L1 1 #define THREAD_GROUP_SHARE_L2_L3 2 @@ -289,7 +283,7 @@ void smp_muxed_ipi_set_message(int cpu, int msg) * Order previous accesses before accesses in the IPI handler. */ smp_mb(); - message[msg] = 1; + WRITE_ONCE(message[msg], 1); } void smp_muxed_ipi_message_pass(int cpu, int msg) @@ -348,7 +342,7 @@ irqreturn_t smp_ipi_demux_relaxed(void) if (all & IPI_MESSAGE(PPC_MSG_NMI_IPI)) nmi_ipi_action(0, NULL); #endif - } while (info->messages); + } while (READ_ONCE(info->messages)); return IRQ_HANDLED; } @@ -364,12 +358,12 @@ static inline void do_message_pass(int cpu, int msg) #endif } -void smp_send_reschedule(int cpu) +void arch_smp_send_reschedule(int cpu) { if (likely(smp_ops)) do_message_pass(cpu, PPC_MSG_RESCHEDULE); } -EXPORT_SYMBOL_GPL(smp_send_reschedule); +EXPORT_SYMBOL_GPL(arch_smp_send_reschedule); void arch_send_call_function_single_ipi(int cpu) { @@ -415,9 +409,9 @@ noinstr static void nmi_ipi_lock_start(unsigned long *flags) { raw_local_irq_save(*flags); hard_irq_disable(); - while (arch_atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) { + while (raw_atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) { raw_local_irq_restore(*flags); - spin_until_cond(arch_atomic_read(&__nmi_ipi_lock) == 0); + spin_until_cond(raw_atomic_read(&__nmi_ipi_lock) == 0); raw_local_irq_save(*flags); hard_irq_disable(); } @@ -425,15 +419,15 @@ noinstr static void nmi_ipi_lock_start(unsigned long *flags) noinstr static void nmi_ipi_lock(void) { - while (arch_atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) - spin_until_cond(arch_atomic_read(&__nmi_ipi_lock) == 0); + while (raw_atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) + spin_until_cond(raw_atomic_read(&__nmi_ipi_lock) == 0); } noinstr static void nmi_ipi_unlock(void) { smp_mb(); - WARN_ON(arch_atomic_read(&__nmi_ipi_lock) != 1); - arch_atomic_set(&__nmi_ipi_lock, 0); + WARN_ON(raw_atomic_read(&__nmi_ipi_lock) != 1); + raw_atomic_set(&__nmi_ipi_lock, 0); } noinstr static void nmi_ipi_unlock_end(unsigned long *flags) @@ -619,20 +613,6 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *)) } #endif -#ifdef CONFIG_NMI_IPI -static void crash_stop_this_cpu(struct pt_regs *regs) -#else -static void crash_stop_this_cpu(void *dummy) -#endif -{ - /* - * Just busy wait here and avoid marking CPU as offline to ensure - * register data is captured appropriately. - */ - while (1) - cpu_relax(); -} - void crash_smp_send_stop(void) { static bool stopped = false; @@ -651,11 +631,14 @@ void crash_smp_send_stop(void) stopped = true; -#ifdef CONFIG_NMI_IPI - smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_stop_this_cpu, 1000000); -#else - smp_call_function(crash_stop_this_cpu, NULL, 0); -#endif /* CONFIG_NMI_IPI */ +#ifdef CONFIG_KEXEC_CORE + if (kexec_crash_image) { + crash_kexec_prepare(); + return; + } +#endif + + smp_send_stop(); } #ifdef CONFIG_NMI_IPI @@ -719,7 +702,7 @@ static struct task_struct *current_set[NR_CPUS]; static void smp_store_cpu_info(int id) { per_cpu(cpu_pvr, id) = mfspr(SPRN_PVR); -#ifdef CONFIG_PPC_FSL_BOOK3E +#ifdef CONFIG_PPC_E500 per_cpu(next_tlbcam_idx, id) = (mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY) - 1; #endif @@ -995,13 +978,13 @@ static int __init init_thread_group_cache_map(int cpu, int cache_property) return 0; } -static bool shared_caches; +static bool shared_caches __ro_after_init; #ifdef CONFIG_SCHED_SMT /* cpumask of CPUs with asymmetric SMT dependency */ static int powerpc_smt_flags(void) { - int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; + int flags = SD_SHARE_CPUCAPACITY | SD_SHARE_LLC; if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); @@ -1012,6 +995,13 @@ static int powerpc_smt_flags(void) #endif /* + * On shared processor LPARs scheduled on a big core (which has two or more + * independent thread groups per core), prefer lower numbered CPUs, so + * that workload consolidates to lesser number of cores. + */ +static __ro_after_init DEFINE_STATIC_KEY_FALSE(splpar_asym_pack); + +/* * P9 has a slightly odd architecture where pairs of cores share an L2 cache. * This topology makes it *much* cheaper to migrate tasks between adjacent cores * since the migrated task remains cache hot. We want to take advantage of this @@ -1019,7 +1009,18 @@ static int powerpc_smt_flags(void) */ static int powerpc_shared_cache_flags(void) { - return SD_SHARE_PKG_RESOURCES; + if (static_branch_unlikely(&splpar_asym_pack)) + return SD_SHARE_LLC | SD_ASYM_PACKING; + + return SD_SHARE_LLC; +} + +static int powerpc_shared_proc_flags(void) +{ + if (static_branch_unlikely(&splpar_asym_pack)) + return SD_ASYM_PACKING; + + return 0; } /* @@ -1045,6 +1046,10 @@ static struct cpumask *cpu_coregroup_mask(int cpu) static bool has_coregroup_support(void) { + /* Coregroup identification not available on shared systems */ + if (is_shared_processor()) + return 0; + return coregroup_enabled; } @@ -1053,16 +1058,6 @@ static const struct cpumask *cpu_mc_mask(int cpu) return cpu_coregroup_mask(cpu); } -static struct sched_domain_topology_level powerpc_topology[] = { -#ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, -#endif - { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, - { cpu_mc_mask, SD_INIT_NAME(MC) }, - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, - { NULL, }, -}; - static int __init init_big_cores(void) { int cpu; @@ -1096,7 +1091,7 @@ static int __init init_big_cores(void) void __init smp_prepare_cpus(unsigned int max_cpus) { - unsigned int cpu; + unsigned int cpu, num_threads; DBG("smp_prepare_cpus\n"); @@ -1163,6 +1158,12 @@ void __init smp_prepare_cpus(unsigned int max_cpus) if (smp_ops && smp_ops->probe) smp_ops->probe(); + + // Initalise the generic SMT topology support + num_threads = 1; + if (smt_enabled_at_boot) + num_threads = smt_enabled_at_boot; + cpu_smt_set_num_threads(num_threads, threads_per_core); } void smp_prepare_boot_cpu(void) @@ -1260,7 +1261,7 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) #ifdef CONFIG_PPC64 paca_ptrs[cpu]->__current = idle; paca_ptrs[cpu]->kstack = (unsigned long)task_stack_page(idle) + - THREAD_SIZE - STACK_FRAME_OVERHEAD; + THREAD_SIZE - STACK_FRAME_MIN_SIZE; #endif task_thread_info(idle)->cpu = cpu; secondary_current = current_set[cpu] = idle; @@ -1268,7 +1269,12 @@ static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) int __cpu_up(unsigned int cpu, struct task_struct *tidle) { - int rc, c; + const unsigned long boot_spin_ms = 5 * MSEC_PER_SEC; + const bool booting = system_state < SYSTEM_RUNNING; + const unsigned long hp_spin_ms = 1; + unsigned long deadline; + int rc; + const unsigned long spin_wait_ms = booting ? boot_spin_ms : hp_spin_ms; /* * Don't allow secondary threads to come online if inhibited @@ -1313,22 +1319,23 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) } /* - * wait to see if the cpu made a callin (is actually up). - * use this value that I found through experimentation. - * -- Cort + * At boot time, simply spin on the callin word until the + * deadline passes. + * + * At run time, spin for an optimistic amount of time to avoid + * sleeping in the common case. */ - if (system_state < SYSTEM_RUNNING) - for (c = 50000; c && !cpu_callin_map[cpu]; c--) - udelay(100); -#ifdef CONFIG_HOTPLUG_CPU - else - /* - * CPUs can take much longer to come up in the - * hotplug case. Wait five seconds. - */ - for (c = 5000; c && !cpu_callin_map[cpu]; c--) - msleep(1); -#endif + deadline = jiffies + msecs_to_jiffies(spin_wait_ms); + spin_until_cond(cpu_callin_map[cpu] || time_is_before_jiffies(deadline)); + + if (!cpu_callin_map[cpu] && system_state >= SYSTEM_RUNNING) { + const unsigned long sleep_interval_us = 10 * USEC_PER_MSEC; + const unsigned long sleep_wait_ms = 100 * MSEC_PER_SEC; + + deadline = jiffies + msecs_to_jiffies(sleep_wait_ms); + while (!cpu_callin_map[cpu] && time_is_after_jiffies(deadline)) + fsleep(sleep_interval_us); + } if (!cpu_callin_map[cpu]) { printk(KERN_ERR "Processor %u is stuck.\n", cpu); @@ -1591,7 +1598,7 @@ static void add_cpu_to_masks(int cpu) /* Skip all CPUs already part of current CPU core mask */ cpumask_andnot(mask, cpu_online_mask, cpu_core_mask(cpu)); - /* If chip_id is -1; limit the cpu_core_mask to within DIE*/ + /* If chip_id is -1; limit the cpu_core_mask to within PKG */ if (chip_id == -1) cpumask_and(mask, mask, cpu_cpu_mask(cpu)); @@ -1608,6 +1615,7 @@ static void add_cpu_to_masks(int cpu) } /* Activate a secondary processor. */ +__no_stack_protector void start_secondary(void *unused) { unsigned int cpu = raw_smp_processor_id(); @@ -1616,12 +1624,15 @@ void start_secondary(void *unused) if (IS_ENABLED(CONFIG_PPC32)) setup_kup(); - mmgrab(&init_mm); + mmgrab_lazy_tlb(&init_mm); current->active_mm = &init_mm; + VM_WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(&init_mm))); + cpumask_set_cpu(cpu, mm_cpumask(&init_mm)); + inc_mm_active_cpus(&init_mm); smp_store_cpu_info(cpu); set_dec(tb_ticks_per_jiffy); - rcu_cpu_starting(cpu); + rcutree_report_cpu_starting(cpu); cpu_callin_map[cpu] = 1; if (smp_ops->setup_cpu) @@ -1674,50 +1685,45 @@ void start_secondary(void *unused) BUG(); } -#ifdef CONFIG_PROFILING -int setup_profiling_timer(unsigned int multiplier) -{ - return 0; -} -#endif +static struct sched_domain_topology_level powerpc_topology[6]; -static void __init fixup_topology(void) +static void __init build_sched_topology(void) { - int i; + int i = 0; + + if (is_shared_processor() && has_big_cores) + static_branch_enable(&splpar_asym_pack); #ifdef CONFIG_SCHED_SMT if (has_big_cores) { pr_info("Big cores detected but using small core scheduling\n"); - powerpc_topology[smt_idx].mask = smallcore_smt_mask; + powerpc_topology[i++] = (struct sched_domain_topology_level){ + smallcore_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) + }; + } else { + powerpc_topology[i++] = (struct sched_domain_topology_level){ + cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) + }; } #endif + if (shared_caches) { + powerpc_topology[i++] = (struct sched_domain_topology_level){ + shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) + }; + } + if (has_coregroup_support()) { + powerpc_topology[i++] = (struct sched_domain_topology_level){ + cpu_mc_mask, powerpc_shared_proc_flags, SD_INIT_NAME(MC) + }; + } + powerpc_topology[i++] = (struct sched_domain_topology_level){ + cpu_cpu_mask, powerpc_shared_proc_flags, SD_INIT_NAME(PKG) + }; - if (!has_coregroup_support()) - powerpc_topology[mc_idx].mask = powerpc_topology[cache_idx].mask; - - /* - * Try to consolidate topology levels here instead of - * allowing scheduler to degenerate. - * - Dont consolidate if masks are different. - * - Dont consolidate if sd_flags exists and are different. - */ - for (i = 1; i <= die_idx; i++) { - if (powerpc_topology[i].mask != powerpc_topology[i - 1].mask) - continue; - - if (powerpc_topology[i].sd_flags && powerpc_topology[i - 1].sd_flags && - powerpc_topology[i].sd_flags != powerpc_topology[i - 1].sd_flags) - continue; - - if (!powerpc_topology[i - 1].sd_flags) - powerpc_topology[i - 1].sd_flags = powerpc_topology[i].sd_flags; + /* There must be one trailing NULL entry left. */ + BUG_ON(i >= ARRAY_SIZE(powerpc_topology) - 1); - powerpc_topology[i].mask = powerpc_topology[i + 1].mask; - powerpc_topology[i].sd_flags = powerpc_topology[i + 1].sd_flags; -#ifdef CONFIG_SCHED_DEBUG - powerpc_topology[i].name = powerpc_topology[i + 1].name; -#endif - } + set_sched_topology(powerpc_topology); } void __init smp_cpus_done(unsigned int max_cpus) @@ -1732,9 +1738,20 @@ void __init smp_cpus_done(unsigned int max_cpus) smp_ops->bringup_done(); dump_numa_cpu_topology(); + build_sched_topology(); +} - fixup_topology(); - set_sched_topology(powerpc_topology); +/* + * For asym packing, by default lower numbered CPU has higher priority. + * On shared processors, pack to lower numbered core. However avoid moving + * between thread_groups within the same core. + */ +int arch_asym_cpu_priority(int cpu) +{ + if (static_branch_unlikely(&splpar_asym_pack)) + return -cpu / threads_per_core; + + return -cpu; } #ifdef CONFIG_HOTPLUG_CPU @@ -1760,11 +1777,19 @@ int __cpu_disable(void) void __cpu_die(unsigned int cpu) { + /* + * This could perhaps be a generic call in idlea_task_dead(), but + * that requires testing from all archs, so first put it here to + */ + VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(&init_mm))); + dec_mm_active_cpus(&init_mm); + cpumask_clear_cpu(cpu, mm_cpumask(&init_mm)); + if (smp_ops->cpu_die) smp_ops->cpu_die(cpu); } -void arch_cpu_idle_dead(void) +void __noreturn arch_cpu_idle_dead(void) { /* * Disable on the down path. This will be re-enabled by |