diff options
author | 2025-02-25 16:03:25 -0800 | |
---|---|---|
committer | 2025-02-25 16:03:25 -0800 | |
commit | 0b119045b79a672bc6d8f18641c60fc8ce1b4585 (patch) | |
tree | 69c63ecfec55b9576c34dc742e0c38f46f8a317a /kernel/time | |
parent | Input: pm8941-pwrkey - fix dev_dbg() output in pm8941_pwrkey_irq() (diff) | |
parent | Linux 6.14-rc4 (diff) | |
download | wireguard-linux-0b119045b79a672bc6d8f18641c60fc8ce1b4585.tar.xz wireguard-linux-0b119045b79a672bc6d8f18641c60fc8ce1b4585.zip |
Merge tag 'v6.14-rc4' into next
Sync up with the mainline.
Diffstat (limited to 'kernel/time')
-rw-r--r-- | kernel/time/clocksource-wdtest.c | 3 | ||||
-rw-r--r-- | kernel/time/clocksource.c | 9 | ||||
-rw-r--r-- | kernel/time/hrtimer.c | 143 | ||||
-rw-r--r-- | kernel/time/posix-timers.c | 2 | ||||
-rw-r--r-- | kernel/time/tick-broadcast.c | 2 | ||||
-rw-r--r-- | kernel/time/timekeeping.c | 77 | ||||
-rw-r--r-- | kernel/time/timer.c | 18 | ||||
-rw-r--r-- | kernel/time/timer_migration.c | 78 | ||||
-rw-r--r-- | kernel/time/timer_migration.h | 21 |
9 files changed, 207 insertions, 146 deletions
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index 62e73444ffe4..38dae590b29f 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -137,7 +137,8 @@ static int wdtest_func(void *arg) udelay(1); j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1); - WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC)); + WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC), + "Expected at least 1000ns, got %lu.\n", j2 - j1); /* Verify tsc-like stability with various numbers of errors injected. */ max_retries = clocksource_get_max_watchdog_retry(); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 7304d7cf47f2..2a7802ec480c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -373,16 +373,18 @@ void clocksource_verify_percpu(struct clocksource *cs) cpumask_clear(&cpus_ahead); cpumask_clear(&cpus_behind); cpus_read_lock(); - preempt_disable(); + migrate_disable(); clocksource_verify_choose_cpus(); if (cpumask_empty(&cpus_chosen)) { - preempt_enable(); + migrate_enable(); cpus_read_unlock(); pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); return; } testcpu = smp_processor_id(); - pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", + cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); + preempt_disable(); for_each_cpu(cpu, &cpus_chosen) { if (cpu == testcpu) continue; @@ -402,6 +404,7 @@ void clocksource_verify_percpu(struct clocksource *cs) cs_nsec_min = cs_nsec; } preempt_enable(); + migrate_enable(); cpus_read_unlock(); if (!cpumask_empty(&cpus_ahead)) pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 80fe3749d2db..deb1aa32814e 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -58,6 +58,8 @@ #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) +static void retrigger_next_event(void *arg); + /* * The timer bases: * @@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, - } + }, + .csd = CSD_INIT(retrigger_next_event, NULL) }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { @@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_TAI] = HRTIMER_BASE_TAI, }; +static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) +{ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return true; + else + return likely(base->online); +} + /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -145,11 +156,6 @@ static struct hrtimer_cpu_base migration_cpu_base = { #define migration_base migration_cpu_base.clock_base[0] -static inline bool is_migration_base(struct hrtimer_clock_base *base) -{ - return base == &migration_base; -} - /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are @@ -183,27 +189,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } /* - * We do not migrate the timer when it is expiring before the next - * event on the target cpu. When high resolution is enabled, we cannot - * reprogram the target cpu hardware and we would cause it to fire - * late. To keep it simple, we handle the high resolution enabled and - * disabled case similar. + * Check if the elected target is suitable considering its next + * event and the hotplug state of the current CPU. + * + * If the elected target is remote and its next event is after the timer + * to queue, then a remote reprogram is necessary. However there is no + * guarantee the IPI handling the operation would arrive in time to meet + * the high resolution deadline. In this case the local CPU becomes a + * preferred target, unless it is offline. + * + * High and low resolution modes are handled the same way for simplicity. * * Called with cpu_base->lock of target cpu held. */ -static int -hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base, + struct hrtimer_cpu_base *new_cpu_base, + struct hrtimer_cpu_base *this_cpu_base) { ktime_t expires; + /* + * The local CPU clockevent can be reprogrammed. Also get_target_base() + * guarantees it is online. + */ + if (new_cpu_base == this_cpu_base) + return true; + + /* + * The offline local CPU can't be the default target if the + * next remote target event is after this timer. Keep the + * elected new base. An IPI will we issued to reprogram + * it as a last resort. + */ + if (!hrtimer_base_is_online(this_cpu_base)) + return true; + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires < new_base->cpu_base->expires_next; + + return expires >= new_base->cpu_base->expires_next; } -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, - int pinned) +static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { + if (!hrtimer_base_is_online(base)) { + int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); + + return &per_cpu(hrtimer_bases, cpu); + } + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); @@ -254,8 +287,8 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, + this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; @@ -264,8 +297,7 @@ again: } WRITE_ONCE(timer->base, new_base); } else { - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { new_cpu_base = this_cpu_base; goto again; } @@ -275,11 +307,6 @@ again: #else /* CONFIG_SMP */ -static inline bool is_migration_base(struct hrtimer_clock_base *base) -{ - return false; -} - static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) __acquires(&timer->base->cpu_base->lock) @@ -716,8 +743,6 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } -static void retrigger_next_event(void *arg); - /* * Switch to high resolution mode */ @@ -1067,11 +1092,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. * - * Returns 1 when the new timer is the leftmost timer in the tree. + * Returns true when the new timer is the leftmost timer in the tree. */ -static int enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base, - enum hrtimer_mode mode) +static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, + enum hrtimer_mode mode) { debug_activate(timer, mode); WARN_ON_ONCE(!base->cpu_base->online); @@ -1206,6 +1230,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { + struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *new_base; bool force_local, first; @@ -1217,10 +1242,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ - force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local = base->cpu_base == this_cpu_base; force_local &= base->cpu_base->next_timer == timer; /* + * Don't force local queuing if this enqueue happens on a unplugged + * CPU after hrtimer_cpu_dying() has been invoked. + */ + force_local &= this_cpu_base->online; + + /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. @@ -1249,8 +1280,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) - return first; + if (!force_local) { + /* + * If the current CPU base is online, then the timer is + * never queued on a remote CPU if it would be the first + * expiring timer there. + */ + if (hrtimer_base_is_online(this_cpu_base)) + return first; + + /* + * Timer was enqueued remote because the current base is + * already offline. If the timer is the first to expire, + * kick the remote CPU to reprogram the clock event. + */ + if (first) { + struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; + + smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); + } + return 0; + } /* * Timer was forced to stay on the current CPU to avoid @@ -1371,6 +1421,18 @@ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, } } +#ifdef CONFIG_SMP +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return base == &migration_base; +} +#else +static __always_inline bool is_migration_base(struct hrtimer_clock_base *base) +{ + return false; +} +#endif + /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was @@ -2202,6 +2264,15 @@ int hrtimers_prepare_cpu(unsigned int cpu) } cpu_base->cpu = cpu; + hrtimer_cpu_base_init_expiry_lock(cpu_base); + return 0; +} + +int hrtimers_cpu_starting(unsigned int cpu) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + + /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; cpu_base->hres_active = 0; cpu_base->hang_detected = 0; @@ -2210,7 +2281,6 @@ int hrtimers_prepare_cpu(unsigned int cpu) cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->online = 1; - hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } @@ -2286,5 +2356,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); + hrtimers_cpu_starting(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 881a9ce96af7..1b675aee99a9 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -538,7 +538,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * When the reference count reaches zero, the timer is scheduled * for RCU removal after the grace period. * - * Holding rcu_read_lock() accross the lookup ensures that + * Holding rcu_read_lock() across the lookup ensures that * the timer cannot be freed. * * The lookup validates locklessly that timr::it_signal == diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index ed58eebb4e8f..0207868c8b4d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -1020,6 +1020,8 @@ static inline ktime_t tick_get_next_period(void) /** * tick_broadcast_setup_oneshot - setup the broadcast device + * @bc: the broadcast device + * @from_periodic: true if called from periodic mode */ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc, bool from_periodic) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3d128825d343..1e67d076f195 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -485,91 +485,30 @@ u64 notrace ktime_get_tai_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns); -static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) +/** + * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + * + * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. + */ +u64 ktime_get_real_fast_ns(void) { + struct tk_fast *tkf = &tk_fast_mono; struct tk_read_base *tkr; - u64 basem, baser, delta; + u64 baser, delta; unsigned int seq; do { seq = raw_read_seqcount_latch(&tkf->seq); tkr = tkf->base + (seq & 0x01); - basem = ktime_to_ns(tkr->base); baser = ktime_to_ns(tkr->base_real); delta = timekeeping_get_ns(tkr); } while (raw_read_seqcount_latch_retry(&tkf->seq, seq)); - if (mono) - *mono = basem + delta; return baser + delta; } - -/** - * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. - * - * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering. - */ -u64 ktime_get_real_fast_ns(void) -{ - return __ktime_get_real_fast(&tk_fast_mono, NULL); -} EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns); /** - * ktime_get_fast_timestamps: - NMI safe timestamps - * @snapshot: Pointer to timestamp storage - * - * Stores clock monotonic, boottime and realtime timestamps. - * - * Boot time is a racy access on 32bit systems if the sleep time injection - * happens late during resume and not in timekeeping_resume(). That could - * be avoided by expanding struct tk_read_base with boot offset for 32bit - * and adding more overhead to the update. As this is a hard to observe - * once per resume event which can be filtered with reasonable effort using - * the accurate mono/real timestamps, it's probably not worth the trouble. - * - * Aside of that it might be possible on 32 and 64 bit to observe the - * following when the sleep time injection happens late: - * - * CPU 0 CPU 1 - * timekeeping_resume() - * ktime_get_fast_timestamps() - * mono, real = __ktime_get_real_fast() - * inject_sleep_time() - * update boot offset - * boot = mono + bootoffset; - * - * That means that boot time already has the sleep time adjustment, but - * real time does not. On the next readout both are in sync again. - * - * Preventing this for 64bit is not really feasible without destroying the - * careful cache layout of the timekeeper because the sequence count and - * struct tk_read_base would then need two cache lines instead of one. - * - * Access to the time keeper clock source is disabled across the innermost - * steps of suspend/resume. The accessors still work, but the timestamps - * are frozen until time keeping is resumed which happens very early. - * - * For regular suspend/resume there is no observable difference vs. sched - * clock, but it might affect some of the nasty low level debug printks. - * - * OTOH, access to sched clock is not guaranteed across suspend/resume on - * all systems either so it depends on the hardware in use. - * - * If that turns out to be a real problem then this could be mitigated by - * using sched clock in a similar way as during early boot. But it's not as - * trivial as on early boot because it needs some careful protection - * against the clock monotonic timestamp jumping backwards on resume. - */ -void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot) -{ - struct timekeeper *tk = &tk_core.timekeeper; - - snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono); - snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot)); -} - -/** * halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource. * @tk: Timekeeper to snapshot. * diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5860bf6d16f..c8f776dc6ee0 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -301,7 +301,7 @@ static int timer_migration_handler(const struct ctl_table *table, int write, return ret; } -static struct ctl_table timer_sysctl[] = { +static const struct ctl_table timer_sysctl[] = { { .procname = "timer_migration", .data = &sysctl_timer_migration, @@ -956,33 +956,29 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base, static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; - struct timer_base *base; - - base = per_cpu_ptr(&timer_bases[index], cpu); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) - base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); - return base; + index = BASE_DEF; + + return per_cpu_ptr(&timer_bases[index], cpu); } static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) { int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL; - struct timer_base *base; - - base = this_cpu_ptr(&timer_bases[index]); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) - base = this_cpu_ptr(&timer_bases[BASE_DEF]); - return base; + index = BASE_DEF; + + return this_cpu_ptr(&timer_bases[index]); } static inline struct timer_base *get_timer_base(u32 tflags) diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c index 8d57f7686bb0..2f6330831f08 100644 --- a/kernel/time/timer_migration.c +++ b/kernel/time/timer_migration.c @@ -534,8 +534,13 @@ static void __walk_groups(up_f up, struct tmigr_walk *data, break; child = group; - group = group->parent; + /* + * Pairs with the store release on group connection + * to make sure group initialization is visible. + */ + group = READ_ONCE(group->parent); data->childmask = child->groupmask; + WARN_ON_ONCE(!data->childmask); } while (group); } @@ -564,7 +569,7 @@ static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group) while ((node = timerqueue_getnext(&group->events))) { evt = container_of(node, struct tmigr_event, nextevt); - if (!evt->ignore) { + if (!READ_ONCE(evt->ignore)) { WRITE_ONCE(group->next_expiry, evt->nextevt.expires); return evt; } @@ -660,7 +665,7 @@ static bool tmigr_active_up(struct tmigr_group *group, * lock is held while updating the ignore flag in idle path. So this * state change will not be lost. */ - group->groupevt.ignore = true; + WRITE_ONCE(group->groupevt.ignore, true); return walk_done; } @@ -721,6 +726,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, union tmigr_state childstate, groupstate; bool remote = data->remote; bool walk_done = false; + bool ignore; u64 nextexp; if (child) { @@ -739,11 +745,19 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, nextexp = child->next_expiry; evt = &child->groupevt; - evt->ignore = (nextexp == KTIME_MAX) ? true : false; + /* + * This can race with concurrent idle exit (activate). + * If the current writer wins, a useless remote expiration may + * be scheduled. If the activate wins, the event is properly + * ignored. + */ + ignore = (nextexp == KTIME_MAX) ? true : false; + WRITE_ONCE(evt->ignore, ignore); } else { nextexp = data->nextexp; first_childevt = evt = data->evt; + ignore = evt->ignore; /* * Walking the hierarchy is required in any case when a @@ -769,7 +783,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, * first event information of the group is updated properly and * also handled properly, so skip this fast return path. */ - if (evt->ignore && !remote && group->parent) + if (ignore && !remote && group->parent) return true; raw_spin_lock(&group->lock); @@ -783,7 +797,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, * queue when the expiry time changed only or when it could be ignored. */ if (timerqueue_node_queued(&evt->nextevt)) { - if ((evt->nextevt.expires == nextexp) && !evt->ignore) { + if ((evt->nextevt.expires == nextexp) && !ignore) { /* Make sure not to miss a new CPU event with the same expiry */ evt->cpu = first_childevt->cpu; goto check_toplvl; @@ -793,7 +807,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child, WRITE_ONCE(group->next_expiry, KTIME_MAX); } - if (evt->ignore) { + if (ignore) { /* * When the next child event could be ignored (nextexp is * KTIME_MAX) and there was no remote timer handling before or @@ -1487,6 +1501,21 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl, s.seq = 0; atomic_set(&group->migr_state, s.state); + /* + * If this is a new top-level, prepare its groupmask in advance. + * This avoids accidents where yet another new top-level is + * created in the future and made visible before the current groupmask. + */ + if (list_empty(&tmigr_level_list[lvl])) { + group->groupmask = BIT(0); + /* + * The previous top level has prepared its groupmask already, + * simply account it as the first child. + */ + if (lvl > 0) + group->num_children = 1; + } + timerqueue_init_head(&group->events); timerqueue_init(&group->groupevt.nextevt); group->groupevt.nextevt.expires = KTIME_MAX; @@ -1550,8 +1579,25 @@ static void tmigr_connect_child_parent(struct tmigr_group *child, raw_spin_lock_irq(&child->lock); raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING); - child->parent = parent; - child->groupmask = BIT(parent->num_children++); + if (activate) { + /* + * @child is the old top and @parent the new one. In this + * case groupmask is pre-initialized and @child already + * accounted, along with its new sibling corresponding to the + * CPU going up. + */ + WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2); + } else { + /* Adding @child for the CPU going up to @parent. */ + child->groupmask = BIT(parent->num_children++); + } + + /* + * Make sure parent initialization is visible before publishing it to a + * racing CPU entering/exiting idle. This RELEASE barrier enforces an + * address dependency that pairs with the READ_ONCE() in __walk_groups(). + */ + smp_store_release(&child->parent, parent); raw_spin_unlock(&parent->lock); raw_spin_unlock_irq(&child->lock); @@ -1624,13 +1670,14 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) * be different from tmigr_hierarchy_levels, contains only a * single group. */ - if (group->parent || i == tmigr_hierarchy_levels || - (list_empty(&tmigr_level_list[i]) && - list_is_singular(&tmigr_level_list[i - 1]))) + if (group->parent || list_is_singular(&tmigr_level_list[i - 1])) break; } while (i < tmigr_hierarchy_levels); + /* Assert single root */ + WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])); + while (i > 0) { group = stack[--i]; @@ -1672,7 +1719,12 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node) WARN_ON_ONCE(top == 0); lvllist = &tmigr_level_list[top]; - if (group->num_children == 1 && list_is_singular(lvllist)) { + + /* + * Newly created root level should have accounted the upcoming + * CPU's child group and pre-accounted the old root. + */ + if (group->num_children == 2 && list_is_singular(lvllist)) { /* * The target CPU must never do the prepare work, except * on early boot when the boot CPU is the target. Otherwise diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h index 154accc7a543..ae19f70f8170 100644 --- a/kernel/time/timer_migration.h +++ b/kernel/time/timer_migration.h @@ -110,22 +110,19 @@ struct tmigr_cpu { * union tmigr_state - state of tmigr_group * @state: Combined version of the state - only used for atomic * read/cmpxchg function - * @struct: Split version of the state - only use the struct members to + * &anon struct: Split version of the state - only use the struct members to * update information to stay independent of endianness + * @active: Contains each mask bit of the active children + * @migrator: Contains mask of the child which is migrator + * @seq: Sequence counter needs to be increased when an update + * to the tmigr_state is done. It prevents a race when + * updates in the child groups are propagated in changed + * order. Detailed information about the scenario is + * given in the documentation at the begin of + * timer_migration.c. */ union tmigr_state { u32 state; - /** - * struct - split state of tmigr_group - * @active: Contains each mask bit of the active children - * @migrator: Contains mask of the child which is migrator - * @seq: Sequence counter needs to be increased when an update - * to the tmigr_state is done. It prevents a race when - * updates in the child groups are propagated in changed - * order. Detailed information about the scenario is - * given in the documentation at the begin of - * timer_migration.c. - */ struct { u8 active; u8 migrator; |