From 26db62f179d112d345031e14926a4cda9cd40d6e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:58:51 -0700 Subject: oom: keep mm of the killed task available oom_reap_task has to call exit_oom_victim in order to make sure that the oom vicim will not block the oom killer for ever. This is, however, opening new problems (e.g oom_killer_disable exclusion - see commit 74070542099c ("oom, suspend: fix oom_reaper vs. oom_killer_disable race")). exit_oom_victim should be only called from the victim's context ideally. One way to achieve this would be to rely on per mm_struct flags. We already have MMF_OOM_REAPED to hide a task from the oom killer since "mm, oom: hide mm which is shared with kthread or global init". The problem is that the exit path: do_exit exit_mm tsk->mm = NULL; mmput __mmput exit_oom_victim doesn't guarantee that exit_oom_victim will get called in a bounded amount of time. At least exit_aio depends on IO which might get blocked due to lack of memory and who knows what else is lurking there. This patch takes a different approach. We remember tsk->mm into the signal_struct and bind it to the signal struct life time for all oom victims. __oom_reap_task_mm as well as oom_scan_process_thread do not have to rely on find_lock_task_mm anymore and they will have a reliable reference to the mm struct. As a result all the oom specific communication inside the OOM killer can be done via tsk->signal->oom_mm. Increasing the signal_struct for something as unlikely as the oom killer is far from ideal but this approach will make the code much more reasonable and long term we even might want to move task->mm into the signal_struct anyway. In the next step we might want to make the oom killer exclusion and access to memory reserves completely independent which would be also nice. Link: http://lkml.kernel.org/r/1472119394-11342-4-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 9a05bd93f8e7..48cafe787b75 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -359,6 +359,8 @@ static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); sched_autogroup_exit(sig); + if (sig->oom_mm) + mmdrop(sig->oom_mm); kmem_cache_free(signal_cachep, sig); } -- cgit v1.2.3-59-g8ed1b From 7283094ec3db318e87ec9e31cf75f136ac2a4dd3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:58:54 -0700 Subject: kernel, oom: fix potential pgd_lock deadlock from __mmdrop Lockdep complains that __mmdrop is not safe from the softirq context: ================================= [ INFO: inconsistent lock state ] 4.6.0-oomfortification2-00011-geeb3eadeab96-dirty #949 Tainted: G W --------------------------------- inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. swapper/1/0 [HC0[0]:SC1[1]:HE1:SE0] takes: (pgd_lock){+.?...}, at: pgd_free+0x19/0x6b {SOFTIRQ-ON-W} state was registered at: __lock_acquire+0xa06/0x196e lock_acquire+0x139/0x1e1 _raw_spin_lock+0x32/0x41 __change_page_attr_set_clr+0x2a5/0xacd change_page_attr_set_clr+0x16f/0x32c set_memory_nx+0x37/0x3a free_init_pages+0x9e/0xc7 alternative_instructions+0xa2/0xb3 check_bugs+0xe/0x2d start_kernel+0x3ce/0x3ea x86_64_start_reservations+0x2a/0x2c x86_64_start_kernel+0x17a/0x18d irq event stamp: 105916 hardirqs last enabled at (105916): free_hot_cold_page+0x37e/0x390 hardirqs last disabled at (105915): free_hot_cold_page+0x2c1/0x390 softirqs last enabled at (105878): _local_bh_enable+0x42/0x44 softirqs last disabled at (105879): irq_exit+0x6f/0xd1 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(pgd_lock); lock(pgd_lock); *** DEADLOCK *** 1 lock held by swapper/1/0: #0: (rcu_callback){......}, at: rcu_process_callbacks+0x390/0x800 stack backtrace: CPU: 1 PID: 0 Comm: swapper/1 Tainted: G W 4.6.0-oomfortification2-00011-geeb3eadeab96-dirty #949 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Debian-1.8.2-1 04/01/2014 Call Trace: print_usage_bug.part.25+0x259/0x268 mark_lock+0x381/0x567 __lock_acquire+0x993/0x196e lock_acquire+0x139/0x1e1 _raw_spin_lock+0x32/0x41 pgd_free+0x19/0x6b __mmdrop+0x25/0xb9 __put_task_struct+0x103/0x11e delayed_put_task_struct+0x157/0x15e rcu_process_callbacks+0x660/0x800 __do_softirq+0x1ec/0x4d5 irq_exit+0x6f/0xd1 smp_apic_timer_interrupt+0x42/0x4d apic_timer_interrupt+0x8e/0xa0 arch_cpu_idle+0xf/0x11 default_idle_call+0x32/0x34 cpu_startup_entry+0x20c/0x399 start_secondary+0xfe/0x101 More over commit a79e53d85683 ("x86/mm: Fix pgd_lock deadlock") was explicit about pgd_lock not to be called from the irq context. This means that __mmdrop called from free_signal_struct has to be postponed to a user context. We already have a similar mechanism for mmput_async so we can use it here as well. This is safe because mm_count is pinned by mm_users. This fixes bug introduced by "oom: keep mm of the killed task available" Link: http://lkml.kernel.org/r/1472119394-11342-5-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 -- include/linux/sched.h | 14 ++++++++++++++ kernel/fork.c | 6 +++++- 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 903200f4ec41..4a8acedf4b7d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -515,9 +515,7 @@ struct mm_struct { #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif -#ifdef CONFIG_MMU struct work_struct async_put_work; -#endif }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/include/linux/sched.h b/include/linux/sched.h index 67ea79610e67..c4b588358296 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2877,6 +2877,20 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +static inline void mmdrop_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); + __mmdrop(mm); +} + +static inline void mmdrop_async(struct mm_struct *mm) +{ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) { + INIT_WORK(&mm->async_put_work, mmdrop_async_fn); + schedule_work(&mm->async_put_work); + } +} + static inline bool mmget_not_zero(struct mm_struct *mm) { return atomic_inc_not_zero(&mm->mm_users); diff --git a/kernel/fork.c b/kernel/fork.c index 48cafe787b75..5650e35dda43 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -359,8 +359,12 @@ static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); sched_autogroup_exit(sig); + /* + * __mmdrop is not safe to call from softirq context on x86 due to + * pgd_dtor so postpone it to the async context + */ if (sig->oom_mm) - mmdrop(sig->oom_mm); + mmdrop_async(sig->oom_mm); kmem_cache_free(signal_cachep, sig); } -- cgit v1.2.3-59-g8ed1b From 862e3073b3eed13f17bd6be6ca6052db15c0b728 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:58:57 -0700 Subject: mm, oom: get rid of signal_struct::oom_victims After "oom: keep mm of the killed task available" we can safely detect an oom victim by checking task->signal->oom_mm so we do not need the signal_struct counter anymore so let's get rid of it. This alone wouldn't be sufficient for nommu archs because exit_oom_victim doesn't hide the process from the oom killer anymore. We can, however, mark the mm with a MMF flag in __mmput. We can reuse MMF_OOM_REAPED and rename it to a more generic MMF_OOM_SKIP. Link: http://lkml.kernel.org/r/1472119394-11342-6-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 5 +++++ include/linux/sched.h | 3 +-- kernel/fork.c | 1 + mm/oom_kill.c | 17 +++++++---------- 4 files changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/oom.h b/include/linux/oom.h index 17946e5121b6..b61357d07170 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -58,6 +58,11 @@ static inline bool oom_task_origin(const struct task_struct *p) return p->signal->oom_flag_origin; } +static inline bool tsk_is_oom_victim(struct task_struct * tsk) +{ + return tsk->signal->oom_mm; +} + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/include/linux/sched.h b/include/linux/sched.h index c4b588358296..af0721364788 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -524,7 +524,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ -#define MMF_OOM_REAPED 21 /* mm has been already reaped */ +#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) @@ -672,7 +672,6 @@ struct signal_struct { atomic_t sigcnt; atomic_t live; int nr_threads; - atomic_t oom_victims; /* # of TIF_MEDIE threads in this thread group */ struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ diff --git a/kernel/fork.c b/kernel/fork.c index 5650e35dda43..9a8ec66cd4df 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -862,6 +862,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + set_bit(MMF_OOM_SKIP, &mm->flags); mmdrop(mm); } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f16ec0840a0e..e2a2c35dd493 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -186,7 +186,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, */ adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN || - test_bit(MMF_OOM_REAPED, &p->mm->flags) || + test_bit(MMF_OOM_SKIP, &p->mm->flags) || in_vfork(p)) { task_unlock(p); return 0; @@ -296,11 +296,11 @@ static int oom_evaluate_task(struct task_struct *task, void *arg) /* * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves unless - * the task has MMF_OOM_REAPED because chances that it would release + * the task has MMF_OOM_SKIP because chances that it would release * any memory is quite low. */ - if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) { - if (test_bit(MMF_OOM_REAPED, &task->signal->oom_mm->flags)) + if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) { + if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags)) goto next; goto abort; } @@ -572,7 +572,7 @@ done: * Hide this mm from OOM killer because it has been either reaped or * somebody can't call up_write(mmap_sem). */ - set_bit(MMF_OOM_REAPED, &mm->flags); + set_bit(MMF_OOM_SKIP, &mm->flags); /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); @@ -654,8 +654,6 @@ static void mark_oom_victim(struct task_struct *tsk) if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) return; - atomic_inc(&tsk->signal->oom_victims); - /* oom_mm is bound to the signal struct life time. */ if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) atomic_inc(&tsk->signal->oom_mm->mm_count); @@ -677,7 +675,6 @@ void exit_oom_victim(struct task_struct *tsk) { if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) return; - atomic_dec(&tsk->signal->oom_victims); if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); @@ -769,7 +766,7 @@ static bool task_will_free_mem(struct task_struct *task) * This task has already been drained by the oom reaper so there are * only small chances it will free some more */ - if (test_bit(MMF_OOM_REAPED, &mm->flags)) + if (test_bit(MMF_OOM_SKIP, &mm->flags)) return false; if (atomic_read(&mm->mm_users) <= 1) @@ -906,7 +903,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) * killer to guarantee OOM forward progress. */ can_oom_reap = false; - set_bit(MMF_OOM_REAPED, &mm->flags); + set_bit(MMF_OOM_SKIP, &mm->flags); pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n", task_pid_nr(victim), victim->comm, task_pid_nr(p), p->comm); -- cgit v1.2.3-59-g8ed1b From 7d2e7a22cf27e7569e6816ccc05dd74248048b30 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 7 Oct 2016 16:59:00 -0700 Subject: oom, suspend: fix oom_killer_disable vs. pm suspend properly Commit 74070542099c ("oom, suspend: fix oom_reaper vs. oom_killer_disable race") has workaround an existing race between oom_killer_disable and oom_reaper by adding another round of try_to_freeze_tasks after the oom killer was disabled. This was the easiest thing to do for a late 4.7 fix. Let's fix it properly now. After "oom: keep mm of the killed task available" we no longer have to call exit_oom_victim from the oom reaper because we have stable mm available and hide the oom_reaped mm by MMF_OOM_SKIP flag. So let's remove exit_oom_victim and the race described in the above commit doesn't exist anymore if. Unfortunately this alone is not sufficient for the oom_killer_disable usecase because now we do not have any reliable way to reach exit_oom_victim (the victim might get stuck on a way to exit for an unbounded amount of time). OOM killer can cope with that by checking mm flags and move on to another victim but we cannot do the same for oom_killer_disable as we would lose the guarantee of no further interference of the victim with the rest of the system. What we can do instead is to cap the maximum time the oom_killer_disable waits for victims. The only current user of this function (pm suspend) already has a concept of timeout for back off so we can reuse the same value there. Let's drop set_freezable for the oom_reaper kthread because it is no longer needed as the reaper doesn't wake or thaw any processes. Link: http://lkml.kernel.org/r/1472119394-11342-7-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 +- kernel/power/process.c | 17 +++-------------- mm/oom_kill.c | 40 ++++++++++++++++++++-------------------- 3 files changed, 24 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/oom.h b/include/linux/oom.h index b61357d07170..0f1b9da108e4 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -74,7 +74,7 @@ extern void exit_oom_victim(struct task_struct *tsk); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); -extern bool oom_killer_disable(void); +extern bool oom_killer_disable(signed long timeout); extern void oom_killer_enable(void); extern struct task_struct *find_lock_task_mm(struct task_struct *p); diff --git a/kernel/power/process.c b/kernel/power/process.c index 8f27d5a8adf6..2fba066e125f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -144,23 +144,12 @@ int freeze_processes(void) /* * Now that the whole userspace is frozen we need to disbale * the OOM killer to disallow any further interference with - * killable tasks. + * killable tasks. There is no guarantee oom victims will + * ever reach a point they go away we have to wait with a timeout. */ - if (!error && !oom_killer_disable()) + if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs))) error = -EBUSY; - /* - * There is a hard to fix race between oom_reaper kernel thread - * and oom_killer_disable. oom_reaper calls exit_oom_victim - * before the victim reaches exit_mm so try to freeze all the tasks - * again and catch such a left over task. - */ - if (!error) { - pr_info("Double checking all user space processes after OOM killer disable... "); - error = try_to_freeze_tasks(true); - pr_cont("\n"); - } - if (error) thaw_processes(); return error; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e2a2c35dd493..895a51fe8e18 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -559,14 +559,7 @@ static void oom_reap_task(struct task_struct *tsk) debug_show_all_locks(); done: - /* - * Clear TIF_MEMDIE because the task shouldn't be sitting on a - * reasonably reclaimable memory anymore or it is not a good candidate - * for the oom victim right now because it cannot release its memory - * itself nor by the oom reaper. - */ tsk->oom_reaper_list = NULL; - exit_oom_victim(tsk); /* * Hide this mm from OOM killer because it has been either reaped or @@ -580,8 +573,6 @@ done: static int oom_reaper(void *unused) { - set_freezable(); - while (true) { struct task_struct *tsk = NULL; @@ -680,11 +671,21 @@ void exit_oom_victim(struct task_struct *tsk) wake_up_all(&oom_victims_wait); } +/** + * oom_killer_enable - enable OOM killer + */ +void oom_killer_enable(void) +{ + oom_killer_disabled = false; +} + /** * oom_killer_disable - disable OOM killer + * @timeout: maximum timeout to wait for oom victims in jiffies * * Forces all page allocations to fail rather than trigger OOM killer. - * Will block and wait until all OOM victims are killed. + * Will block and wait until all OOM victims are killed or the given + * timeout expires. * * The function cannot be called when there are runnable user tasks because * the userspace would see unexpected allocation failures as a result. Any @@ -693,8 +694,10 @@ void exit_oom_victim(struct task_struct *tsk) * Returns true if successful and false if the OOM killer cannot be * disabled. */ -bool oom_killer_disable(void) +bool oom_killer_disable(signed long timeout) { + signed long ret; + /* * Make sure to not race with an ongoing OOM killer. Check that the * current is not killed (possibly due to sharing the victim's memory). @@ -704,19 +707,16 @@ bool oom_killer_disable(void) oom_killer_disabled = true; mutex_unlock(&oom_lock); - wait_event(oom_victims_wait, !atomic_read(&oom_victims)); + ret = wait_event_interruptible_timeout(oom_victims_wait, + !atomic_read(&oom_victims), timeout); + if (ret <= 0) { + oom_killer_enable(); + return false; + } return true; } -/** - * oom_killer_enable - enable OOM killer - */ -void oom_killer_enable(void) -{ - oom_killer_disabled = false; -} - static inline bool __task_will_free_mem(struct task_struct *task) { struct signal_struct *sig = task->signal; -- cgit v1.2.3-59-g8ed1b From 38531201c12144cd7d96abfdfe7449c2b01375e8 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 7 Oct 2016 16:59:03 -0700 Subject: mm, oom: enforce exit_oom_victim on current task There are no users of exit_oom_victim on !current task anymore so enforce the API to always work on the current. Link: http://lkml.kernel.org/r/1472119394-11342-8-git-send-email-mhocko@kernel.org Signed-off-by: Tetsuo Handa Signed-off-by: Michal Hocko Cc: Oleg Nesterov Cc: David Rientjes Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 +- kernel/exit.c | 2 +- mm/oom_kill.c | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/oom.h b/include/linux/oom.h index 0f1b9da108e4..b4e36e92bc87 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -69,7 +69,7 @@ extern unsigned long oom_badness(struct task_struct *p, extern bool out_of_memory(struct oom_control *oc); -extern void exit_oom_victim(struct task_struct *tsk); +extern void exit_oom_victim(void); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); diff --git a/kernel/exit.c b/kernel/exit.c index 1e1d913914c0..9d68c45ebbe3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -511,7 +511,7 @@ static void exit_mm(struct task_struct *tsk) mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) - exit_oom_victim(tsk); + exit_oom_victim(); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 895a51fe8e18..3b990544db6d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -662,10 +662,9 @@ static void mark_oom_victim(struct task_struct *tsk) /** * exit_oom_victim - note the exit of an OOM victim */ -void exit_oom_victim(struct task_struct *tsk) +void exit_oom_victim(void) { - if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) - return; + clear_thread_flag(TIF_MEMDIE); if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); -- cgit v1.2.3-59-g8ed1b From 6fcb52a56ff60d240f06296b12827e7f20d45f63 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Fri, 7 Oct 2016 17:00:08 -0700 Subject: thp: reduce usage of huge zero page's atomic counter The global zero page is used to satisfy an anonymous read fault. If THP(Transparent HugePage) is enabled then the global huge zero page is used. The global huge zero page uses an atomic counter for reference counting and is allocated/freed dynamically according to its counter value. CPU time spent on that counter will greatly increase if there are a lot of processes doing anonymous read faults. This patch proposes a way to reduce the access to the global counter so that the CPU load can be reduced accordingly. To do this, a new flag of the mm_struct is introduced: MMF_USED_HUGE_ZERO_PAGE. With this flag, the process only need to touch the global counter in two cases: 1 The first time it uses the global huge zero page; 2 The time when mm_user of its mm_struct reaches zero. Note that right now, the huge zero page is eligible to be freed as soon as its last use goes away. With this patch, the page will not be eligible to be freed until the exit of the last process from which it was ever used. And with the use of mm_user, the kthread is not eligible to use huge zero page either. Since no kthread is using huge zero page today, there is no difference after applying this patch. But if that is not desired, I can change it to when mm_count reaches zero. Case used for test on Haswell EP: usemem -n 72 --readonly -j 0x200000 100G Which spawns 72 processes and each will mmap 100G anonymous space and then do read only access to that space sequentially with a step of 2MB. CPU cycles from perf report for base commit: 54.03% usemem [kernel.kallsyms] [k] get_huge_zero_page CPU cycles from perf report for this commit: 0.11% usemem [kernel.kallsyms] [k] mm_get_huge_zero_page Performance(throughput) of the workload for base commit: 1784430792 Performance(throughput) of the workload for this commit: 4726928591 164% increase. Runtime of the workload for base commit: 707592 us Runtime of the workload for this commit: 303970 us 50% drop. Link: http://lkml.kernel.org/r/fe51a88f-446a-4622-1363-ad1282d71385@intel.com Signed-off-by: Aaron Lu Cc: Sergey Senozhatsky Cc: "Kirill A. Shutemov" Cc: Dave Hansen Cc: Tim Chen Cc: Huang Ying Cc: Vlastimil Babka Cc: Jerome Marchand Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Ebru Akagunduz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 2 +- include/linux/huge_mm.h | 8 ++++---- include/linux/sched.h | 1 + kernel/fork.c | 1 + mm/huge_memory.c | 36 +++++++++++++++++++++++++----------- mm/swap.c | 4 +--- mm/swap_state.c | 4 +--- 7 files changed, 34 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/fs/dax.c b/fs/dax.c index cc025f82ef07..014defd2e744 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1036,7 +1036,7 @@ int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, if (!write && !buffer_mapped(&bh)) { spinlock_t *ptl; pmd_t entry; - struct page *zero_page = get_huge_zero_page(); + struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm); if (unlikely(!zero_page)) { dax_pmd_dbg(&bh, address, "no zero page"); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 4fca5263fd42..9b9f65d99873 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -156,8 +156,8 @@ static inline bool is_huge_zero_pmd(pmd_t pmd) return is_huge_zero_page(pmd_page(pmd)); } -struct page *get_huge_zero_page(void); -void put_huge_zero_page(void); +struct page *mm_get_huge_zero_page(struct mm_struct *mm); +void mm_put_huge_zero_page(struct mm_struct *mm); #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) @@ -220,9 +220,9 @@ static inline bool is_huge_zero_page(struct page *page) return false; } -static inline void put_huge_zero_page(void) +static inline void mm_put_huge_zero_page(struct mm_struct *mm) { - BUILD_BUG(); + return; } static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, diff --git a/include/linux/sched.h b/include/linux/sched.h index 6bee6f988912..348f51b0ec92 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -526,6 +526,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ +#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/kernel/fork.c b/kernel/fork.c index 9a8ec66cd4df..6d42242485cb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -854,6 +854,7 @@ static inline void __mmput(struct mm_struct *mm) ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); + mm_put_huge_zero_page(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a0b0e562407d..12b9f1a39b63 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -59,7 +59,7 @@ static struct shrinker deferred_split_shrinker; static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; -struct page *get_huge_zero_page(void) +static struct page *get_huge_zero_page(void) { struct page *zero_page; retry: @@ -86,7 +86,7 @@ retry: return READ_ONCE(huge_zero_page); } -void put_huge_zero_page(void) +static void put_huge_zero_page(void) { /* * Counter should never go to zero here. Only shrinker can put @@ -95,6 +95,26 @@ void put_huge_zero_page(void) BUG_ON(atomic_dec_and_test(&huge_zero_refcount)); } +struct page *mm_get_huge_zero_page(struct mm_struct *mm) +{ + if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + return READ_ONCE(huge_zero_page); + + if (!get_huge_zero_page()) + return NULL; + + if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + put_huge_zero_page(); + + return READ_ONCE(huge_zero_page); +} + +void mm_put_huge_zero_page(struct mm_struct *mm) +{ + if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags)) + put_huge_zero_page(); +} + static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -644,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) pgtable = pte_alloc_one(vma->vm_mm, haddr); if (unlikely(!pgtable)) return VM_FAULT_OOM; - zero_page = get_huge_zero_page(); + zero_page = mm_get_huge_zero_page(vma->vm_mm); if (unlikely(!zero_page)) { pte_free(vma->vm_mm, pgtable); count_vm_event(THP_FAULT_FALLBACK); @@ -666,10 +686,8 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) } } else spin_unlock(fe->ptl); - if (!set) { + if (!set) pte_free(vma->vm_mm, pgtable); - put_huge_zero_page(); - } return ret; } gfp = alloc_hugepage_direct_gfpmask(vma); @@ -823,7 +841,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, * since we already have a zero page to copy. It just takes a * reference. */ - zero_page = get_huge_zero_page(); + zero_page = mm_get_huge_zero_page(dst_mm); set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd, zero_page); ret = 0; @@ -1081,7 +1099,6 @@ alloc: update_mmu_cache_pmd(vma, fe->address, fe->pmd); if (!page) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - put_huge_zero_page(); } else { VM_BUG_ON_PAGE(!PageHead(page), page); page_remove_rmap(page, true); @@ -1542,7 +1559,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, } smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - put_huge_zero_page(); } static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, @@ -1565,8 +1581,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (!vma_is_anonymous(vma)) { _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); - if (is_huge_zero_pmd(_pmd)) - put_huge_zero_page(); if (vma_is_dax(vma)) return; page = pmd_page(_pmd); diff --git a/mm/swap.c b/mm/swap.c index 75c63bb2a1da..4dcf852e1e6d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -748,10 +748,8 @@ void release_pages(struct page **pages, int nr, bool cold) locked_pgdat = NULL; } - if (is_huge_zero_page(page)) { - put_huge_zero_page(); + if (is_huge_zero_page(page)) continue; - } page = compound_head(page); if (!put_page_testzero(page)) diff --git a/mm/swap_state.c b/mm/swap_state.c index 268b8191982b..8679c997eab6 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -254,9 +254,7 @@ static inline void free_swap_cache(struct page *page) void free_page_and_swap_cache(struct page *page) { free_swap_cache(page); - if (is_huge_zero_page(page)) - put_huge_zero_page(); - else + if (!is_huge_zero_page(page)) put_page(page); } -- cgit v1.2.3-59-g8ed1b From 6727ad9e206cc08b80d8000a4d67f8417e53539d Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Fri, 7 Oct 2016 17:02:55 -0700 Subject: nmi_backtrace: generate one-line reports for idle cpus When doing an nmi backtrace of many cores, most of which are idle, the output is a little overwhelming and very uninformative. Suppress messages for cpus that are idling when they are interrupted and just emit one line, "NMI backtrace for N skipped: idling at pc 0xNNN". We do this by grouping all the cpuidle code together into a new .cpuidle.text section, and then checking the address of the interrupted PC to see if it lies within that section. This commit suitably tags x86 and tile idle routines, and only adds in the minimal framework for other architectures. Link: http://lkml.kernel.org/r/1472487169-14923-5-git-send-email-cmetcalf@mellanox.com Signed-off-by: Chris Metcalf Acked-by: Peter Zijlstra (Intel) Tested-by: Peter Zijlstra (Intel) Tested-by: Daniel Thompson [arm] Tested-by: Petr Mladek Cc: Aaron Tomlin Cc: Peter Zijlstra (Intel) Cc: "Rafael J. Wysocki" Cc: Russell King Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/vmlinux.lds.S | 1 + arch/arc/kernel/vmlinux.lds.S | 1 + arch/arm/kernel/vmlinux-xip.lds.S | 1 + arch/arm/kernel/vmlinux.lds.S | 1 + arch/arm64/kernel/vmlinux.lds.S | 1 + arch/avr32/kernel/vmlinux.lds.S | 1 + arch/blackfin/kernel/vmlinux.lds.S | 1 + arch/c6x/kernel/vmlinux.lds.S | 1 + arch/cris/kernel/vmlinux.lds.S | 1 + arch/frv/kernel/vmlinux.lds.S | 1 + arch/h8300/kernel/vmlinux.lds.S | 1 + arch/hexagon/kernel/vmlinux.lds.S | 1 + arch/ia64/kernel/vmlinux.lds.S | 1 + arch/m32r/kernel/vmlinux.lds.S | 1 + arch/m68k/kernel/vmlinux-nommu.lds | 1 + arch/m68k/kernel/vmlinux-std.lds | 1 + arch/m68k/kernel/vmlinux-sun3.lds | 1 + arch/metag/kernel/vmlinux.lds.S | 1 + arch/microblaze/kernel/vmlinux.lds.S | 1 + arch/mips/kernel/vmlinux.lds.S | 1 + arch/mn10300/kernel/vmlinux.lds.S | 1 + arch/nios2/kernel/vmlinux.lds.S | 1 + arch/openrisc/kernel/vmlinux.lds.S | 1 + arch/parisc/kernel/vmlinux.lds.S | 1 + arch/powerpc/kernel/vmlinux.lds.S | 1 + arch/s390/kernel/vmlinux.lds.S | 1 + arch/score/kernel/vmlinux.lds.S | 1 + arch/sh/kernel/vmlinux.lds.S | 1 + arch/sparc/kernel/vmlinux.lds.S | 1 + arch/tile/kernel/entry.S | 2 +- arch/tile/kernel/vmlinux.lds.S | 1 + arch/um/kernel/dyn.lds.S | 1 + arch/um/kernel/uml.lds.S | 1 + arch/unicore32/kernel/vmlinux.lds.S | 1 + arch/x86/include/asm/irqflags.h | 12 ++++++++---- arch/x86/kernel/acpi/cstate.c | 2 +- arch/x86/kernel/process.c | 4 ++-- arch/x86/kernel/vmlinux.lds.S | 1 + arch/xtensa/kernel/vmlinux.lds.S | 3 +++ drivers/acpi/processor_idle.c | 5 +++-- drivers/cpuidle/driver.c | 5 +++-- drivers/idle/intel_idle.c | 4 ++-- include/asm-generic/vmlinux.lds.h | 6 ++++++ include/linux/cpu.h | 5 +++++ kernel/sched/idle.c | 13 +++++++++++-- lib/nmi_backtrace.c | 16 +++++++++++----- scripts/mod/modpost.c | 2 +- scripts/recordmcount.c | 1 + scripts/recordmcount.pl | 1 + 49 files changed, 93 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 647b84c15382..cebecfb76fbf 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -22,6 +22,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) diff --git a/arch/arc/kernel/vmlinux.lds.S b/arch/arc/kernel/vmlinux.lds.S index 36611072305f..f35ed578e007 100644 --- a/arch/arc/kernel/vmlinux.lds.S +++ b/arch/arc/kernel/vmlinux.lds.S @@ -89,6 +89,7 @@ SECTIONS _text = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT *(.fixup) diff --git a/arch/arm/kernel/vmlinux-xip.lds.S b/arch/arm/kernel/vmlinux-xip.lds.S index cba1ec899a69..7fa487ef7e2f 100644 --- a/arch/arm/kernel/vmlinux-xip.lds.S +++ b/arch/arm/kernel/vmlinux-xip.lds.S @@ -98,6 +98,7 @@ SECTIONS IRQENTRY_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT *(.gnu.warning) diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index d24e5dd2aa7a..f7f55df0bf7b 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -111,6 +111,7 @@ SECTIONS SOFTIRQENTRY_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT HYPERVISOR_TEXT KPROBES_TEXT diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 5ce9b2929e0d..1105aab1e6d6 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -122,6 +122,7 @@ SECTIONS ENTRY_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT HYPERVISOR_TEXT diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S index a4589176bed5..17f2730eb497 100644 --- a/arch/avr32/kernel/vmlinux.lds.S +++ b/arch/avr32/kernel/vmlinux.lds.S @@ -52,6 +52,7 @@ SECTIONS KPROBES_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index d920b959ff3a..68069a120055 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -33,6 +33,7 @@ SECTIONS #ifndef CONFIG_SCHEDULE_L1 SCHED_TEXT #endif + CPUIDLE_TEXT LOCK_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT diff --git a/arch/c6x/kernel/vmlinux.lds.S b/arch/c6x/kernel/vmlinux.lds.S index 50bc10f97bcb..a1a5c166bc9b 100644 --- a/arch/c6x/kernel/vmlinux.lds.S +++ b/arch/c6x/kernel/vmlinux.lds.S @@ -70,6 +70,7 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S index 7552c2557506..979586261520 100644 --- a/arch/cris/kernel/vmlinux.lds.S +++ b/arch/cris/kernel/vmlinux.lds.S @@ -43,6 +43,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.text.__*) diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index 7e958d829ec9..aa6e573d57da 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -63,6 +63,7 @@ SECTIONS *(.text..tlbmiss) TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT #ifdef CONFIG_DEBUG_INFO INIT_TEXT diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S index cb5dfb02c88d..7f11da1b895e 100644 --- a/arch/h8300/kernel/vmlinux.lds.S +++ b/arch/h8300/kernel/vmlinux.lds.S @@ -29,6 +29,7 @@ SECTIONS _stext = . ; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT #if defined(CONFIG_ROMKERNEL) *(.int_redirect) diff --git a/arch/hexagon/kernel/vmlinux.lds.S b/arch/hexagon/kernel/vmlinux.lds.S index 5f268c1071b3..ec87e67feb19 100644 --- a/arch/hexagon/kernel/vmlinux.lds.S +++ b/arch/hexagon/kernel/vmlinux.lds.S @@ -50,6 +50,7 @@ SECTIONS _text = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT *(.fixup) diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index dc506b05ffbd..f89d20c97412 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -46,6 +46,7 @@ SECTIONS { __end_ivt_text = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT *(.gnu.linkonce.t*) diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index 018e4a711d79..ad1fe56455aa 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -31,6 +31,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) diff --git a/arch/m68k/kernel/vmlinux-nommu.lds b/arch/m68k/kernel/vmlinux-nommu.lds index 06a763f49fd3..d2c8abf1c8c4 100644 --- a/arch/m68k/kernel/vmlinux-nommu.lds +++ b/arch/m68k/kernel/vmlinux-nommu.lds @@ -45,6 +45,7 @@ SECTIONS { HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) . = ALIGN(16); diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds index d0993594f558..5b5ce1e4d1ed 100644 --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -16,6 +16,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds index 8080469ee6c1..fe5ea1974b16 100644 --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -16,6 +16,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.gnu.warning) diff --git a/arch/metag/kernel/vmlinux.lds.S b/arch/metag/kernel/vmlinux.lds.S index 150ace92c7ad..e6c700eaf207 100644 --- a/arch/metag/kernel/vmlinux.lds.S +++ b/arch/metag/kernel/vmlinux.lds.S @@ -21,6 +21,7 @@ SECTIONS .text : { TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/microblaze/kernel/vmlinux.lds.S b/arch/microblaze/kernel/vmlinux.lds.S index 0a47f0410554..289d0e7f3e3a 100644 --- a/arch/microblaze/kernel/vmlinux.lds.S +++ b/arch/microblaze/kernel/vmlinux.lds.S @@ -33,6 +33,7 @@ SECTIONS { EXIT_TEXT EXIT_CALL SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index a82c178d0bb9..d5de67591735 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -55,6 +55,7 @@ SECTIONS .text : { TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S index 13c4814c29f8..2d5f1c3f1afb 100644 --- a/arch/mn10300/kernel/vmlinux.lds.S +++ b/arch/mn10300/kernel/vmlinux.lds.S @@ -30,6 +30,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT *(.fixup) diff --git a/arch/nios2/kernel/vmlinux.lds.S b/arch/nios2/kernel/vmlinux.lds.S index e23e89539967..6a8045bb1a77 100644 --- a/arch/nios2/kernel/vmlinux.lds.S +++ b/arch/nios2/kernel/vmlinux.lds.S @@ -37,6 +37,7 @@ SECTIONS .text : { TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT diff --git a/arch/openrisc/kernel/vmlinux.lds.S b/arch/openrisc/kernel/vmlinux.lds.S index d936de4c07ca..d68b9ede8423 100644 --- a/arch/openrisc/kernel/vmlinux.lds.S +++ b/arch/openrisc/kernel/vmlinux.lds.S @@ -47,6 +47,7 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index f3ead0b6ce46..9ec8ec075dae 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -69,6 +69,7 @@ SECTIONS .text ALIGN(PAGE_SIZE) : { TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index b5fba689fca6..7ed59f0d947f 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -52,6 +52,7 @@ SECTIONS /* careful! __ftr_alt_* sections need to be close to .text */ *(.text .fixup __ftr_alt_* .ref.text) SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 429bfd111961..000e6e91f6a0 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -35,6 +35,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/score/kernel/vmlinux.lds.S b/arch/score/kernel/vmlinux.lds.S index 7274b5c4287e..4117890b1db1 100644 --- a/arch/score/kernel/vmlinux.lds.S +++ b/arch/score/kernel/vmlinux.lds.S @@ -40,6 +40,7 @@ SECTIONS _text = .; /* Text and read-only data */ TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT *(.text.*) diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 235a4101999f..5b9a3cc90c58 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -36,6 +36,7 @@ SECTIONS TEXT_TEXT EXTRA_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index d79b3b734245..572db686f845 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -49,6 +49,7 @@ SECTIONS HEAD_TEXT TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/tile/kernel/entry.S b/arch/tile/kernel/entry.S index 670a3569450f..101de132e363 100644 --- a/arch/tile/kernel/entry.S +++ b/arch/tile/kernel/entry.S @@ -50,7 +50,7 @@ STD_ENTRY(smp_nap) * When interrupted at _cpu_idle_nap, we bump the PC forward 8, and * as a result return to the function that called _cpu_idle(). */ -STD_ENTRY(_cpu_idle) +STD_ENTRY_SECTION(_cpu_idle, .cpuidle.text) movei r1, 1 IRQ_ENABLE_LOAD(r2, r3) mtspr INTERRUPT_CRITICAL_SECTION, r1 diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S index 9d449caf8910..e1baf094fba4 100644 --- a/arch/tile/kernel/vmlinux.lds.S +++ b/arch/tile/kernel/vmlinux.lds.S @@ -42,6 +42,7 @@ SECTIONS .text : AT (ADDR(.text) - LOAD_OFFSET) { HEAD_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT diff --git a/arch/um/kernel/dyn.lds.S b/arch/um/kernel/dyn.lds.S index adde088aeeff..4fdbcf958cd5 100644 --- a/arch/um/kernel/dyn.lds.S +++ b/arch/um/kernel/dyn.lds.S @@ -68,6 +68,7 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) *(.stub .text.* .gnu.linkonce.t.*) diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S index 6899195602b7..1840f55ed042 100644 --- a/arch/um/kernel/uml.lds.S +++ b/arch/um/kernel/uml.lds.S @@ -28,6 +28,7 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) /* .gnu.warning sections are handled specially by elf32.em. */ diff --git a/arch/unicore32/kernel/vmlinux.lds.S b/arch/unicore32/kernel/vmlinux.lds.S index 77e407e49a63..56e788e8ee83 100644 --- a/arch/unicore32/kernel/vmlinux.lds.S +++ b/arch/unicore32/kernel/vmlinux.lds.S @@ -37,6 +37,7 @@ SECTIONS .text : { /* Real text segment */ TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT *(.fixup) diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index b77f5edb03b0..ac7692dcfa2e 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -4,6 +4,10 @@ #include #ifndef __ASSEMBLY__ + +/* Provide __cpuidle; we can't safely include */ +#define __cpuidle __attribute__((__section__(".cpuidle.text"))) + /* * Interrupt control: */ @@ -44,12 +48,12 @@ static inline void native_irq_enable(void) asm volatile("sti": : :"memory"); } -static inline void native_safe_halt(void) +static inline __cpuidle void native_safe_halt(void) { asm volatile("sti; hlt": : :"memory"); } -static inline void native_halt(void) +static inline __cpuidle void native_halt(void) { asm volatile("hlt": : :"memory"); } @@ -86,7 +90,7 @@ static inline notrace void arch_local_irq_enable(void) * Used in the idle loop; sti takes one instruction cycle * to complete: */ -static inline void arch_safe_halt(void) +static inline __cpuidle void arch_safe_halt(void) { native_safe_halt(); } @@ -95,7 +99,7 @@ static inline void arch_safe_halt(void) * Used when interrupts are already enabled or to * shutdown the processor: */ -static inline void halt(void) +static inline __cpuidle void halt(void) { native_halt(); } diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index bdfad642123f..af15f4444330 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -152,7 +152,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); -void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) +void __cpuidle acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) { unsigned int cpu = smp_processor_id(); struct cstate_entry *percpu_entry; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4002b475171c..28cea7802ecb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -302,7 +302,7 @@ void arch_cpu_idle(void) /* * We use this if we don't have any better idle routine.. */ -void default_idle(void) +void __cpuidle default_idle(void) { trace_cpu_idle_rcuidle(1, smp_processor_id()); safe_halt(); @@ -417,7 +417,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) * with interrupts enabled and no flags, which is backwards compatible with the * original MWAIT implementation. */ -static void mwait_idle(void) +static __cpuidle void mwait_idle(void) { if (!current_set_polling_and_test()) { trace_cpu_idle_rcuidle(1, smp_processor_id()); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 9297a002d8e5..dbf67f64d5ec 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -97,6 +97,7 @@ SECTIONS _stext = .; TEXT_TEXT SCHED_TEXT + CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT ENTRY_TEXT diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index 72cfe3587dd8..31411fc82662 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -89,6 +89,9 @@ SECTIONS VMLINUX_SYMBOL(__sched_text_start) = .; *(.sched.literal .sched.text) VMLINUX_SYMBOL(__sched_text_end) = .; + VMLINUX_SYMBOL(__cpuidle_text_start) = .; + *(.cpuidle.literal .cpuidle.text) + VMLINUX_SYMBOL(__cpuidle_text_end) = .; VMLINUX_SYMBOL(__lock_text_start) = .; *(.spinlock.literal .spinlock.text) VMLINUX_SYMBOL(__lock_text_end) = .; diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index cea52528aa18..2237d3f24f0e 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -31,6 +31,7 @@ #include /* need_resched() */ #include #include +#include #include /* @@ -115,7 +116,7 @@ static const struct dmi_system_id processor_power_dmi_table[] = { * Callers should disable interrupts before the call and enable * interrupts after return. */ -static void acpi_safe_halt(void) +static void __cpuidle acpi_safe_halt(void) { if (!tif_need_resched()) { safe_halt(); @@ -645,7 +646,7 @@ static int acpi_idle_bm_check(void) * * Caller disables interrupt before call and enables interrupt after return. */ -static void acpi_idle_do_entry(struct acpi_processor_cx *cx) +static void __cpuidle acpi_idle_do_entry(struct acpi_processor_cx *cx) { if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 389ade4572be..ab264d393233 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "cpuidle.h" @@ -178,8 +179,8 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) } #ifdef CONFIG_ARCH_HAS_CPU_RELAX -static int poll_idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index) +static int __cpuidle poll_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) { local_irq_enable(); if (!current_set_polling_and_test()) { diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 67ec58f9ef99..4466a2f969d7 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -863,8 +863,8 @@ static struct cpuidle_state dnv_cstates[] = { * * Must be called under local_irq_disable(). */ -static int intel_idle(struct cpuidle_device *dev, - struct cpuidle_driver *drv, int index) +static __cpuidle int intel_idle(struct cpuidle_device *dev, + struct cpuidle_driver *drv, int index) { unsigned long ecx = 1; /* break on interrupt flag */ struct cpuidle_state *state = &drv->states[index]; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 24563970ff7b..3e42bcdd014b 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -454,6 +454,12 @@ *(.spinlock.text) \ VMLINUX_SYMBOL(__lock_text_end) = .; +#define CPUIDLE_TEXT \ + ALIGN_FUNCTION(); \ + VMLINUX_SYMBOL(__cpuidle_text_start) = .; \ + *(.cpuidle.text) \ + VMLINUX_SYMBOL(__cpuidle_text_end) = .; + #define KPROBES_TEXT \ ALIGN_FUNCTION(); \ VMLINUX_SYMBOL(__kprobes_text_start) = .; \ diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 7572d9e9dced..b886dc17f2f3 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -231,6 +231,11 @@ void cpu_startup_entry(enum cpuhp_state state); void cpu_idle_poll_ctrl(bool enable); +/* Attach to any functions which should be considered cpuidle. */ +#define __cpuidle __attribute__((__section__(".cpuidle.text"))) + +bool cpu_in_idle(unsigned long pc); + void arch_cpu_idle(void); void arch_cpu_idle_prepare(void); void arch_cpu_idle_enter(void); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 9fb873cfc75c..1d8718d5300d 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -16,6 +16,9 @@ #include "sched.h" +/* Linker adds these: start and end of __cpuidle functions */ +extern char __cpuidle_text_start[], __cpuidle_text_end[]; + /** * sched_idle_set_state - Record idle state for the current CPU. * @idle_state: State to record. @@ -53,7 +56,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused) __setup("hlt", cpu_idle_nopoll_setup); #endif -static inline int cpu_idle_poll(void) +static noinline int __cpuidle cpu_idle_poll(void) { rcu_idle_enter(); trace_cpu_idle_rcuidle(0, smp_processor_id()); @@ -84,7 +87,7 @@ void __weak arch_cpu_idle(void) * * To use when the cpuidle framework cannot be used. */ -void default_idle_call(void) +void __cpuidle default_idle_call(void) { if (current_clr_polling_and_test()) { local_irq_enable(); @@ -271,6 +274,12 @@ static void cpu_idle_loop(void) } } +bool cpu_in_idle(unsigned long pc) +{ + return pc >= (unsigned long)__cpuidle_text_start && + pc < (unsigned long)__cpuidle_text_end; +} + void cpu_startup_entry(enum cpuhp_state state) { /* diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 393a3cca1f47..75554754eadf 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef arch_trigger_cpumask_backtrace /* For reliability, we're prepared to waste bits here. */ @@ -87,11 +88,16 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) int cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { - pr_warn("NMI backtrace for cpu %d\n", cpu); - if (regs) - show_regs(regs); - else - dump_stack(); + if (regs && cpu_in_idle(instruction_pointer(regs))) { + pr_warn("NMI backtrace for cpu %d skipped: idling at pc %#lx\n", + cpu, instruction_pointer(regs)); + } else { + pr_warn("NMI backtrace for cpu %d\n", cpu); + if (regs) + show_regs(regs); + else + dump_stack(); + } cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); return true; } diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 48958d3cec9e..bd8349759095 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -888,7 +888,7 @@ static void check_section(const char *modname, struct elf_info *elf, #define DATA_SECTIONS ".data", ".data.rel" #define TEXT_SECTIONS ".text", ".text.unlikely", ".sched.text", \ - ".kprobes.text" + ".kprobes.text", ".cpuidle.text" #define OTHER_TEXT_SECTIONS ".ref.text", ".head.text", ".spinlock.text", \ ".fixup", ".entry.text", ".exception.text", ".text.*", \ ".coldtext" diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index a68f03133df9..5423a58d1b06 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -365,6 +365,7 @@ is_mcounted_section_name(char const *const txtname) strcmp(".irqentry.text", txtname) == 0 || strcmp(".softirqentry.text", txtname) == 0 || strcmp(".kprobes.text", txtname) == 0 || + strcmp(".cpuidle.text", txtname) == 0 || strcmp(".text.unlikely", txtname) == 0; } diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 2d48011bc362..faac4b10d8ea 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -136,6 +136,7 @@ my %text_sections = ( ".irqentry.text" => 1, ".softirqentry.text" => 1, ".kprobes.text" => 1, + ".cpuidle.text" => 1, ".text.unlikely" => 1, ); -- cgit v1.2.3-59-g8ed1b From 81243eacfa400f5f7b89f4c2323d0de9982bb0fb Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 7 Oct 2016 17:03:12 -0700 Subject: cred: simpler, 1D supplementary groups Current supplementary groups code can massively overallocate memory and is implemented in a way so that access to individual gid is done via 2D array. If number of gids is <= 32, memory allocation is more or less tolerable (140/148 bytes). But if it is not, code allocates full page (!) regardless and, what's even more fun, doesn't reuse small 32-entry array. 2D array means dependent shifts, loads and LEAs without possibility to optimize them (gid is never known at compile time). All of the above is unnecessary. Switch to the usual trailing-zero-len-array scheme. Memory is allocated with kmalloc/vmalloc() and only as much as needed. Accesses become simpler (LEA 8(gi,idx,4) or even without displacement). Maximum number of gids is 65536 which translates to 256KB+8 bytes. I think kernel can handle such allocation. On my usual desktop system with whole 9 (nine) aux groups, struct group_info shrinks from 148 bytes to 44 bytes, yay! Nice side effects: - "gi->gid[i]" is shorter than "GROUP_AT(gi, i)", less typing, - fix little mess in net/ipv4/ping.c should have been using GROUP_AT macro but this point becomes moot, - aux group allocation is persistent and should be accounted as such. Link: http://lkml.kernel.org/r/20160817201927.GA2096@p183.telecom.by Signed-off-by: Alexey Dobriyan Cc: Vasily Kulikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/compat_linux.c | 4 +- drivers/staging/lustre/lustre/ptlrpc/sec.c | 2 +- fs/nfsd/auth.c | 6 +-- fs/nfsd/nfs4state.c | 2 +- fs/proc/array.c | 2 +- include/linux/cred.h | 11 +---- kernel/groups.c | 67 ++++++++++-------------------- kernel/uid16.c | 4 +- net/ipv4/ping.c | 15 +++---- net/sunrpc/auth_generic.c | 4 +- net/sunrpc/auth_gss/gss_rpc_xdr.c | 2 +- net/sunrpc/auth_gss/svcauth_gss.c | 2 +- net/sunrpc/auth_unix.c | 4 +- net/sunrpc/svcauth_unix.c | 6 +-- 14 files changed, 46 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 437e61159279..0f9cd90c11af 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -189,7 +189,7 @@ static int groups16_to_user(u16 __user *grouplist, struct group_info *group_info kgid_t kgid; for (i = 0; i < group_info->ngroups; i++) { - kgid = GROUP_AT(group_info, i); + kgid = group_info->gid[i]; group = (u16)from_kgid_munged(user_ns, kgid); if (put_user(group, grouplist+i)) return -EFAULT; @@ -213,7 +213,7 @@ static int groups16_from_user(struct group_info *group_info, u16 __user *groupli if (!gid_valid(kgid)) return -EINVAL; - GROUP_AT(group_info, i) = kgid; + group_info->gid[i] = kgid; } return 0; diff --git a/drivers/staging/lustre/lustre/ptlrpc/sec.c b/drivers/staging/lustre/lustre/ptlrpc/sec.c index 5d3995d5c69a..a7416cd9ac71 100644 --- a/drivers/staging/lustre/lustre/ptlrpc/sec.c +++ b/drivers/staging/lustre/lustre/ptlrpc/sec.c @@ -2220,7 +2220,7 @@ int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset) task_lock(current); if (pud->pud_ngroups > current_ngroups) pud->pud_ngroups = current_ngroups; - memcpy(pud->pud_groups, current_cred()->group_info->blocks[0], + memcpy(pud->pud_groups, current_cred()->group_info->gid, pud->pud_ngroups * sizeof(__u32)); task_unlock(current); diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 9d46a0bdd9f9..62469c60be23 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -55,10 +55,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) goto oom; for (i = 0; i < rqgi->ngroups; i++) { - if (gid_eq(GLOBAL_ROOT_GID, GROUP_AT(rqgi, i))) - GROUP_AT(gi, i) = exp->ex_anon_gid; + if (gid_eq(GLOBAL_ROOT_GID, rqgi->gid[i])) + gi->gid[i] = exp->ex_anon_gid; else - GROUP_AT(gi, i) = GROUP_AT(rqgi, i); + gi->gid[i] = rqgi->gid[i]; } } else { gi = get_group_info(rqgi); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a204d7e109d4..39bfaba9c99c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1903,7 +1903,7 @@ static bool groups_equal(struct group_info *g1, struct group_info *g2) if (g1->ngroups != g2->ngroups) return false; for (i=0; ingroups; i++) - if (!gid_eq(GROUP_AT(g1, i), GROUP_AT(g2, i))) + if (!gid_eq(g1->gid[i], g2->gid[i])) return false; return true; } diff --git a/fs/proc/array.c b/fs/proc/array.c index d25b44601b30..89600fd5963d 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -207,7 +207,7 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, group_info = cred->group_info; for (g = 0; g < group_info->ngroups; g++) seq_put_decimal_ull(m, g ? " " : "", - from_kgid_munged(user_ns, GROUP_AT(group_info, g))); + from_kgid_munged(user_ns, group_info->gid[g])); put_cred(cred); /* Trailing space shouldn't have been added in the first place. */ seq_putc(m, ' '); diff --git a/include/linux/cred.h b/include/linux/cred.h index 257db64562e5..f0e70a1bb3ac 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -26,15 +26,10 @@ struct inode; /* * COW Supplementary groups list */ -#define NGROUPS_SMALL 32 -#define NGROUPS_PER_BLOCK ((unsigned int)(PAGE_SIZE / sizeof(kgid_t))) - struct group_info { atomic_t usage; int ngroups; - int nblocks; - kgid_t small_block[NGROUPS_SMALL]; - kgid_t *blocks[0]; + kgid_t gid[0]; }; /** @@ -88,10 +83,6 @@ extern void set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); extern bool may_setgroups(void); -/* access the groups "array" with this macro */ -#define GROUP_AT(gi, i) \ - ((gi)->blocks[(i) / NGROUPS_PER_BLOCK][(i) % NGROUPS_PER_BLOCK]) - /* * The security context of a task * diff --git a/kernel/groups.c b/kernel/groups.c index 74d431d25251..2fcadd66a8fd 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -7,55 +7,31 @@ #include #include #include +#include #include struct group_info *groups_alloc(int gidsetsize) { - struct group_info *group_info; - int nblocks; - int i; - - nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; - /* Make sure we always allocate at least one indirect block pointer */ - nblocks = nblocks ? : 1; - group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); - if (!group_info) + struct group_info *gi; + unsigned int len; + + len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; + gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); + if (!gi) + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); + if (!gi) return NULL; - group_info->ngroups = gidsetsize; - group_info->nblocks = nblocks; - atomic_set(&group_info->usage, 1); - - if (gidsetsize <= NGROUPS_SMALL) - group_info->blocks[0] = group_info->small_block; - else { - for (i = 0; i < nblocks; i++) { - kgid_t *b; - b = (void *)__get_free_page(GFP_USER); - if (!b) - goto out_undo_partial_alloc; - group_info->blocks[i] = b; - } - } - return group_info; -out_undo_partial_alloc: - while (--i >= 0) { - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); - return NULL; + atomic_set(&gi->usage, 1); + gi->ngroups = gidsetsize; + return gi; } EXPORT_SYMBOL(groups_alloc); void groups_free(struct group_info *group_info) { - if (group_info->blocks[0] != group_info->small_block) { - int i; - for (i = 0; i < group_info->nblocks; i++) - free_page((unsigned long)group_info->blocks[i]); - } - kfree(group_info); + kvfree(group_info); } EXPORT_SYMBOL(groups_free); @@ -70,7 +46,7 @@ static int groups_to_user(gid_t __user *grouplist, for (i = 0; i < count; i++) { gid_t gid; - gid = from_kgid_munged(user_ns, GROUP_AT(group_info, i)); + gid = from_kgid_munged(user_ns, group_info->gid[i]); if (put_user(gid, grouplist+i)) return -EFAULT; } @@ -95,7 +71,7 @@ static int groups_from_user(struct group_info *group_info, if (!gid_valid(kgid)) return -EINVAL; - GROUP_AT(group_info, i) = kgid; + group_info->gid[i] = kgid; } return 0; } @@ -115,15 +91,14 @@ static void groups_sort(struct group_info *group_info) for (base = 0; base < max; base++) { int left = base; int right = left + stride; - kgid_t tmp = GROUP_AT(group_info, right); + kgid_t tmp = group_info->gid[right]; - while (left >= 0 && gid_gt(GROUP_AT(group_info, left), tmp)) { - GROUP_AT(group_info, right) = - GROUP_AT(group_info, left); + while (left >= 0 && gid_gt(group_info->gid[left], tmp)) { + group_info->gid[right] = group_info->gid[left]; right = left; left -= stride; } - GROUP_AT(group_info, right) = tmp; + group_info->gid[right] = tmp; } stride /= 3; } @@ -141,9 +116,9 @@ int groups_search(const struct group_info *group_info, kgid_t grp) right = group_info->ngroups; while (left < right) { unsigned int mid = (left+right)/2; - if (gid_gt(grp, GROUP_AT(group_info, mid))) + if (gid_gt(grp, group_info->gid[mid])) left = mid + 1; - else if (gid_lt(grp, GROUP_AT(group_info, mid))) + else if (gid_lt(grp, group_info->gid[mid])) right = mid; else return 1; diff --git a/kernel/uid16.c b/kernel/uid16.c index d58cc4d8f0d1..cc40793464e3 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -117,7 +117,7 @@ static int groups16_to_user(old_gid_t __user *grouplist, kgid_t kgid; for (i = 0; i < group_info->ngroups; i++) { - kgid = GROUP_AT(group_info, i); + kgid = group_info->gid[i]; group = high2lowgid(from_kgid_munged(user_ns, kgid)); if (put_user(group, grouplist+i)) return -EFAULT; @@ -142,7 +142,7 @@ static int groups16_from_user(struct group_info *group_info, if (!gid_valid(kgid)) return -EINVAL; - GROUP_AT(group_info, i) = kgid; + group_info->gid[i] = kgid; } return 0; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 66ddcb60519a..7cf7d6e380c2 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -258,7 +258,7 @@ int ping_init_sock(struct sock *sk) struct net *net = sock_net(sk); kgid_t group = current_egid(); struct group_info *group_info; - int i, j, count; + int i; kgid_t low, high; int ret = 0; @@ -270,16 +270,11 @@ int ping_init_sock(struct sock *sk) return 0; group_info = get_current_groups(); - count = group_info->ngroups; - for (i = 0; i < group_info->nblocks; i++) { - int cp_count = min_t(int, NGROUPS_PER_BLOCK, count); - for (j = 0; j < cp_count; j++) { - kgid_t gid = group_info->blocks[i][j]; - if (gid_lte(low, gid) && gid_lte(gid, high)) - goto out_release_group; - } + for (i = 0; i < group_info->ngroups; i++) { + kgid_t gid = group_info->gid[i]; - count -= cp_count; + if (gid_lte(low, gid) && gid_lte(gid, high)) + goto out_release_group; } ret = -EACCES; diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 168219535a34..83dffeadf20a 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -176,8 +176,8 @@ generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) if (gcred->acred.group_info->ngroups != acred->group_info->ngroups) goto out_nomatch; for (i = 0; i < gcred->acred.group_info->ngroups; i++) { - if (!gid_eq(GROUP_AT(gcred->acred.group_info, i), - GROUP_AT(acred->group_info, i))) + if (!gid_eq(gcred->acred.group_info->gid[i], + acred->group_info->gid[i])) goto out_nomatch; } out_match: diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index eeeba5adee6d..dc6fb79a361f 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -229,7 +229,7 @@ static int gssx_dec_linux_creds(struct xdr_stream *xdr, kgid = make_kgid(&init_user_ns, tmp); if (!gid_valid(kgid)) goto out_free_groups; - GROUP_AT(creds->cr_group_info, i) = kgid; + creds->cr_group_info->gid[i] = kgid; } return 0; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index d8582028b346..d67f7e1bc82d 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -479,7 +479,7 @@ static int rsc_parse(struct cache_detail *cd, kgid = make_kgid(&init_user_ns, id); if (!gid_valid(kgid)) goto out; - GROUP_AT(rsci.cred.cr_group_info, i) = kgid; + rsci.cred.cr_group_info->gid[i] = kgid; } /* mech name */ diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index a99278c984e8..a1d768a973f5 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -79,7 +79,7 @@ unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags, gfp_t cred->uc_gid = acred->gid; for (i = 0; i < groups; i++) - cred->uc_gids[i] = GROUP_AT(acred->group_info, i); + cred->uc_gids[i] = acred->group_info->gid[i]; if (i < NFS_NGROUPS) cred->uc_gids[i] = INVALID_GID; @@ -127,7 +127,7 @@ unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int flags) if (groups > NFS_NGROUPS) groups = NFS_NGROUPS; for (i = 0; i < groups ; i++) - if (!gid_eq(cred->uc_gids[i], GROUP_AT(acred->group_info, i))) + if (!gid_eq(cred->uc_gids[i], acred->group_info->gid[i])) return 0; if (groups < NFS_NGROUPS && gid_valid(cred->uc_gids[groups])) return 0; diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index dfacdc95b3f5..64af4f034de6 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -517,7 +517,7 @@ static int unix_gid_parse(struct cache_detail *cd, kgid = make_kgid(&init_user_ns, gid); if (!gid_valid(kgid)) goto out; - GROUP_AT(ug.gi, i) = kgid; + ug.gi->gid[i] = kgid; } ugp = unix_gid_lookup(cd, uid); @@ -564,7 +564,7 @@ static int unix_gid_show(struct seq_file *m, seq_printf(m, "%u %d:", from_kuid_munged(user_ns, ug->uid), glen); for (i = 0; i < glen; i++) - seq_printf(m, " %d", from_kgid_munged(user_ns, GROUP_AT(ug->gi, i))); + seq_printf(m, " %d", from_kgid_munged(user_ns, ug->gi->gid[i])); seq_printf(m, "\n"); return 0; } @@ -817,7 +817,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp) return SVC_CLOSE; for (i = 0; i < slen; i++) { kgid_t kgid = make_kgid(&init_user_ns, svc_getnl(argv)); - GROUP_AT(cred->cr_group_info, i) = kgid; + cred->cr_group_info->gid[i] = kgid; } if (svc_getu32(argv) != htonl(RPC_AUTH_NULL) || svc_getu32(argv) != 0) { *authp = rpc_autherr_badverf; -- cgit v1.2.3-59-g8ed1b From 05fd007e46296afb24d15c7d589d535e5a5b9d5c Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 7 Oct 2016 17:03:15 -0700 Subject: console: don't prefer first registered if DT specifies stdout-path If a device tree specifies a preferred device for kernel console output via the stdout-path or linux,stdout-path chosen node properties or the stdout alias then the kernel ought to honor it & output the kernel console to that device. As it stands, this isn't the case. Whilst we parse the stdout-path properties & set an of_stdout variable from of_alias_scan(), and use that from of_console_check() to determine whether to add a console device as a preferred console whilst registering it, we also prefer the first registered console if no other has been selected at the time of its registration. This means that if a console other than the one the device tree selects via stdout-path is registered first, we will switch to using it & when the stdout-path console is later registered the call to add_preferred_console() via of_console_check() is too late to do anything useful. In practice this seems to mean that we switch to the dummy console device fairly early & see no further console output: Console: colour dummy device 80x25 console [tty0] enabled bootconsole [ns16550a0] disabled Fix this by not automatically preferring the first registered console if one is specified by the device tree. This allows consoles to be registered but not enabled, and once the driver for the console selected by stdout-path calls of_console_check() the driver will be added to the list of preferred consoles before any other console has been enabled. When that console is then registered via register_console() it will be enabled as expected. Link: http://lkml.kernel.org/r/20160809151937.26118-1-paul.burton@imgtec.com Signed-off-by: Paul Burton Cc: Ralf Baechle Cc: Paul Burton Cc: Tejun Heo Cc: Sergey Senozhatsky Cc: Jiri Slaby Cc: Daniel Vetter Cc: Ivan Delalande Cc: Thierry Reding Cc: Borislav Petkov Cc: Jan Kara Cc: Petr Mladek Cc: Joe Perches Cc: Greg Kroah-Hartman Cc: Rob Herring Cc: Frank Rowand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/of/base.c | 2 ++ include/linux/console.h | 6 ++++++ kernel/printk/printk.c | 13 ++++++++++++- 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/drivers/of/base.c b/drivers/of/base.c index a0bccb54a9bd..d687e6de24a0 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -2077,6 +2077,8 @@ void of_alias_scan(void * (*dt_alloc)(u64 size, u64 align)) name = of_get_property(of_aliases, "stdout", NULL); if (name) of_stdout = of_find_node_opts_by_path(name, &of_stdout_options); + if (of_stdout) + console_set_by_of(); } if (!of_aliases) diff --git a/include/linux/console.h b/include/linux/console.h index d530c4627e54..3672809234a7 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -173,6 +173,12 @@ static inline void console_sysfs_notify(void) #endif extern bool console_suspend_enabled; +#ifdef CONFIG_OF +extern void console_set_by_of(void); +#else +static inline void console_set_by_of(void) {} +#endif + /* Suspend and resume console messages over PM events */ extern void suspend_console(void); extern void resume_console(void); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index eea6dbc2d8cf..8019cc0d3a73 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -253,6 +253,17 @@ static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); +#ifdef CONFIG_OF +static bool of_specified_console; + +void console_set_by_of(void) +{ + of_specified_console = true; +} +#else +# define of_specified_console false +#endif + /* Flag: console code may call schedule() */ static int console_may_schedule; @@ -2647,7 +2658,7 @@ void register_console(struct console *newcon) * didn't select a console we take the first one * that registers here. */ - if (preferred_console < 0) { + if (preferred_console < 0 && !of_specified_console) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || -- cgit v1.2.3-59-g8ed1b