From 0633da48f0793aeba27f82d30605624416723a91 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 21 Aug 2018 21:51:45 -0700 Subject: autofs: fix autofs_sbi() does not check super block type autofs_sbi() does not check the superblock magic number to verify it has been given an autofs super block. Link: http://lkml.kernel.org/r/153475422934.17131.7563724552005298277.stgit@pluto.themaw.net Reported-by: Signed-off-by: Ian Kent Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/autofs_i.h | 4 +++- fs/autofs/inode.c | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 9400a9f6318a..5057b9f0f846 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -26,6 +26,7 @@ #include #include #include +#include <linux/magic.h> /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY @@ -124,7 +125,8 @@ struct autofs_sb_info { static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb) { - return (struct autofs_sb_info *)(sb->s_fs_info); + return sb->s_magic != AUTOFS_SUPER_MAGIC ? + NULL : (struct autofs_sb_info *)(sb->s_fs_info); } static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry) diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index b51980fc274e..846c052569dd 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -10,7 +10,6 @@ #include #include #include -#include <linux/magic.h> #include "autofs_i.h" -- cgit v1.2.3-59-g8ed1b From 871305bb20280804882bd08b39a38ccf1b4b68f9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Aug 2018 21:52:48 -0700 Subject: mm: /proc/pid/*maps remove is_pid and related wrappers Patch series "cleanups and refactor of /proc/pid/smaps*". The recent regression in /proc/pid/smaps made me look more into the code. Especially the issues with smaps_rollup reported in [1] as explained in Patch 4, which fixes them by refactoring the code. Patches 2 and 3 are preparations for that. Patch 1 is me realizing that there's a lot of boilerplate left from times where we tried (unsuccessfully) to mark thread stacks in the output. Originally I had also plans to rework the translation from /proc/pid/*maps* file offsets to the internal structures. Now the offset means "vma number", which is not really stable (vma's can come and go between read() calls) and there's an extra caching of last vma's address. My idea was that offsets would be interpreted directly as addresses, which would also allow meaningful seeks (see the ugly seek_to_smaps_entry() in tools/testing/selftests/vm/mlock2.h). However loff_t is (signed) long long so that might be insufficient somewhere for the unsigned long addresses. So the result is fixed issues with skewed /proc/pid/smaps_rollup results, simpler smaps code, and a lot of unused code removed. [1] https://marc.info/?l=linux-mm&m=151927723128134&w=2 This patch (of 4): Commit b76437579d13 ("procfs: mark thread stack correctly in proc/<pid>/maps") introduced differences between /proc/PID/maps and /proc/PID/task/TID/maps to mark thread stacks properly, and this was also done for smaps and numa_maps. However it didn't work properly and was ultimately removed by commit b18cb64ead40 ("fs/proc: Stop trying to report thread stacks"). Now the is_pid parameter for the related show_*() functions is unused and we can remove it together with wrapper functions and ops structures that differ for PID and TID cases only in this parameter.
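In miniature, the pattern being removed looks like this (a simplified userspace sketch with hypothetical names, not the kernel code itself):

#include <stdio.h>

/* Before: one worker parameterized on a flag, plus per-flag wrappers. */
static int show_map(const char *line, int is_pid)
{
	(void)is_pid;	/* the flag no longer influences the output */
	return printf("%s\n", line);
}

static int show_pid_map(const char *line) { return show_map(line, 1); }
static int show_tid_map(const char *line) { return show_map(line, 0); }

/* After: the wrappers and the dead parameter disappear, and the PID and
 * TID entries can share a single callback and a single ops structure. */
static int show_map_common(const char *line)
{
	return printf("%s\n", line);
}

int main(void)
{
	show_pid_map("00400000-0040b000 r-xp 00000000 08:01 1234 /bin/true");
	show_tid_map("00400000-0040b000 r-xp 00000000 08:01 1234 /bin/true");
	show_map_common("00400000-0040b000 r-xp 00000000 08:01 1234 /bin/true");
	return 0;
}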
Link: http://lkml.kernel.org/r/20180723111933.15443-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Alexey Dobriyan Cc: Daniel Colascione Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 6 +-- fs/proc/internal.h | 3 -- fs/proc/task_mmu.c | 114 +++++---------------------------------------------- fs/proc/task_nommu.c | 39 ++---------------- 4 files changed, 18 insertions(+), 144 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index aaffc0c30216..ad047977ed04 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3309,12 +3309,12 @@ static const struct pid_entry tid_base_stuff[] = { REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), - REG("maps", S_IRUGO, proc_tid_maps_operations), + REG("maps", S_IRUGO, proc_pid_maps_operations), #ifdef CONFIG_PROC_CHILDREN REG("children", S_IRUGO, proc_tid_children_operations), #endif #ifdef CONFIG_NUMA - REG("numa_maps", S_IRUGO, proc_tid_numa_maps_operations), + REG("numa_maps", S_IRUGO, proc_pid_numa_maps_operations), #endif REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), LNK("cwd", proc_cwd_link), @@ -3324,7 +3324,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("mountinfo", S_IRUGO, proc_mountinfo_operations), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), - REG("smaps", S_IRUGO, proc_tid_smaps_operations), + REG("smaps", S_IRUGO, proc_pid_smaps_operations), REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif diff --git a/fs/proc/internal.h b/fs/proc/internal.h index da3dbfa09e79..0c538769512a 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -297,12 +297,9 @@ struct proc_maps_private { struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); extern const struct file_operations proc_pid_maps_operations; -extern const struct file_operations proc_tid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; -extern const struct file_operations proc_tid_numa_maps_operations; extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; -extern const struct file_operations proc_tid_smaps_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index dfd73a4616ce..a3f98ca50981 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -294,7 +294,7 @@ static void show_vma_header_prefix(struct seq_file *m, } static void -show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) +show_map_vma(struct seq_file *m, struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; @@ -357,35 +357,18 @@ done: seq_putc(m, '\n'); } -static int show_map(struct seq_file *m, void *v, int is_pid) +static int show_map(struct seq_file *m, void *v) { - show_map_vma(m, v, is_pid); + show_map_vma(m, v); m_cache_vma(m, v); return 0; } -static int show_pid_map(struct seq_file *m, void *v) -{ - return show_map(m, v, 1); -} - -static int show_tid_map(struct seq_file *m, void *v) -{ - return show_map(m, v, 0); -} - static const struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_pid_map -}; - -static const struct seq_operations proc_tid_maps_op = { - .start = 
m_start, - .next = m_next, - .stop = m_stop, - .show = show_tid_map + .show = show_map }; static int pid_maps_open(struct inode *inode, struct file *file) @@ -393,11 +376,6 @@ static int pid_maps_open(struct inode *inode, struct file *file) return do_maps_open(inode, file, &proc_pid_maps_op); } -static int tid_maps_open(struct inode *inode, struct file *file) -{ - return do_maps_open(inode, file, &proc_tid_maps_op); -} - const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, @@ -405,13 +383,6 @@ const struct file_operations proc_pid_maps_operations = { .release = proc_map_release, }; -const struct file_operations proc_tid_maps_operations = { - .open = tid_maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = proc_map_release, -}; - /* * Proportional Set Size(PSS): my share of RSS. * @@ -733,7 +704,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, #define SEQ_PUT_DEC(str, val) \ seq_put_decimal_ull_width(m, str, (val) >> 10, 8) -static int show_smap(struct seq_file *m, void *v, int is_pid) +static int show_smap(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; @@ -796,7 +767,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss->pss_locked += mss->pss; if (!rollup_mode) { - show_map_vma(m, vma, is_pid); + show_map_vma(m, vma); } else if (last_vma) { show_vma_header_prefix( m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0); @@ -845,28 +816,11 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) } #undef SEQ_PUT_DEC -static int show_pid_smap(struct seq_file *m, void *v) -{ - return show_smap(m, v, 1); -} - -static int show_tid_smap(struct seq_file *m, void *v) -{ - return show_smap(m, v, 0); -} - static const struct seq_operations proc_pid_smaps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_pid_smap -}; - -static const struct seq_operations proc_tid_smaps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_tid_smap + .show = show_smap }; static int pid_smaps_open(struct inode *inode, struct file *file) @@ -893,11 +847,6 @@ static int pid_smaps_rollup_open(struct inode *inode, struct file *file) return 0; } -static int tid_smaps_open(struct inode *inode, struct file *file) -{ - return do_maps_open(inode, file, &proc_tid_smaps_op); -} - const struct file_operations proc_pid_smaps_operations = { .open = pid_smaps_open, .read = seq_read, @@ -912,13 +861,6 @@ const struct file_operations proc_pid_smaps_rollup_operations = { .release = proc_map_release, }; -const struct file_operations proc_tid_smaps_operations = { - .open = tid_smaps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = proc_map_release, -}; - enum clear_refs_types { CLEAR_REFS_ALL = 1, CLEAR_REFS_ANON, @@ -1728,7 +1670,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, /* * Display pages allocated per node and memory policy via /proc. 
*/ -static int show_numa_map(struct seq_file *m, void *v, int is_pid) +static int show_numa_map(struct seq_file *m, void *v) { struct numa_maps_private *numa_priv = m->private; struct proc_maps_private *proc_priv = &numa_priv->proc_maps; @@ -1812,45 +1754,17 @@ out: return 0; } -static int show_pid_numa_map(struct seq_file *m, void *v) -{ - return show_numa_map(m, v, 1); -} - -static int show_tid_numa_map(struct seq_file *m, void *v) -{ - return show_numa_map(m, v, 0); -} - static const struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_pid_numa_map, + .show = show_numa_map, }; -static const struct seq_operations proc_tid_numa_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_tid_numa_map, -}; - -static int numa_maps_open(struct inode *inode, struct file *file, - const struct seq_operations *ops) -{ - return proc_maps_open(inode, file, ops, - sizeof(struct numa_maps_private)); -} - static int pid_numa_maps_open(struct inode *inode, struct file *file) { - return numa_maps_open(inode, file, &proc_pid_numa_maps_op); -} - -static int tid_numa_maps_open(struct inode *inode, struct file *file) -{ - return numa_maps_open(inode, file, &proc_tid_numa_maps_op); + return proc_maps_open(inode, file, &proc_pid_numa_maps_op, + sizeof(struct numa_maps_private)); } const struct file_operations proc_pid_numa_maps_operations = { @@ -1860,10 +1774,4 @@ const struct file_operations proc_pid_numa_maps_operations = { .release = proc_map_release, }; -const struct file_operations proc_tid_numa_maps_operations = { - .open = tid_numa_maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = proc_map_release, -}; #endif /* CONFIG_NUMA */ diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 5b62f57bd9bc..0b63d68dedb2 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -142,8 +142,7 @@ static int is_stack(struct vm_area_struct *vma) /* * display a single VMA to a sequenced file */ -static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, - int is_pid) +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; unsigned long ino = 0; @@ -189,22 +188,11 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, /* * display mapping lines for a particular process's /proc/pid/maps */ -static int show_map(struct seq_file *m, void *_p, int is_pid) +static int show_map(struct seq_file *m, void *_p) { struct rb_node *p = _p; - return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb), - is_pid); -} - -static int show_pid_map(struct seq_file *m, void *_p) -{ - return show_map(m, _p, 1); -} - -static int show_tid_map(struct seq_file *m, void *_p) -{ - return show_map(m, _p, 0); + return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); } static void *m_start(struct seq_file *m, loff_t *pos) @@ -260,14 +248,7 @@ static const struct seq_operations proc_pid_maps_ops = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_pid_map -}; - -static const struct seq_operations proc_tid_maps_ops = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_tid_map + .show = show_map }; static int maps_open(struct inode *inode, struct file *file, @@ -308,11 +289,6 @@ static int pid_maps_open(struct inode *inode, struct file *file) return maps_open(inode, file, &proc_pid_maps_ops); } -static int tid_maps_open(struct inode *inode, struct file *file) -{ - return maps_open(inode, file, 
&proc_tid_maps_ops); -} - const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, @@ -320,10 +296,3 @@ const struct file_operations proc_pid_maps_operations = { .release = map_release, }; -const struct file_operations proc_tid_maps_operations = { - .open = tid_maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = map_release, -}; - -- cgit v1.2.3-59-g8ed1b From 8e68d689afe3284a5bc1663562d3e0a45d1c64fd Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Aug 2018 21:52:52 -0700 Subject: mm: /proc/pid/smaps: factor out mem stats gathering To prepare for handling /proc/pid/smaps_rollup differently from /proc/pid/smaps factor out vma mem stats gathering from show_smap() - it will be used by both. Link: http://lkml.kernel.org/r/20180723111933.15443-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Alexey Dobriyan Cc: Daniel Colascione Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 55 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a3f98ca50981..d2ca88c92d9d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -702,14 +702,9 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, } #endif /* HUGETLB_PAGE */ -#define SEQ_PUT_DEC(str, val) \ - seq_put_decimal_ull_width(m, str, (val) >> 10, 8) -static int show_smap(struct seq_file *m, void *v) +static void smap_gather_stats(struct vm_area_struct *vma, + struct mem_size_stats *mss) { - struct proc_maps_private *priv = m->private; - struct vm_area_struct *vma = v; - struct mem_size_stats mss_stack; - struct mem_size_stats *mss; struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range, #ifdef CONFIG_HUGETLB_PAGE @@ -717,23 +712,6 @@ static int show_smap(struct seq_file *m, void *v) #endif .mm = vma->vm_mm, }; - int ret = 0; - bool rollup_mode; - bool last_vma; - - if (priv->rollup) { - rollup_mode = true; - mss = priv->rollup; - if (mss->first) { - mss->first_vma_start = vma->vm_start; - mss->first = false; - } - last_vma = !m_next_vma(priv, vma); - } else { - rollup_mode = false; - memset(&mss_stack, 0, sizeof(mss_stack)); - mss = &mss_stack; - } smaps_walk.private = mss; @@ -765,6 +743,35 @@ static int show_smap(struct seq_file *m, void *v) walk_page_vma(vma, &smaps_walk); if (vma->vm_flags & VM_LOCKED) mss->pss_locked += mss->pss; +} + +#define SEQ_PUT_DEC(str, val) \ + seq_put_decimal_ull_width(m, str, (val) >> 10, 8) +static int show_smap(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + struct mem_size_stats mss_stack; + struct mem_size_stats *mss; + int ret = 0; + bool rollup_mode; + bool last_vma; + + if (priv->rollup) { + rollup_mode = true; + mss = priv->rollup; + if (mss->first) { + mss->first_vma_start = vma->vm_start; + mss->first = false; + } + last_vma = !m_next_vma(priv, vma); + } else { + rollup_mode = false; + memset(&mss_stack, 0, sizeof(mss_stack)); + mss = &mss_stack; + } + + smap_gather_stats(vma, mss); if (!rollup_mode) { show_map_vma(m, vma); -- cgit v1.2.3-59-g8ed1b From f1547959d9efd0be6cac2a2fd32f05dd7144dd6c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Aug 2018 21:52:56 -0700 Subject: mm: /proc/pid/smaps: factor out common stats printing To prepare for handling /proc/pid/smaps_rollup differently from /proc/pid/smaps factor out from show_smap() printing the parts of output that are 
common for both variants, which is the bulk of the gathered memory stats. [vbabka@suse.cz: add const, per Alexey] Link: http://lkml.kernel.org/r/b45f319f-cd04-337b-37f8-77f99786aa8a@suse.cz Link: http://lkml.kernel.org/r/20180723111933.15443-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Alexey Dobriyan Cc: Daniel Colascione Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 51 +++++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index d2ca88c92d9d..c47f3cab70a1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -747,6 +747,32 @@ static void smap_gather_stats(struct vm_area_struct *vma, #define SEQ_PUT_DEC(str, val) \ seq_put_decimal_ull_width(m, str, (val) >> 10, 8) + +/* Show the contents common for smaps and smaps_rollup */ +static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss) +{ + SEQ_PUT_DEC("Rss: ", mss->resident); + SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); + SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); + SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); + SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); + SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); + SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); + SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); + SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); + SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); + SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); + seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", + mss->private_hugetlb >> 10, 7); + SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); + SEQ_PUT_DEC(" kB\nSwapPss: ", + mss->swap_pss >> PSS_SHIFT); + SEQ_PUT_DEC(" kB\nLocked: ", + mss->pss_locked >> PSS_SHIFT); + seq_puts(m, " kB\n"); +} + static int show_smap(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; @@ -791,28 +817,9 @@ static int show_smap(struct seq_file *m, void *v) seq_puts(m, " kB\n"); } - if (!rollup_mode || last_vma) { - SEQ_PUT_DEC("Rss: ", mss->resident); - SEQ_PUT_DEC(" kB\nPss: ", mss->pss >> PSS_SHIFT); - SEQ_PUT_DEC(" kB\nShared_Clean: ", mss->shared_clean); - SEQ_PUT_DEC(" kB\nShared_Dirty: ", mss->shared_dirty); - SEQ_PUT_DEC(" kB\nPrivate_Clean: ", mss->private_clean); - SEQ_PUT_DEC(" kB\nPrivate_Dirty: ", mss->private_dirty); - SEQ_PUT_DEC(" kB\nReferenced: ", mss->referenced); - SEQ_PUT_DEC(" kB\nAnonymous: ", mss->anonymous); - SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); - SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); - SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); - SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); - seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", - mss->private_hugetlb >> 10, 7); - SEQ_PUT_DEC(" kB\nSwap: ", mss->swap); - SEQ_PUT_DEC(" kB\nSwapPss: ", - mss->swap_pss >> PSS_SHIFT); - SEQ_PUT_DEC(" kB\nLocked: ", - mss->pss_locked >> PSS_SHIFT); - seq_puts(m, " kB\n"); - } + if (!rollup_mode || last_vma) + __show_smap(m, mss); + if (!rollup_mode) { if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); -- cgit v1.2.3-59-g8ed1b From 258f669e7e88c18edbc23fe5ce00a476b924551f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 21 Aug 2018 21:52:59 -0700 Subject: mm: /proc/pid/smaps_rollup: convert to single value seq_file The /proc/pid/smaps_rollup file is currently implemented via the 
m_start/m_next/m_stop seq_file iterators shared with the other maps files, that iterate over vma's. However, the rollup file doesn't print anything for each vma, only accumulates the stats. There are some issues with the current code as reported in [1] - the accumulated stats can get skewed if seq_file start()/stop() op is called multiple times, if show() is called multiple times, and after seeks to non-zero position. Patch [1] fixed those within existing design, but I believe it is fundamentally wrong to expose the vma iterators to the seq_file mechanism when smaps_rollup shows logically a single set of values for the whole address space. This patch thus refactors the code to provide a single "value" at offset 0, with vma iteration to gather the stats done internally. This fixes the situations where results are skewed, and simplifies the code, especially in show_smap(), at the expense of somewhat less code reuse. [1] https://marc.info/?l=linux-mm&m=151927723128134&w=2 [vbabka@suse.cz: use seq_file infrastructure] Link: http://lkml.kernel.org/r/bf4525b0-fd5b-4c4c-2cb3-adee3dd95a48@suse.cz Link: http://lkml.kernel.org/r/20180723111933.15443-5-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reported-by: Daniel Colascione Reviewed-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/internal.h | 1 - fs/proc/task_mmu.c | 155 +++++++++++++++++++++++++++++++++-------------------- 2 files changed, 96 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0c538769512a..f5b75d258d22 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -285,7 +285,6 @@ struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; - struct mem_size_stats *rollup; #ifdef CONFIG_MMU struct vm_area_struct *tail_vma; #endif diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c47f3cab70a1..5ea1d64cb0b4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -247,7 +247,6 @@ static int proc_map_release(struct inode *inode, struct file *file) if (priv->mm) mmdrop(priv->mm); - kfree(priv->rollup); return seq_release_private(inode, file); } @@ -404,7 +403,6 @@ const struct file_operations proc_pid_maps_operations = { #ifdef CONFIG_PROC_PAGE_MONITOR struct mem_size_stats { - bool first; unsigned long resident; unsigned long shared_clean; unsigned long shared_dirty; @@ -418,7 +416,6 @@ struct mem_size_stats { unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; - unsigned long first_vma_start; u64 pss; u64 pss_locked; u64 swap_pss; @@ -775,57 +772,75 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss) static int show_smap(struct seq_file *m, void *v) { - struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; - struct mem_size_stats mss_stack; - struct mem_size_stats *mss; + struct mem_size_stats mss; + + memset(&mss, 0, sizeof(mss)); + + smap_gather_stats(vma, &mss); + + show_map_vma(m, vma); + + SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); + SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); + SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); + seq_puts(m, " kB\n"); + + __show_smap(m, &mss); + + if (arch_pkeys_enabled()) + seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); + show_smap_vma_flags(m, vma); + + m_cache_vma(m, vma); + + return 0; +} + +static int show_smaps_rollup(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct mem_size_stats mss;
+ struct mm_struct *mm; + struct vm_area_struct *vma; + unsigned long last_vma_end = 0; int ret = 0; - bool rollup_mode; - bool last_vma; - - if (priv->rollup) { - rollup_mode = true; - mss = priv->rollup; - if (mss->first) { - mss->first_vma_start = vma->vm_start; - mss->first = false; - } - last_vma = !m_next_vma(priv, vma); - } else { - rollup_mode = false; - memset(&mss_stack, 0, sizeof(mss_stack)); - mss = &mss_stack; - } - smap_gather_stats(vma, mss); + priv->task = get_proc_task(priv->inode); + if (!priv->task) + return -ESRCH; - if (!rollup_mode) { - show_map_vma(m, vma); - } else if (last_vma) { - show_vma_header_prefix( - m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0); - seq_pad(m, ' '); - seq_puts(m, "[rollup]\n"); - } else { - ret = SEQ_SKIP; + mm = priv->mm; + if (!mm || !mmget_not_zero(mm)) { + ret = -ESRCH; + goto out_put_task; } - if (!rollup_mode) { - SEQ_PUT_DEC("Size: ", vma->vm_end - vma->vm_start); - SEQ_PUT_DEC(" kB\nKernelPageSize: ", vma_kernel_pagesize(vma)); - SEQ_PUT_DEC(" kB\nMMUPageSize: ", vma_mmu_pagesize(vma)); - seq_puts(m, " kB\n"); - } + memset(&mss, 0, sizeof(mss)); - if (!rollup_mode || last_vma) - __show_smap(m, mss); + down_read(&mm->mmap_sem); + hold_task_mempolicy(priv); - if (!rollup_mode) { - if (arch_pkeys_enabled()) - seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); - show_smap_vma_flags(m, vma); + for (vma = priv->mm->mmap; vma; vma = vma->vm_next) { + smap_gather_stats(vma, &mss); + last_vma_end = vma->vm_end; } - m_cache_vma(m, vma); + + show_vma_header_prefix(m, priv->mm->mmap->vm_start, + last_vma_end, 0, 0, 0, 0); + seq_pad(m, ' '); + seq_puts(m, "[rollup]\n"); + + __show_smap(m, &mss); + + release_task_mempolicy(priv); + up_read(&mm->mmap_sem); + mmput(mm); + +out_put_task: + put_task_struct(priv->task); + priv->task = NULL; + return ret; } #undef SEQ_PUT_DEC @@ -842,23 +857,45 @@ static int pid_smaps_open(struct inode *inode, struct file *file) return do_maps_open(inode, file, &proc_pid_smaps_op); } -static int pid_smaps_rollup_open(struct inode *inode, struct file *file) +static int smaps_rollup_open(struct inode *inode, struct file *file) { - struct seq_file *seq; + int ret; struct proc_maps_private *priv; - int ret = do_maps_open(inode, file, &proc_pid_smaps_op); - - if (ret < 0) - return ret; - seq = file->private_data; - priv = seq->private; - priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL); - if (!priv->rollup) { - proc_map_release(inode, file); + + priv = kzalloc(sizeof(*priv), GFP_KERNEL_ACCOUNT); + if (!priv) return -ENOMEM; + + ret = single_open(file, show_smaps_rollup, priv); + if (ret) + goto out_free; + + priv->inode = inode; + priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(priv->mm)) { + ret = PTR_ERR(priv->mm); + + single_release(inode, file); + goto out_free; } - priv->rollup->first = true; + return 0; + +out_free: + kfree(priv); + return ret; +} + +static int smaps_rollup_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + if (priv->mm) + mmdrop(priv->mm); + + kfree(priv); + return single_release(inode, file); } const struct file_operations proc_pid_smaps_operations = { @@ -869,10 +906,10 @@ const struct file_operations proc_pid_smaps_operations = { }; const struct file_operations proc_pid_smaps_rollup_operations = { - .open = pid_smaps_rollup_open, + .open = smaps_rollup_open, .read = seq_read, .llseek = seq_lseek, - .release = proc_map_release, + .release = smaps_rollup_release, }; enum 
clear_refs_types { -- cgit v1.2.3-59-g8ed1b From a670468f5e0b5fad4db6e4d195f15915dc2a35c1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 21 Aug 2018 21:53:06 -0700 Subject: mm: zero out the vma in vma_init() Rather than in vm_area_alloc(). To ensure that the various oddball stack-based vmas are in a good state. Some of the callers were zeroing them out, others were not. Acked-by: Kirill A. Shutemov Cc: Russell King Cc: Dmitry Vyukov Cc: Oleg Nesterov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/process.c | 9 ++++----- fs/hugetlbfs/inode.c | 2 -- include/linux/mm.h | 1 + kernel/fork.c | 3 ++- mm/mempolicy.c | 1 - mm/shmem.c | 1 - 6 files changed, 7 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index d9c299133111..82ab015bf42b 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -330,16 +330,15 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * atomic helpers. Insert it into the gate_vma so that it is visible * through ptrace and /proc/<pid>/mem. */ -static struct vm_area_struct gate_vma = { - .vm_start = 0xffff0000, - .vm_end = 0xffff0000 + PAGE_SIZE, - .vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC, -}; +static struct vm_area_struct gate_vma; static int __init gate_vma_init(void) { vma_init(&gate_vma, NULL); gate_vma.vm_page_prot = PAGE_READONLY_EXEC; + gate_vma.vm_start = 0xffff0000; + gate_vma.vm_end = 0xffff0000 + PAGE_SIZE; + gate_vma.vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC; return 0; } arch_initcall(gate_vma_init); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 346a146c7617..32920a10100e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -410,7 +410,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); - memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); vma_init(&pseudo_vma, current->mm); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pagevec_init(&pvec); @@ -595,7 +594,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * allocation routines. If NUMA is configured, use page index * as input to create an allocation policy.
*/ - memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); vma_init(&pseudo_vma, mm); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; diff --git a/include/linux/mm.h b/include/linux/mm.h index a3cae495f9ce..3a4b87d1a59a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -456,6 +456,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { static const struct vm_operations_struct dummy_vm_ops = {}; + memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); diff --git a/kernel/fork.c b/kernel/fork.c index 5ee74c113381..8c760effa42e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -310,8 +310,9 @@ static struct kmem_cache *mm_cachep; struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { - struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + struct vm_area_struct *vma; + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (vma) vma_init(vma, mm); return vma; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 01f1a14facc4..4861ba738d6f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2504,7 +2504,6 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) goto put_new; /* Create pseudo-vma that contains just the policy */ - memset(&pvma, 0, sizeof(struct vm_area_struct)); vma_init(&pvma, NULL); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ diff --git a/mm/shmem.c b/mm/shmem.c index c48c79018a7c..fb04baacc9fa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1421,7 +1421,6 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma, struct shmem_inode_info *info, pgoff_t index) { /* Create a pseudo vma that just contains the policy */ - memset(vma, 0, sizeof(*vma)); vma_init(vma, NULL); /* Bias interleave by inode number to distribute better across nodes */ vma->vm_pgoff = index + info->vfs_inode.i_ino; -- cgit v1.2.3-59-g8ed1b From 7e8a6304d5419cbf056a59de92939e5eef039c57 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 21 Aug 2018 21:53:58 -0700 Subject: /proc/meminfo: add percpu populated pages count Currently, percpu memory only exposes allocation and utilization information via debugfs. This more or less is only really useful for understanding the fragmentation and allocation information at a per-chunk level with a few global counters. This is also gated behind a config. BPF and cgroup, for example, have seen an increase in use causing increased use of percpu memory. Let's make it easier for someone to identify how much memory is being used. This patch adds the "Percpu" stat to meminfo to more easily look up how much percpu memory is in use. This number includes the cost for all allocated backing pages and not just insight at the per-unit, per-chunk level. Metadata is excluded. I think excluding metadata is fair because the backing memory scales with the number of cpus and can quickly outweigh the metadata. It also makes this calculation light.
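As a usage sketch (userspace, not part of the patch; assumes a kernel carrying this change), the new field can be read like any other meminfo line:

#include <stdio.h>

/* Sketch: find the new "Percpu:" line in /proc/meminfo. */
int main(void)
{
	char line[256];
	unsigned long kb;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "Percpu: %lu kB", &kb) == 1) {
			printf("percpu backing pages: %lu kB\n", kb);
			break;
		}
	}
	fclose(f);
	return 0;
}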
Link: http://lkml.kernel.org/r/20180807184723.74919-1-dennisszhou@gmail.com Signed-off-by: Dennis Zhou Acked-by: Tejun Heo Acked-by: Roman Gushchin Reviewed-by: Andrew Morton Acked-by: David Rientjes Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Christoph Lameter Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 3 +++ fs/proc/meminfo.c | 2 ++ include/linux/percpu.h | 2 ++ mm/percpu.c | 29 +++++++++++++++++++++++++++++ 4 files changed, 36 insertions(+) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 1605acbb23b6..22b4b00dee31 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -870,6 +870,7 @@ Committed_AS: 100056 kB VmallocTotal: 112216 kB VmallocUsed: 428 kB VmallocChunk: 111088 kB +Percpu: 62080 kB HardwareCorrupted: 0 kB AnonHugePages: 49152 kB ShmemHugePages: 0 kB @@ -962,6 +963,8 @@ Committed_AS: The amount of memory presently allocated on the system. VmallocTotal: total size of vmalloc memory area VmallocUsed: amount of vmalloc area which is used VmallocChunk: largest contiguous block of vmalloc area which is free + Percpu: Memory allocated to the percpu allocator used to back percpu + allocations. This stat excludes the cost of metadata. .............................................................................. diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 2fb04846ed11..edda898714eb 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -7,6 +7,7 @@ #include #include #include +#include <linux/percpu.h> #include #include #include @@ -121,6 +122,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) (unsigned long)VMALLOC_TOTAL >> 10); show_val_kb(m, "VmallocUsed: ", 0ul); show_val_kb(m, "VmallocChunk: ", 0ul); + show_val_kb(m, "Percpu: ", pcpu_nr_pages()); #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 296bbe49d5d1..70b7123f38c7 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -149,4 +149,6 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ __alignof__(type)) +extern unsigned long pcpu_nr_pages(void); + #endif /* __LINUX_PERCPU_H */ diff --git a/mm/percpu.c b/mm/percpu.c index 0b6480979ac7..a749d4d96e3e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -169,6 +169,14 @@ static LIST_HEAD(pcpu_map_extend_chunks); */ int pcpu_nr_empty_pop_pages; +/* + * The number of populated pages in use by the allocator, protected by + * pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets + * allocated/deallocated, it is allocated/deallocated in all units of a chunk + * and increments/decrements this count by 1). + */ +static unsigned long pcpu_nr_populated; + /* * Balance work is used to populate or destroy chunks asynchronously.
We * try to keep the number of populated free pages between @@ -1232,6 +1240,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; + pcpu_nr_populated += nr; if (!for_alloc) { chunk->nr_empty_pop_pages += nr; @@ -1260,6 +1269,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, chunk->nr_populated -= nr; chunk->nr_empty_pop_pages -= nr; pcpu_nr_empty_pop_pages -= nr; + pcpu_nr_populated -= nr; } /* @@ -2176,6 +2186,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); + /* include all regions of the first chunk */ + pcpu_nr_populated += PFN_DOWN(size_sum); + pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(base_addr); @@ -2745,6 +2758,22 @@ void __init setup_per_cpu_areas(void) #endif /* CONFIG_SMP */ +/* + * pcpu_nr_pages - calculate total number of populated backing pages + * + * This reflects the number of pages populated to back chunks. Metadata is + * excluded in the number exposed in meminfo as the number of backing pages + * scales with the number of cpus and can quickly outweigh the memory used for + * metadata. It also keeps this calculation nice and simple. + * + * RETURNS: + * Total number of populated backing pages in use by the allocator. + */ +unsigned long pcpu_nr_pages(void) +{ + return pcpu_nr_populated * pcpu_nr_units; +} + /* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up -- cgit v1.2.3-59-g8ed1b From 2d6e4e822ad8174b59414903ae6b35660032c20a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 21 Aug 2018 21:54:09 -0700 Subject: proc: fixup PDE allocation bloat 24074a35c5c975 ("proc: Make inline name size calculation automatic") started to put PDE allocations into kmalloc-256 which is unnecessary as ~40 character names are very rare. Put allocation back into kmalloc-192 cache for 64-bit non-debug builds. Put BUILD_BUG_ON to know when PDE size has gotten out of control. 
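The sizing idea, in miniature (a standalone sketch with a stand-in struct and hypothetical field layout; the real macros live in fs/proc/internal.h): round the struct size up to a slab bucket and let the inline name consume the slack.

#include <stdio.h>

struct pde {			/* stand-in for struct proc_dir_entry */
	void *data;
	unsigned int mode;
	char inline_name[];	/* flexible array fills the slab slack */
};

/* Round the struct size up to the next slab size and let the inline
 * name use whatever slack the chosen slab leaves over. */
#define SIZEOF_PDE ( \
	sizeof(struct pde) < 128 ? 128 : \
	sizeof(struct pde) < 192 ? 192 : \
	sizeof(struct pde) < 256 ? 256 : 0)
#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct pde))

int main(void)
{
	printf("struct: %zu, slot: %zu, inline name: %zu\n",
	       sizeof(struct pde), (size_t)SIZEOF_PDE,
	       (size_t)SIZEOF_PDE_INLINE_NAME);
	return 0;
}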
[adobriyan@gmail.com: fix BUILD_BUG_ON breakage on powerpc64] Link: http://lkml.kernel.org/r/20180703191602.GA25521@avx2 Link: http://lkml.kernel.org/r/20180617215732.GA24688@avx2 Signed-off-by: Alexey Dobriyan Cc: David Howells Cc: Al Viro Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 6 ++++-- fs/proc/internal.h | 17 +++++++---------- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 85ffbd27f288..fc5306a31a1d 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -105,8 +105,10 @@ void __init proc_init_kmemcache(void) kmem_cache_create("pde_opener", sizeof(struct pde_opener), 0, SLAB_ACCOUNT|SLAB_PANIC, NULL); proc_dir_entry_cache = kmem_cache_create_usercopy( - "proc_dir_entry", SIZEOF_PDE_SLOT, 0, SLAB_PANIC, - OFFSETOF_PDE_NAME, SIZEOF_PDE_INLINE_NAME, NULL); + "proc_dir_entry", SIZEOF_PDE, 0, SLAB_PANIC, + offsetof(struct proc_dir_entry, inline_name), + SIZEOF_PDE_INLINE_NAME, NULL); + BUILD_BUG_ON(sizeof(struct proc_dir_entry) >= SIZEOF_PDE); } static int proc_show_options(struct seq_file *seq, struct dentry *root) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index f5b75d258d22..c9d97818a421 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -65,16 +65,13 @@ struct proc_dir_entry { char inline_name[]; } __randomize_layout; -#define OFFSETOF_PDE_NAME offsetof(struct proc_dir_entry, inline_name) -#define SIZEOF_PDE_SLOT \ - (OFFSETOF_PDE_NAME + 34 <= 64 ? 64 : \ - OFFSETOF_PDE_NAME + 34 <= 128 ? 128 : \ - OFFSETOF_PDE_NAME + 34 <= 192 ? 192 : \ - OFFSETOF_PDE_NAME + 34 <= 256 ? 256 : \ - OFFSETOF_PDE_NAME + 34 <= 512 ? 512 : \ - 0) - -#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE_SLOT - OFFSETOF_PDE_NAME) +#define SIZEOF_PDE ( \ + sizeof(struct proc_dir_entry) < 128 ? 128 : \ + sizeof(struct proc_dir_entry) < 192 ? 192 : \ + sizeof(struct proc_dir_entry) < 256 ? 256 : \ + sizeof(struct proc_dir_entry) < 512 ? 512 : \ + 0) +#define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); -- cgit v1.2.3-59-g8ed1b From bdf228a27278d398ad26a156f01811be405867f9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 21 Aug 2018 21:54:13 -0700 Subject: fs/proc/uptime.c: use ktime_get_boottime_ts64 get_monotonic_boottime() is deprecated and uses the old timespec type. Let's convert /proc/uptime to use ktime_get_boottime_ts64(). 
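For reference, the same boot-based clock is reachable from userspace (a sketch assuming Linux's CLOCK_BOOTTIME, which may need _DEFAULT_SOURCE on older toolchains; the output mirrors /proc/uptime's first field):

#include <stdio.h>
#include <time.h>

/* Sketch: compute an uptime value like /proc/uptime's first field,
 * using the same boot-based clock the kernel now reads. */
int main(void)
{
	struct timespec up;

	if (clock_gettime(CLOCK_BOOTTIME, &up))
		return 1;
	/* tv_nsec / 10^7 gives the two-digit centisecond field. */
	printf("%lld.%02lu\n", (long long)up.tv_sec,
	       up.tv_nsec / 10000000UL);
	return 0;
}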
Link: http://lkml.kernel.org/r/20180620081746.282742-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Thomas Gleixner Cc: Al Viro Cc: Deepa Dinamani Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/uptime.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 3f723cb478af..a4c2791ab70b 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -9,7 +9,7 @@ static int uptime_proc_show(struct seq_file *m, void *v) { - struct timespec uptime; + struct timespec64 uptime; struct timespec64 idle; u64 nsec; u32 rem; @@ -19,7 +19,7 @@ static int uptime_proc_show(struct seq_file *m, void *v) for_each_possible_cpu(i) nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; - get_monotonic_boottime(&uptime); + ktime_get_boottime_ts64(&uptime); idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", -- cgit v1.2.3-59-g8ed1b From 8d48b2e044218ba8b3f6b1d34e96a7474b6b0688 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 21 Aug 2018 21:54:23 -0700 Subject: proc: smaller readlock section in readdir("/proc") Readdir context is thread local, so ->pos is thread local, move it out of readlock. Link: http://lkml.kernel.org/r/20180627195339.GD18113@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index bb1c1625b158..8ae109429a88 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -286,9 +286,9 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx, if (!dir_emit_dots(file, ctx)) return 0; + i = ctx->pos - 2; read_lock(&proc_subdir_lock); de = pde_subdir_first(de); - i = ctx->pos - 2; for (;;) { if (!de) { read_unlock(&proc_subdir_lock); @@ -309,8 +309,8 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx, pde_put(de); return 0; } - read_lock(&proc_subdir_lock); ctx->pos++; + read_lock(&proc_subdir_lock); next = pde_subdir_next(de); pde_put(de); de = next; -- cgit v1.2.3-59-g8ed1b From a44937fe4ef6a1190576492017939df636f4e38e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 21 Aug 2018 21:54:27 -0700 Subject: proc: put task earlier in /proc/*/fail-nth Link: http://lkml.kernel.org/r/20180627195427.GE18113@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index ad047977ed04..912a0306bb4d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1366,10 +1366,8 @@ static ssize_t proc_fail_nth_read(struct file *file, char __user *buf, if (!task) return -ESRCH; len = snprintf(numbuf, sizeof(numbuf), "%u\n", task->fail_nth); - len = simple_read_from_buffer(buf, count, ppos, numbuf, len); put_task_struct(task); - - return len; + return simple_read_from_buffer(buf, count, ppos, numbuf, len); } static const struct file_operations proc_fail_nth_operations = { -- cgit v1.2.3-59-g8ed1b From 41089b6d3e44a895076cc8ce56b08e463cb4f796 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 21 Aug 2018 21:54:30 -0700 Subject: proc: save 2 atomic ops on write to "/proc/*/attr/*" Code checks if write is done by current to its own attributes. 
For that get/put pair is unnecessary as it can be done under RCU. Note: rcu_read_unlock() can be done even earlier since pointer to a task is not dereferenced. It depends if /proc code should look scary or not: rcu_read_lock(); task = pid_task(...); rcu_read_unlock(); if (!task) return -ESRCH; if (task != current) return -EACCES; P.S.: rename "length" variable. Code like this length = -EINVAL; should not exist. Link: http://lkml.kernel.org/r/20180627200218.GF18113@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 912a0306bb4d..4d8dac635da4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2517,47 +2517,47 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); + struct task_struct *task; void *page; - ssize_t length; - struct task_struct *task = get_proc_task(inode); - - length = -ESRCH; - if (!task) - goto out_no_task; + int rv; + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (!task) { + rcu_read_unlock(); + return -ESRCH; + } /* A task may only write its own attributes. */ - length = -EACCES; - if (current != task) - goto out; + if (current != task) { + rcu_read_unlock(); + return -EACCES; + } + rcu_read_unlock(); if (count > PAGE_SIZE) count = PAGE_SIZE; /* No partial writes. */ - length = -EINVAL; if (*ppos != 0) - goto out; + return -EINVAL; page = memdup_user(buf, count); if (IS_ERR(page)) { - length = PTR_ERR(page); + rv = PTR_ERR(page); goto out; } /* Guard against adverse ptrace interaction */ - length = mutex_lock_interruptible(&current->signal->cred_guard_mutex); - if (length < 0) + rv = mutex_lock_interruptible(&current->signal->cred_guard_mutex); + if (rv < 0) goto out_free; - length = security_setprocattr(file->f_path.dentry->d_name.name, - page, count); + rv = security_setprocattr(file->f_path.dentry->d_name.name, page, count); mutex_unlock(&current->signal->cred_guard_mutex); out_free: kfree(page); out: - put_task_struct(task); -out_no_task: - return length; + return rv; } static const struct file_operations proc_pid_attr_operations = { -- cgit v1.2.3-59-g8ed1b From f6d2f584d88616e27eeb603dc8c88ca16e00d682 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 21 Aug 2018 21:54:34 -0700 Subject: proc: use macro in /proc/latency hook ->latency_record is defined as struct latency_record[LT_SAVECOUNT]; so use the same macro while iterating.
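The general point, in miniature (a userspace sketch; the LT_SAVECOUNT value here is a stand-in): size the array and bound the loop from one macro, so the two can never drift apart.

#include <stdio.h>

#define LT_SAVECOUNT 32		/* single source of truth for the bound */

struct latency_record {
	unsigned long backtrace[12];
};

static struct latency_record records[LT_SAVECOUNT];

int main(void)
{
	int i;

	records[3].backtrace[0] = 0xdeadbeef;	/* mark one slot in use */

	/* Iterate with the macro that sized the array, not a magic 32,
	 * so the loop cannot fall out of sync with the definition. */
	for (i = 0; i < LT_SAVECOUNT; i++)
		if (records[i].backtrace[0])
			printf("entry %d in use\n", i);
	return 0;
}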
Link: http://lkml.kernel.org/r/20180627200534.GA18434@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 4d8dac635da4..ccf86f16d9f0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -463,7 +463,7 @@ static int lstats_show_proc(struct seq_file *m, void *v) if (!task) return -ESRCH; seq_puts(m, "Latency Top version : v0.1\n"); - for (i = 0; i < 32; i++) { + for (i = 0; i < LT_SAVECOUNT; i++) { struct latency_record *lr = &task->latency_record[i]; if (lr->backtrace[0]) { int q; -- cgit v1.2.3-59-g8ed1b From 891ae71dc4fbd2454a3fa569e115a7ca86630949 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 21 Aug 2018 21:54:37 -0700 Subject: proc: spread "const" a bit Link: http://lkml.kernel.org/r/20180627200614.GB18434@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/internal.h | 4 ++-- include/linux/proc_fs.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/internal.h b/fs/proc/internal.h index c9d97818a421..5185d7f6a51e 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -113,12 +113,12 @@ static inline void *__PDE_DATA(const struct inode *inode) return PDE(inode)->data; } -static inline struct pid *proc_pid(struct inode *inode) +static inline struct pid *proc_pid(const struct inode *inode) { return PROC_I(inode)->pid; } -static inline struct task_struct *get_proc_task(struct inode *inode) +static inline struct task_struct *get_proc_task(const struct inode *inode) { return get_pid_task(proc_pid(inode), PIDTYPE_PID); } diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 626fc65c4336..d0e1f1522a78 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -129,7 +129,7 @@ int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)); /* get the associated pid namespace for a file in procfs */ -static inline struct pid_namespace *proc_pid_ns(struct inode *inode) +static inline struct pid_namespace *proc_pid_ns(const struct inode *inode) { return inode->i_sb->s_fs_info; } -- cgit v1.2.3-59-g8ed1b From 9a27e97aaab9a25fac2e8976e98e42ab7f4a8fac Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 21 Aug 2018 21:54:41 -0700 Subject: proc: use "unsigned int" in /proc/stat hook Number of CPUs is never high enough to force 64-bit arithmetic. Save couple of bytes on x86_64. 
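A sketch of the arithmetic in question (a hypothetical standalone function mirroring stat_open()'s estimate, not the kernel code): with unsigned int the computation stays in 32-bit registers, which is plenty for any realistic CPU/IRQ count.

#include <stdio.h>

/* Sketch: the seq_file buffer-size estimate for /proc/stat. */
static unsigned int stat_buf_size(unsigned int cpus, unsigned int irqs)
{
	unsigned int size = 1024 + 128 * cpus;

	size += 2 * irqs;	/* minimum 2 bytes per interrupt count */
	return size;
}

int main(void)
{
	printf("%u\n", stat_buf_size(64, 512));
	return 0;
}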
Link: http://lkml.kernel.org/r/20180627200710.GC18434@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 59749dfaef67..535eda7857cf 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -183,7 +183,7 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - size_t size = 1024 + 128 * num_online_cpus(); + unsigned int size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * nr_irqs; -- cgit v1.2.3-59-g8ed1b From 36f062042b0fd9f8e41b97a472f52139886ca26f Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Tue, 21 Aug 2018 21:54:44 -0700 Subject: fs/proc/vmcore.c: use new typedef vm_fault_t Use new return type vm_fault_t for fault handler in struct vm_operations_struct. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. See 1c8f422059ae ("mm: change return type to vm_fault_t") for reference. Link: http://lkml.kernel.org/r/20180702153325.GA3875@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Cc: Ganesh Goudar Cc: Rahul Lakkireddy Cc: David S. Miller Cc: Alexey Dobriyan Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index cfb6674331fd..6c1c2607e9e4 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -379,7 +379,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, * On s390 the fault handler is used for memory regions that can't be mapped * directly with remap_pfn_range(). */ -static int mmap_vmcore_fault(struct vm_fault *vmf) +static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) { #ifdef CONFIG_S390 struct address_space *mapping = vmf->vma->vm_file->f_mapping; -- cgit v1.2.3-59-g8ed1b From df865e8337c397471b95f51017fea559bc8abb4a Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 21 Aug 2018 21:54:48 -0700 Subject: fs/proc/kcore.c: use __pa_symbol() for KCORE_TEXT list entries elf_kcore_store_hdr() uses __pa() to find the physical address of KCORE_RAM or KCORE_TEXT entries exported as program headers. This trips CONFIG_DEBUG_VIRTUAL's checks, as the KCORE_TEXT entries are not in the linear map. Handle these two cases separately, using __pa_symbol() for the KCORE_TEXT entries. 
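Conceptually, the distinction looks like this (a simplified sketch with made-up constants; the real __pa()/__pa_symbol() are per-architecture, and the offsets here are only illustrative):

#include <stdio.h>

/* Hypothetical layout: the linear map and the kernel image live at
 * different virtual offsets, so they need different virt-to-phys math. */
#define PAGE_OFFSET	0xffff888000000000UL	/* linear map base (example) */
#define KIMAGE_VADDR	0xffffffff80000000UL	/* kernel text base (example) */
#define PHYS_BASE	0x1000000UL		/* example load address */

static unsigned long my_pa(unsigned long vaddr)		/* ~__pa() */
{
	return vaddr - PAGE_OFFSET;
}

static unsigned long my_pa_symbol(unsigned long vaddr)	/* ~__pa_symbol() */
{
	return vaddr - KIMAGE_VADDR + PHYS_BASE;
}

int main(void)
{
	/* Feeding a text address to the linear-map formula gives garbage,
	 * which is the kind of misuse CONFIG_DEBUG_VIRTUAL's checks catch. */
	printf("%#lx vs %#lx\n",
	       my_pa(KIMAGE_VADDR + 0x1234),
	       my_pa_symbol(KIMAGE_VADDR + 0x1234));
	return 0;
}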
Link: http://lkml.kernel.org/r/20180711131944.15252-1-james.morse@arm.com Signed-off-by: James Morse Cc: Alexey Dobriyan Cc: Omar Sandoval Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e64ecb9f2720..66c373230e60 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -384,8 +384,10 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) phdr->p_flags = PF_R|PF_W|PF_X; phdr->p_offset = kc_vaddr_to_offset(m->addr) + dataoff; phdr->p_vaddr = (size_t)m->addr; - if (m->type == KCORE_RAM || m->type == KCORE_TEXT) + if (m->type == KCORE_RAM) phdr->p_paddr = __pa(m->addr); + else if (m->type == KCORE_TEXT) + phdr->p_paddr = __pa_symbol(m->addr); else phdr->p_paddr = (elf_addr_t)-1; phdr->p_filesz = phdr->p_memsz = m->size; -- cgit v1.2.3-59-g8ed1b From a8dd9c4df18edc873d244790d163564a5d17626b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Aug 2018 21:54:51 -0700 Subject: proc/kcore: don't grab lock for kclist_add() Patch series "/proc/kcore improvements", v4. This series makes a few improvements to /proc/kcore. It fixes a couple of small issues in v3 but is otherwise the same. Patches 1, 2, and 3 are prep patches. Patch 4 is a fix/cleanup. Patch 5 is another prep patch. Patches 6 and 7 are optimizations to ->read(). Patch 8 makes it possible to enable CRASH_CORE on any architecture, which is needed for patch 9. Patch 9 adds vmcoreinfo to /proc/kcore. This patch (of 9): kclist_add() is only called at init time, so there's no point in grabbing any locks. We're also going to replace the rwlock with a rwsem, which we don't want to try grabbing during early boot. While we're here, mark kclist_add() with __init so that we'll get a warning if it's called from non-init code. Link: http://lkml.kernel.org/r/98208db1faf167aa8b08eebfa968d95c70527739.1531953780.git.osandov@fb.com Signed-off-by: Omar Sandoval Reviewed-by: Andrew Morton Reviewed-by: Bhupesh Sharma Tested-by: Bhupesh Sharma Cc: Alexey Dobriyan Cc: Bhupesh Sharma Cc: Eric Biederman Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 7 +++---- include/linux/kcore.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 66c373230e60..b0b9a76f28d6 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -62,16 +62,15 @@ static LIST_HEAD(kclist_head); static DEFINE_RWLOCK(kclist_lock); static int kcore_need_update = 1; -void -kclist_add(struct kcore_list *new, void *addr, size_t size, int type) +/* This doesn't grab kclist_lock, so it should only be used at init time. 
*/ +void __init kclist_add(struct kcore_list *new, void *addr, size_t size, + int type) { new->addr = (unsigned long)addr; new->size = size; new->type = type; - write_lock(&kclist_lock); list_add_tail(&new->list, &kclist_head); - write_unlock(&kclist_lock); } static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) diff --git a/include/linux/kcore.h b/include/linux/kcore.h index 8de55e4b5ee9..c20f296438fb 100644 --- a/include/linux/kcore.h +++ b/include/linux/kcore.h @@ -35,7 +35,7 @@ struct vmcoredd_node { }; #ifdef CONFIG_PROC_KCORE -extern void kclist_add(struct kcore_list *, void *, size_t, int type); +void __init kclist_add(struct kcore_list *, void *, size_t, int type); #else static inline void kclist_add(struct kcore_list *new, void *addr, size_t size, int type) -- cgit v1.2.3-59-g8ed1b From bf53183164dbba00b342df7d2215b33007ed83ed Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Aug 2018 21:54:55 -0700 Subject: proc/kcore: don't grab lock for memory hotplug notifier The memory hotplug notifier kcore_callback() only needs kclist_lock to prevent races with __kcore_update_ram(), but we can easily eliminate that race by using an atomic xchg() in __kcore_update_ram(). This is preparation for converting kclist_lock to an rwsem. Link: http://lkml.kernel.org/r/0a4bc89f4dbde8b5b2ea309f7b4fb6a85fe29df2.1531953780.git.osandov@fb.com Signed-off-by: Omar Sandoval Reviewed-by: Andrew Morton Cc: Alexey Dobriyan Cc: Bhupesh Sharma Cc: Eric Biederman Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index b0b9a76f28d6..e83f15a4f66d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -118,7 +118,7 @@ static void __kcore_update_ram(struct list_head *list) LIST_HEAD(garbage); write_lock(&kclist_lock); - if (kcore_need_update) { + if (xchg(&kcore_need_update, 0)) { list_for_each_entry_safe(pos, tmp, &kclist_head, list) { if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP) @@ -127,7 +127,6 @@ static void __kcore_update_ram(struct list_head *list) list_splice_tail(list, &kclist_head); } else list_splice(list, &garbage); - kcore_need_update = 0; proc_root_kcore->size = get_kcore_size(&nphdr, &size); write_unlock(&kclist_lock); @@ -593,9 +592,8 @@ static int __meminit kcore_callback(struct notifier_block *self, switch (action) { case MEM_ONLINE: case MEM_OFFLINE: - write_lock(&kclist_lock); kcore_need_update = 1; - write_unlock(&kclist_lock); + break; } return NOTIFY_OK; } -- cgit v1.2.3-59-g8ed1b From 0b172f845ff963ab15e2d861dc155e2ab13241e9 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Aug 2018 21:54:59 -0700 Subject: proc/kcore: replace kclist_lock rwlock with rwsem Now we only need kclist_lock from user context and at fs init time, and the following changes need to sleep while holding the kclist_lock. 
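The semantics gained, in a userspace analogue (pthread rwlocks, like kernel rwsems, tolerate the holder sleeping, which a spinning kernel rwlock does not; names are illustrative; build with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_rwlock_t kclist_lock = PTHREAD_RWLOCK_INITIALIZER;

static void *reader(void *arg)
{
	pthread_rwlock_rdlock(&kclist_lock);	/* many readers may enter */
	usleep(1000);		/* stand-in for a sleeping allocation */
	pthread_rwlock_unlock(&kclist_lock);
	return arg;
}

static void *updater(void *arg)
{
	pthread_rwlock_wrlock(&kclist_lock);	/* exclusive list update */
	pthread_rwlock_unlock(&kclist_lock);
	return arg;
}

int main(void)
{
	pthread_t r, w;

	pthread_create(&r, NULL, reader, NULL);
	pthread_create(&w, NULL, updater, NULL);
	pthread_join(r, NULL);
	pthread_join(w, NULL);
	printf("both done\n");
	return 0;
}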
Link: http://lkml.kernel.org/r/521ba449ebe921d905177410fee9222d07882f0d.1531953780.git.osandov@fb.com Signed-off-by: Omar Sandoval Reviewed-by: Andrew Morton Cc: Alexey Dobriyan Cc: Bhupesh Sharma Cc: Eric Biederman Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index e83f15a4f66d..ae43a97d511d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -59,7 +59,7 @@ struct memelfnote }; static LIST_HEAD(kclist_head); -static DEFINE_RWLOCK(kclist_lock); +static DECLARE_RWSEM(kclist_lock); static int kcore_need_update = 1; /* This doesn't grab kclist_lock, so it should only be used at init time. */ @@ -117,7 +117,7 @@ static void __kcore_update_ram(struct list_head *list) struct kcore_list *tmp, *pos; LIST_HEAD(garbage); - write_lock(&kclist_lock); + down_write(&kclist_lock); if (xchg(&kcore_need_update, 0)) { list_for_each_entry_safe(pos, tmp, &kclist_head, list) { if (pos->type == KCORE_RAM @@ -128,7 +128,7 @@ static void __kcore_update_ram(struct list_head *list) } else list_splice(list, &garbage); proc_root_kcore->size = get_kcore_size(&nphdr, &size); - write_unlock(&kclist_lock); + up_write(&kclist_lock); free_kclist_ents(&garbage); } @@ -451,11 +451,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) int nphdr; unsigned long start; - read_lock(&kclist_lock); + down_read(&kclist_lock); size = get_kcore_size(&nphdr, &elf_buflen); if (buflen == 0 || *fpos >= size) { - read_unlock(&kclist_lock); + up_read(&kclist_lock); return 0; } @@ -472,11 +472,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) tsz = buflen; elf_buf = kzalloc(elf_buflen, GFP_ATOMIC); if (!elf_buf) { - read_unlock(&kclist_lock); + up_read(&kclist_lock); return -ENOMEM; } elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen); - read_unlock(&kclist_lock); + up_read(&kclist_lock); if (copy_to_user(buffer, elf_buf + *fpos, tsz)) { kfree(elf_buf); return -EFAULT; @@ -491,7 +491,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (buflen == 0) return acc; } else - read_unlock(&kclist_lock); + up_read(&kclist_lock); /* * Check to see if our file offset matches with any of @@ -504,12 +504,12 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) while (buflen) { struct kcore_list *m; - read_lock(&kclist_lock); + down_read(&kclist_lock); list_for_each_entry(m, &kclist_head, list) { if (start >= m->addr && start < (m->addr+m->size)) break; } - read_unlock(&kclist_lock); + up_read(&kclist_lock); if (&m->list == &kclist_head) { if (clear_user(buffer, tsz)) -- cgit v1.2.3-59-g8ed1b From b66fb005c97544e9e589b2f2e60ccfe3808c6c3e Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Aug 2018 21:55:02 -0700 Subject: proc/kcore: fix memory hotplug vs multiple opens race There's a theoretical race condition that will cause /proc/kcore to miss a memory hotplug event: CPU0 CPU1 // hotplug event 1 kcore_need_update = 1 open_kcore() open_kcore() kcore_update_ram() kcore_update_ram() // Walk RAM // Walk RAM __kcore_update_ram() __kcore_update_ram() kcore_need_update = 0 // hotplug event 2 kcore_need_update = 1 kcore_need_update = 0 Note that CPU1 set up the RAM kcore entries with the state after hotplug event 1 but cleared the flag for hotplug event 2. The RAM entries will therefore be stale until there is another hotplug event. 
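To make the fixed scheme concrete, here is a loose userspace analogue (invented names, a pthread mutex instead of the rwsem) of serializing the flag claim and the rebuild in one critical section. A "hotplug" event that fires after the flag has been claimed is not lost as in the diagram above; it simply arms the next rebuild:

#include <pthread.h>
#include <stdatomic.h>

static pthread_mutex_t update_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int need_update = 1;

static void hotplug_event(void)         /* analogue of kcore_callback() */
{
        atomic_store(&need_update, 1);
}

static void update_ram(void)            /* analogue of the fixed kcore_update_ram() */
{
        pthread_mutex_lock(&update_lock);
        if (atomic_exchange(&need_update, 0)) {
                /* Rebuild the list to match the current state here;
                 * no other updater can clear the flag until we drop
                 * the lock, so a clear always pairs with a rebuild. */
        }
        pthread_mutex_unlock(&update_lock);
}

int main(void)
{
        hotplug_event();
        update_ram();
        return 0;
}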
This is an extremely unlikely sequence of events, but the fix makes the synchronization saner, anyways: we serialize the entire update sequence, which means that whoever clears the flag will always succeed in replacing the kcore list. Link: http://lkml.kernel.org/r/6106c509998779730c12400c1b996425df7d7089.1531953780.git.osandov@fb.com Signed-off-by: Omar Sandoval Cc: Alexey Dobriyan Cc: Bhupesh Sharma Cc: Eric Biederman Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 93 +++++++++++++++++++++++++++------------------------------ 1 file changed, 44 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index ae43a97d511d..95aa988c5b5d 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -98,53 +98,15 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) return size + *elf_buflen; } -static void free_kclist_ents(struct list_head *head) -{ - struct kcore_list *tmp, *pos; - - list_for_each_entry_safe(pos, tmp, head, list) { - list_del(&pos->list); - kfree(pos); - } -} -/* - * Replace all KCORE_RAM/KCORE_VMEMMAP information with passed list. - */ -static void __kcore_update_ram(struct list_head *list) -{ - int nphdr; - size_t size; - struct kcore_list *tmp, *pos; - LIST_HEAD(garbage); - - down_write(&kclist_lock); - if (xchg(&kcore_need_update, 0)) { - list_for_each_entry_safe(pos, tmp, &kclist_head, list) { - if (pos->type == KCORE_RAM - || pos->type == KCORE_VMEMMAP) - list_move(&pos->list, &garbage); - } - list_splice_tail(list, &kclist_head); - } else - list_splice(list, &garbage); - proc_root_kcore->size = get_kcore_size(&nphdr, &size); - up_write(&kclist_lock); - - free_kclist_ents(&garbage); -} - - #ifdef CONFIG_HIGHMEM /* * If no highmem, we can assume [0...max_low_pfn) continuous range of memory * because memory hole is not as big as !HIGHMEM case. * (HIGHMEM is special because part of memory is _invisible_ from the kernel.) */ -static int kcore_update_ram(void) +static int kcore_ram_list(struct list_head *head) { - LIST_HEAD(head); struct kcore_list *ent; - int ret = 0; ent = kmalloc(sizeof(*ent), GFP_KERNEL); if (!ent) @@ -152,9 +114,8 @@ static int kcore_update_ram(void) ent->addr = (unsigned long)__va(0); ent->size = max_low_pfn << PAGE_SHIFT; ent->type = KCORE_RAM; - list_add(&ent->list, &head); - __kcore_update_ram(&head); - return ret; + list_add(&ent->list, head); + return 0; } #else /* !CONFIG_HIGHMEM */ @@ -253,11 +214,10 @@ free_out: return 1; } -static int kcore_update_ram(void) +static int kcore_ram_list(struct list_head *list) { int nid, ret; unsigned long end_pfn; - LIST_HEAD(head); /* Not inialized....update now */ /* find out "max pfn" */ @@ -269,15 +229,50 @@ static int kcore_update_ram(void) end_pfn = node_end; } /* scan 0 to max_pfn */ - ret = walk_system_ram_range(0, end_pfn, &head, kclist_add_private); - if (ret) { - free_kclist_ents(&head); + ret = walk_system_ram_range(0, end_pfn, list, kclist_add_private); + if (ret) return -ENOMEM; + return 0; +} +#endif /* CONFIG_HIGHMEM */ + +static int kcore_update_ram(void) +{ + LIST_HEAD(list); + LIST_HEAD(garbage); + int nphdr; + size_t size; + struct kcore_list *tmp, *pos; + int ret = 0; + + down_write(&kclist_lock); + if (!xchg(&kcore_need_update, 0)) + goto out; + + ret = kcore_ram_list(&list); + if (ret) { + /* Couldn't get the RAM list, try again next time. 
*/ + WRITE_ONCE(kcore_need_update, 1); + list_splice_tail(&list, &garbage); + goto out; + } + + list_for_each_entry_safe(pos, tmp, &kclist_head, list) { + if (pos->type == KCORE_RAM || pos->type == KCORE_VMEMMAP) + list_move(&pos->list, &garbage); + } + list_splice_tail(&list, &kclist_head); + + proc_root_kcore->size = get_kcore_size(&nphdr, &size); + +out: + up_write(&kclist_lock); + list_for_each_entry_safe(pos, tmp, &garbage, list) { + list_del(&pos->list); + kfree(pos); } - __kcore_update_ram(&head); return ret; } -#endif /* CONFIG_HIGHMEM */ /*****************************************************************************/ /* -- cgit v1.2.3-59-g8ed1b From 3673fb08db73347332ab956d90090c1da6e610b7 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Aug 2018 21:55:06 -0700 Subject: proc/kcore: hold lock during read Now that we're using an rwsem, we can hold it during the entirety of read_kcore() and have a common return path. This is preparation for the next change. [akpm@linux-foundation.org: fix locking bug reported by Tetsuo Handa] Link: http://lkml.kernel.org/r/d7cfbc1e8a76616f3b699eaff9df0a2730380534.1531953780.git.osandov@fb.com Signed-off-by: Omar Sandoval Cc: Alexey Dobriyan Cc: Bhupesh Sharma Cc: Eric Biederman Cc: James Morse Cc: Tetsuo Handa Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 70 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 95aa988c5b5d..dc34642bbdb7 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -440,19 +440,18 @@ static ssize_t read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { char *buf = file->private_data; - ssize_t acc = 0; size_t size, tsz; size_t elf_buflen; int nphdr; unsigned long start; + size_t orig_buflen = buflen; + int ret = 0; down_read(&kclist_lock); size = get_kcore_size(&nphdr, &elf_buflen); - if (buflen == 0 || *fpos >= size) { - up_read(&kclist_lock); - return 0; - } + if (buflen == 0 || *fpos >= size) + goto out; /* trim buflen to not go beyond EOF */ if (buflen > size - *fpos) @@ -465,28 +464,26 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) tsz = elf_buflen - *fpos; if (buflen < tsz) tsz = buflen; - elf_buf = kzalloc(elf_buflen, GFP_ATOMIC); + elf_buf = kzalloc(elf_buflen, GFP_KERNEL); if (!elf_buf) { - up_read(&kclist_lock); - return -ENOMEM; + ret = -ENOMEM; + goto out; } elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen); - up_read(&kclist_lock); if (copy_to_user(buffer, elf_buf + *fpos, tsz)) { kfree(elf_buf); - return -EFAULT; + ret = -EFAULT; + goto out; } kfree(elf_buf); buflen -= tsz; *fpos += tsz; buffer += tsz; - acc += tsz; /* leave now if filled buffer already */ if (buflen == 0) - return acc; - } else - up_read(&kclist_lock); + goto out; + } /* * Check to see if our file offset matches with any of @@ -499,25 +496,29 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) while (buflen) { struct kcore_list *m; - down_read(&kclist_lock); list_for_each_entry(m, &kclist_head, list) { if (start >= m->addr && start < (m->addr+m->size)) break; } - up_read(&kclist_lock); if (&m->list == &kclist_head) { - if (clear_user(buffer, tsz)) - return -EFAULT; + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } } else if (m->type == KCORE_VMALLOC) { vread(buf, (char *)start, tsz); /* we have to zero-fill user buffer even if no read */ - if 
(copy_to_user(buffer, buf, tsz)) - return -EFAULT; + if (copy_to_user(buffer, buf, tsz)) { + ret = -EFAULT; + goto out; + } } else if (m->type == KCORE_USER) { /* User page is handled prior to normal kernel page: */ - if (copy_to_user(buffer, (char *)start, tsz)) - return -EFAULT; + if (copy_to_user(buffer, (char *)start, tsz)) { + ret = -EFAULT; + goto out; + } } else { if (kern_addr_valid(start)) { /* @@ -525,26 +526,35 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) * hardened user copy kernel text checks. */ if (probe_kernel_read(buf, (void *) start, tsz)) { - if (clear_user(buffer, tsz)) - return -EFAULT; + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } } else { - if (copy_to_user(buffer, buf, tsz)) - return -EFAULT; + if (copy_to_user(buffer, buf, tsz)) { + ret = -EFAULT; + goto out; + } } } else { - if (clear_user(buffer, tsz)) - return -EFAULT; + if (clear_user(buffer, tsz)) { + ret = -EFAULT; + goto out; + } } } buflen -= tsz; *fpos += tsz; buffer += tsz; - acc += tsz; start += tsz; tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen); } - return acc; +out: + up_read(&kclist_lock); + if (ret) + return ret; + return orig_buflen - buflen; } -- cgit v1.2.3-59-g8ed1b From 37e949bd5293ddb70acf236eedf2ae8caa1db57b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Aug 2018 21:55:09 -0700 Subject: proc/kcore: clean up ELF header generation Currently, the ELF file header, program headers, and note segment are allocated all at once, in some icky code dating back to 2.3. Programs tend to read the file header, then the program headers, then the note segment, all separately, so this is a waste of effort. It's cleaner and more efficient to handle the three separately. Link: http://lkml.kernel.org/r/19c92cbad0e11f6103ff3274b2e7a7e51a1eb74b.1531953780.git.osandov@fb.com Signed-off-by: Omar Sandoval Cc: Alexey Dobriyan Cc: Bhupesh Sharma Cc: Eric Biederman Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 350 +++++++++++++++++++++++--------------------------------- 1 file changed, 141 insertions(+), 209 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index dc34642bbdb7..808ef9afd084 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -49,15 +49,6 @@ static struct proc_dir_entry *proc_root_kcore; #define kc_offset_to_vaddr(o) ((o) + PAGE_OFFSET) #endif -/* An ELF note in memory */ -struct memelfnote -{ - const char *name; - int type; - unsigned int datasz; - void *data; -}; - static LIST_HEAD(kclist_head); static DECLARE_RWSEM(kclist_lock); static int kcore_need_update = 1; @@ -73,7 +64,8 @@ void __init kclist_add(struct kcore_list *new, void *addr, size_t size, list_add_tail(&new->list, &kclist_head); } -static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) +static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len, + size_t *data_offset) { size_t try, size; struct kcore_list *m; @@ -87,15 +79,15 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) size = try; *nphdr = *nphdr + 1; } - *elf_buflen = sizeof(struct elfhdr) + - (*nphdr + 2)*sizeof(struct elf_phdr) + - 3 * ((sizeof(struct elf_note)) + - roundup(sizeof(CORE_STR), 4)) + - roundup(sizeof(struct elf_prstatus), 4) + - roundup(sizeof(struct elf_prpsinfo), 4) + - roundup(arch_task_struct_size, 4); - *elf_buflen = PAGE_ALIGN(*elf_buflen); - return size + *elf_buflen; + + *phdrs_len = *nphdr * sizeof(struct elf_phdr); + *notes_len = (3 * (sizeof(struct elf_note) + 
ALIGN(sizeof(CORE_STR), 4)) + + ALIGN(sizeof(struct elf_prstatus), 4) + + ALIGN(sizeof(struct elf_prpsinfo), 4) + + ALIGN(arch_task_struct_size, 4)); + *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len + + *notes_len); + return *data_offset + size; } #ifdef CONFIG_HIGHMEM @@ -241,7 +233,7 @@ static int kcore_update_ram(void) LIST_HEAD(list); LIST_HEAD(garbage); int nphdr; - size_t size; + size_t phdrs_len, notes_len, data_offset; struct kcore_list *tmp, *pos; int ret = 0; @@ -263,7 +255,8 @@ static int kcore_update_ram(void) } list_splice_tail(&list, &kclist_head); - proc_root_kcore->size = get_kcore_size(&nphdr, &size); + proc_root_kcore->size = get_kcore_size(&nphdr, &phdrs_len, ¬es_len, + &data_offset); out: up_write(&kclist_lock); @@ -274,228 +267,168 @@ out: return ret; } -/*****************************************************************************/ -/* - * determine size of ELF note - */ -static int notesize(struct memelfnote *en) +static void append_kcore_note(char *notes, size_t *i, const char *name, + unsigned int type, const void *desc, + size_t descsz) { - int sz; - - sz = sizeof(struct elf_note); - sz += roundup((strlen(en->name) + 1), 4); - sz += roundup(en->datasz, 4); - - return sz; -} /* end notesize() */ - -/*****************************************************************************/ -/* - * store a note in the header buffer - */ -static char *storenote(struct memelfnote *men, char *bufp) -{ - struct elf_note en; - -#define DUMP_WRITE(addr,nr) do { memcpy(bufp,addr,nr); bufp += nr; } while(0) - - en.n_namesz = strlen(men->name) + 1; - en.n_descsz = men->datasz; - en.n_type = men->type; - - DUMP_WRITE(&en, sizeof(en)); - DUMP_WRITE(men->name, en.n_namesz); - - /* XXX - cast from long long to long to avoid need for libgcc.a */ - bufp = (char*) roundup((unsigned long)bufp,4); - DUMP_WRITE(men->data, men->datasz); - bufp = (char*) roundup((unsigned long)bufp,4); - -#undef DUMP_WRITE - - return bufp; -} /* end storenote() */ - -/* - * store an ELF coredump header in the supplied buffer - * nphdr is the number of elf_phdr to insert - */ -static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) -{ - struct elf_prstatus prstatus; /* NT_PRSTATUS */ - struct elf_prpsinfo prpsinfo; /* NT_PRPSINFO */ - struct elf_phdr *nhdr, *phdr; - struct elfhdr *elf; - struct memelfnote notes[3]; - off_t offset = 0; - struct kcore_list *m; - - /* setup ELF header */ - elf = (struct elfhdr *) bufp; - bufp += sizeof(struct elfhdr); - offset += sizeof(struct elfhdr); - memcpy(elf->e_ident, ELFMAG, SELFMAG); - elf->e_ident[EI_CLASS] = ELF_CLASS; - elf->e_ident[EI_DATA] = ELF_DATA; - elf->e_ident[EI_VERSION]= EV_CURRENT; - elf->e_ident[EI_OSABI] = ELF_OSABI; - memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); - elf->e_type = ET_CORE; - elf->e_machine = ELF_ARCH; - elf->e_version = EV_CURRENT; - elf->e_entry = 0; - elf->e_phoff = sizeof(struct elfhdr); - elf->e_shoff = 0; - elf->e_flags = ELF_CORE_EFLAGS; - elf->e_ehsize = sizeof(struct elfhdr); - elf->e_phentsize= sizeof(struct elf_phdr); - elf->e_phnum = nphdr; - elf->e_shentsize= 0; - elf->e_shnum = 0; - elf->e_shstrndx = 0; - - /* setup ELF PT_NOTE program header */ - nhdr = (struct elf_phdr *) bufp; - bufp += sizeof(struct elf_phdr); - offset += sizeof(struct elf_phdr); - nhdr->p_type = PT_NOTE; - nhdr->p_offset = 0; - nhdr->p_vaddr = 0; - nhdr->p_paddr = 0; - nhdr->p_filesz = 0; - nhdr->p_memsz = 0; - nhdr->p_flags = 0; - nhdr->p_align = 0; - - /* setup ELF PT_LOAD program header for every area */ - list_for_each_entry(m, 
&kclist_head, list) { - phdr = (struct elf_phdr *) bufp; - bufp += sizeof(struct elf_phdr); - offset += sizeof(struct elf_phdr); - - phdr->p_type = PT_LOAD; - phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_offset = kc_vaddr_to_offset(m->addr) + dataoff; - phdr->p_vaddr = (size_t)m->addr; - if (m->type == KCORE_RAM) - phdr->p_paddr = __pa(m->addr); - else if (m->type == KCORE_TEXT) - phdr->p_paddr = __pa_symbol(m->addr); - else - phdr->p_paddr = (elf_addr_t)-1; - phdr->p_filesz = phdr->p_memsz = m->size; - phdr->p_align = PAGE_SIZE; - } - - /* - * Set up the notes in similar form to SVR4 core dumps made - * with info from their /proc. - */ - nhdr->p_offset = offset; - - /* set up the process status */ - notes[0].name = CORE_STR; - notes[0].type = NT_PRSTATUS; - notes[0].datasz = sizeof(struct elf_prstatus); - notes[0].data = &prstatus; - - memset(&prstatus, 0, sizeof(struct elf_prstatus)); - - nhdr->p_filesz = notesize(¬es[0]); - bufp = storenote(¬es[0], bufp); - - /* set up the process info */ - notes[1].name = CORE_STR; - notes[1].type = NT_PRPSINFO; - notes[1].datasz = sizeof(struct elf_prpsinfo); - notes[1].data = &prpsinfo; - - memset(&prpsinfo, 0, sizeof(struct elf_prpsinfo)); - prpsinfo.pr_state = 0; - prpsinfo.pr_sname = 'R'; - prpsinfo.pr_zomb = 0; - - strcpy(prpsinfo.pr_fname, "vmlinux"); - strlcpy(prpsinfo.pr_psargs, saved_command_line, sizeof(prpsinfo.pr_psargs)); - - nhdr->p_filesz += notesize(¬es[1]); - bufp = storenote(¬es[1], bufp); - - /* set up the task structure */ - notes[2].name = CORE_STR; - notes[2].type = NT_TASKSTRUCT; - notes[2].datasz = arch_task_struct_size; - notes[2].data = current; - - nhdr->p_filesz += notesize(¬es[2]); - bufp = storenote(¬es[2], bufp); - -} /* end elf_kcore_store_hdr() */ + struct elf_note *note = (struct elf_note *)¬es[*i]; + + note->n_namesz = strlen(name) + 1; + note->n_descsz = descsz; + note->n_type = type; + *i += sizeof(*note); + memcpy(¬es[*i], name, note->n_namesz); + *i = ALIGN(*i + note->n_namesz, 4); + memcpy(¬es[*i], desc, descsz); + *i = ALIGN(*i + descsz, 4); +} -/*****************************************************************************/ -/* - * read from the ELF header and then kernel memory - */ static ssize_t read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { char *buf = file->private_data; - size_t size, tsz; - size_t elf_buflen; + size_t phdrs_offset, notes_offset, data_offset; + size_t phdrs_len, notes_len; + struct kcore_list *m; + size_t tsz; int nphdr; unsigned long start; size_t orig_buflen = buflen; int ret = 0; down_read(&kclist_lock); - size = get_kcore_size(&nphdr, &elf_buflen); - if (buflen == 0 || *fpos >= size) - goto out; + get_kcore_size(&nphdr, &phdrs_len, ¬es_len, &data_offset); + phdrs_offset = sizeof(struct elfhdr); + notes_offset = phdrs_offset + phdrs_len; + + /* ELF file header. 
*/ + if (buflen && *fpos < sizeof(struct elfhdr)) { + struct elfhdr ehdr = { + .e_ident = { + [EI_MAG0] = ELFMAG0, + [EI_MAG1] = ELFMAG1, + [EI_MAG2] = ELFMAG2, + [EI_MAG3] = ELFMAG3, + [EI_CLASS] = ELF_CLASS, + [EI_DATA] = ELF_DATA, + [EI_VERSION] = EV_CURRENT, + [EI_OSABI] = ELF_OSABI, + }, + .e_type = ET_CORE, + .e_machine = ELF_ARCH, + .e_version = EV_CURRENT, + .e_phoff = sizeof(struct elfhdr), + .e_flags = ELF_CORE_EFLAGS, + .e_ehsize = sizeof(struct elfhdr), + .e_phentsize = sizeof(struct elf_phdr), + .e_phnum = nphdr, + }; + + tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos); + if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) { + ret = -EFAULT; + goto out; + } - /* trim buflen to not go beyond EOF */ - if (buflen > size - *fpos) - buflen = size - *fpos; + buffer += tsz; + buflen -= tsz; + *fpos += tsz; + } - /* construct an ELF core header if we'll need some of it */ - if (*fpos < elf_buflen) { - char * elf_buf; + /* ELF program headers. */ + if (buflen && *fpos < phdrs_offset + phdrs_len) { + struct elf_phdr *phdrs, *phdr; - tsz = elf_buflen - *fpos; - if (buflen < tsz) - tsz = buflen; - elf_buf = kzalloc(elf_buflen, GFP_KERNEL); - if (!elf_buf) { + phdrs = kzalloc(phdrs_len, GFP_KERNEL); + if (!phdrs) { ret = -ENOMEM; goto out; } - elf_kcore_store_hdr(elf_buf, nphdr, elf_buflen); - if (copy_to_user(buffer, elf_buf + *fpos, tsz)) { - kfree(elf_buf); + + phdrs[0].p_type = PT_NOTE; + phdrs[0].p_offset = notes_offset; + phdrs[0].p_filesz = notes_len; + + phdr = &phdrs[1]; + list_for_each_entry(m, &kclist_head, list) { + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R | PF_W | PF_X; + phdr->p_offset = kc_vaddr_to_offset(m->addr) + data_offset; + phdr->p_vaddr = (size_t)m->addr; + if (m->type == KCORE_RAM) + phdr->p_paddr = __pa(m->addr); + else if (m->type == KCORE_TEXT) + phdr->p_paddr = __pa_symbol(m->addr); + else + phdr->p_paddr = (elf_addr_t)-1; + phdr->p_filesz = phdr->p_memsz = m->size; + phdr->p_align = PAGE_SIZE; + phdr++; + } + + tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos); + if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset, + tsz)) { + kfree(phdrs); ret = -EFAULT; goto out; } - kfree(elf_buf); + kfree(phdrs); + + buffer += tsz; buflen -= tsz; *fpos += tsz; - buffer += tsz; + } + + /* ELF note segment. */ + if (buflen && *fpos < notes_offset + notes_len) { + struct elf_prstatus prstatus = {}; + struct elf_prpsinfo prpsinfo = { + .pr_sname = 'R', + .pr_fname = "vmlinux", + }; + char *notes; + size_t i = 0; + + strlcpy(prpsinfo.pr_psargs, saved_command_line, + sizeof(prpsinfo.pr_psargs)); + + notes = kzalloc(notes_len, GFP_KERNEL); + if (!notes) { + ret = -ENOMEM; + goto out; + } + + append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo, + sizeof(prpsinfo)); + append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current, + arch_task_struct_size); - /* leave now if filled buffer already */ - if (buflen == 0) + tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos); + if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) { + kfree(notes); + ret = -EFAULT; goto out; + } + kfree(notes); + + buffer += tsz; + buflen -= tsz; + *fpos += tsz; } /* * Check to see if our file offset matches with any of * the addresses in the elf_phdr on our list. 
*/ - start = kc_offset_to_vaddr(*fpos - elf_buflen); + start = kc_offset_to_vaddr(*fpos - data_offset); if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) tsz = buflen; - - while (buflen) { - struct kcore_list *m; + while (buflen) { list_for_each_entry(m, &kclist_head, list) { if (start >= m->addr && start < (m->addr+m->size)) break; @@ -557,7 +490,6 @@ out: return orig_buflen - buflen; } - static int open_kcore(struct inode *inode, struct file *filp) { if (!capable(CAP_SYS_RAWIO)) -- cgit v1.2.3-59-g8ed1b From bf991c2231117d50a7645792b514354fc8d19dae Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Aug 2018 21:55:13 -0700 Subject: proc/kcore: optimize multiple page reads The current code does a full search of the segment list every time for every page. This is wasteful, since it's almost certain that the next page will be in the same segment. Instead, check if the previous segment covers the current page before doing the list search. Link: http://lkml.kernel.org/r/fd346c11090cf93d867e01b8d73a6567c5ac6361.1531953780.git.osandov@fb.com Signed-off-by: Omar Sandoval Cc: Alexey Dobriyan Cc: Bhupesh Sharma Cc: Eric Biederman Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 808ef9afd084..758c14e46a44 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -428,10 +428,18 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if ((tsz = (PAGE_SIZE - (start & ~PAGE_MASK))) > buflen) tsz = buflen; + m = NULL; while (buflen) { - list_for_each_entry(m, &kclist_head, list) { - if (start >= m->addr && start < (m->addr+m->size)) - break; + /* + * If this is the first iteration or the address is not within + * the previous entry, search for a matching entry. + */ + if (!m || start < m->addr || start >= m->addr + m->size) { + list_for_each_entry(m, &kclist_head, list) { + if (start >= m->addr && + start < m->addr + m->size) + break; + } } if (&m->list == &kclist_head) { -- cgit v1.2.3-59-g8ed1b From 23c85094fe1895caefdd19ef624ee687ec5f4507 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Tue, 21 Aug 2018 21:55:20 -0700 Subject: proc/kcore: add vmcoreinfo note to /proc/kcore The vmcoreinfo information is useful for runtime debugging tools, not just for crash dumps. A lot of this information can be determined by other means, but this is much more convenient, and it only adds a page at most to the file. Link: http://lkml.kernel.org/r/fddbcd08eed76344863303878b12de1c1e2a04b6.1531953780.git.osandov@fb.com Signed-off-by: Omar Sandoval Cc: Alexey Dobriyan Cc: Bhupesh Sharma Cc: Eric Biederman Cc: James Morse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/Kconfig | 1 + fs/proc/kcore.c | 18 ++++++++++++++++-- include/linux/crash_core.h | 2 ++ kernel/crash_core.c | 4 ++-- 4 files changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 0eaeb41453f5..817c02b13b1d 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -31,6 +31,7 @@ config PROC_FS config PROC_KCORE bool "/proc/kcore support" if !ARM depends on PROC_FS && MMU + select CRASH_CORE help Provides a virtual ELF core file of the live kernel. This can be read with gdb and other ELF tools. 
No modifications can be diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 758c14e46a44..80464432dfe6 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -10,6 +10,7 @@ * Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar */ +#include #include #include #include @@ -81,10 +82,13 @@ static size_t get_kcore_size(int *nphdr, size_t *phdrs_len, size_t *notes_len, } *phdrs_len = *nphdr * sizeof(struct elf_phdr); - *notes_len = (3 * (sizeof(struct elf_note) + ALIGN(sizeof(CORE_STR), 4)) + + *notes_len = (4 * sizeof(struct elf_note) + + 3 * ALIGN(sizeof(CORE_STR), 4) + + VMCOREINFO_NOTE_NAME_BYTES + ALIGN(sizeof(struct elf_prstatus), 4) + ALIGN(sizeof(struct elf_prpsinfo), 4) + - ALIGN(arch_task_struct_size, 4)); + ALIGN(arch_task_struct_size, 4) + + ALIGN(vmcoreinfo_size, 4)); *data_offset = PAGE_ALIGN(sizeof(struct elfhdr) + *phdrs_len + *notes_len); return *data_offset + size; @@ -406,6 +410,16 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) sizeof(prpsinfo)); append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current, arch_task_struct_size); + /* + * vmcoreinfo_size is mostly constant after init time, but it + * can be changed by crash_save_vmcoreinfo(). Racing here with a + * panic on another CPU before the machine goes down is insanely + * unlikely, but it's better to not leave potential buffer + * overflows lying around, regardless. + */ + append_kcore_note(notes, &i, VMCOREINFO_NOTE_NAME, 0, + vmcoreinfo_data, + min(vmcoreinfo_size, notes_len - i)); tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos); if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) { diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h index b511f6d24b42..525510a9f965 100644 --- a/include/linux/crash_core.h +++ b/include/linux/crash_core.h @@ -60,6 +60,8 @@ phys_addr_t paddr_vmcoreinfo_note(void); #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) +extern unsigned char *vmcoreinfo_data; +extern size_t vmcoreinfo_size; extern u32 *vmcoreinfo_note; Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, diff --git a/kernel/crash_core.c b/kernel/crash_core.c index e7b4025c7b24..a683bfb0530f 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -14,8 +14,8 @@ #include /* vmcoreinfo stuff */ -static unsigned char *vmcoreinfo_data; -static size_t vmcoreinfo_size; +unsigned char *vmcoreinfo_data; +size_t vmcoreinfo_size; u32 *vmcoreinfo_note; /* trusted vmcoreinfo, e.g. we can make a copy in the crash memory */ -- cgit v1.2.3-59-g8ed1b From ee8ef0a4b167c1f2605bc9b5701c603224673d53 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Aug 2018 21:56:26 -0700 Subject: epoll: use the waitqueue lock to protect ep->wq Patch series "waitqueue lockdep annotation", v3. This series adds a strategic lockdep_assert_held to __wake_up_common to ensure callers really do hold the wait_queue_head lock when calling the unlocked wake_up variants. It turns out epoll did not do this for a fairly common path (hit all the time by systemd during bootup), so the second patch fixed this instance as well. This patch (of 3): The epoll code currently uses the unlocked waitqueue helpers for managing ep->wq, but instead of holding the waitqueue lock around these calls, it uses its own ep->lock spinlock. Given that the waitqueue is not exposed to the rest of the kernel this actually works ok at the moment, but prevents the epoll locking rules from being enforced using lockdep. 
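For orientation, ep->wq is the wait queue that epoll_wait(2) callers sleep on, so the lock being reused here sits on a very hot user-visible path. A minimal, self-contained user of the interface (independent of this patch) looks like this:

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        int epfd = epoll_create1(0);
        int pipefd[2];
        struct epoll_event ev = { .events = EPOLLIN }, out;

        if (epfd < 0 || pipe(pipefd) < 0) {
                perror("setup");
                return 1;
        }
        ev.data.fd = pipefd[0];
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, pipefd[0], &ev) < 0) {
                perror("epoll_ctl");
                return 1;
        }
        if (write(pipefd[1], "x", 1) != 1)      /* make the read end ready */
                return 1;
        /* Internally this may park the caller on ep->wq -- the wait
         * queue whose embedded lock replaces ep->lock in this patch. */
        int n = epoll_wait(epfd, &out, 1, 1000);
        printf("epoll_wait returned %d event(s)\n", n);
        return 0;
}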
Remove ep->lock and use the waitqueue lock to not only reduce the size of struct eventpoll but also to make sure we can assert locking invariants in the waitqueue code. Link: http://lkml.kernel.org/r/20171214152344.6880-2-hch@lst.de Signed-off-by: Christoph Hellwig Reviewed-by: Jason Baron Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Al Viro Cc: Andrea Arcangeli Cc: Mike Rapoport Cc: Jason Baron Cc: Ingo Molnar Cc: Matthew Wilcox Cc: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 65 ++++++++++++++++++++++++++-------------------------------- 1 file changed, 29 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 67db22fe99c5..2737ef591b3e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -50,10 +50,10 @@ * * 1) epmutex (mutex) * 2) ep->mtx (mutex) - * 3) ep->lock (spinlock) + * 3) ep->wq.lock (spinlock) * * The acquire order is the one listed above, from 1 to 3. - * We need a spinlock (ep->lock) because we manipulate objects + * We need a spinlock (ep->wq.lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need @@ -85,7 +85,7 @@ * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global - * mutex "epmutex" (together with "ep->lock") to have it working, + * mutex "epmutex" (together with "ep->wq.lock") to have it working, * but having "ep->mtx" will make the interface more scalable. * Events that require holding "epmutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee @@ -182,11 +182,10 @@ struct epitem { * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. + * + * Access to it is protected by the lock inside wq. */ struct eventpoll { - /* Protect the access to this structure */ - spinlock_t lock; - /* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event @@ -210,7 +209,7 @@ struct eventpoll { /* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out - * holding ->lock. + * holding ->wq.lock. */ struct epitem *ovflist; @@ -688,17 +687,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * because we want the "sproc" callback to be able to do it * in a lockless way. */ - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->wq.lock, flags); list_splice_init(&ep->rdllist, &txlist); ep->ovflist = NULL; - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->wq.lock, flags); /* * Now call the callback function. */ res = (*sproc)(ep, &txlist, priv); - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->wq.lock, flags); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. 
@@ -740,7 +739,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, if (waitqueue_active(&ep->poll_wait)) pwake++; } - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->wq.lock, flags); if (!ep_locked) mutex_unlock(&ep->mtx); @@ -768,12 +767,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) struct file *file = epi->ffd.file; /* - * Removes poll wait queue hooks. We _have_ to do this without holding - * the "ep->lock" otherwise a deadlock might occur. This because of the - * sequence of the lock acquisition. Here we do "ep->lock" then the wait - * queue head lock when unregistering the wait queue. The wakeup callback - * will run by holding the wait queue head lock and will call our callback - * that will try to get "ep->lock". + * Removes poll wait queue hooks. */ ep_unregister_pollwait(ep, epi); @@ -784,10 +778,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) rb_erase_cached(&epi->rbn, &ep->rbr); - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->wq.lock, flags); if (ep_is_linked(&epi->rdllink)) list_del_init(&epi->rdllink); - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->wq.lock, flags); wakeup_source_unregister(ep_wakeup_source(epi)); /* @@ -837,7 +831,7 @@ static void ep_free(struct eventpoll *ep) * Walks through the whole tree by freeing each "struct epitem". At this * point we are sure no poll callbacks will be lingering around, and also by * holding "epmutex" we can be sure that no file cleanup code will hit - * us during this operation. So we can avoid the lock on "ep->lock". + * us during this operation. So we can avoid the lock on "ep->wq.lock". * We do not need to lock ep->mtx, either, we only do it to prevent * a lockdep warning. */ @@ -1017,7 +1011,6 @@ static int ep_alloc(struct eventpoll **pep) if (unlikely(!ep)) goto free_uid; - spin_lock_init(&ep->lock); mutex_init(&ep->mtx); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); @@ -1122,7 +1115,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v __poll_t pollflags = key_to_poll(key); int ewake = 0; - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->wq.lock, flags); ep_set_busy_poll_napi_id(epi); @@ -1199,7 +1192,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v pwake++; out_unlock: - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->wq.lock, flags); /* We have to call this outside the lock */ if (pwake) @@ -1484,7 +1477,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, goto error_remove_epi; /* We have to drop the new item inside our item list to keep track of it */ - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->wq.lock, flags); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); @@ -1501,7 +1494,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, pwake++; } - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->wq.lock, flags); atomic_long_inc(&ep->user->epoll_watches); @@ -1527,10 +1520,10 @@ error_unregister: * list, since that is used/cleaned only inside a section bound by "mtx". * And ep_insert() is called with "mtx" held. 
*/ - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irq(&ep->wq.lock); if (ep_is_linked(&epi->rdllink)) list_del_init(&epi->rdllink); - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irq(&ep->wq.lock); wakeup_source_unregister(ep_wakeup_source(epi)); @@ -1572,9 +1565,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, * 1) Flush epi changes above to other CPUs. This ensures * we do not miss events from ep_poll_callback if an * event occurs immediately after we call f_op->poll(). - * We need this because we did not take ep->lock while + * We need this because we did not take ep->wq.lock while * changing epi above (but ep_poll_callback does take - * ep->lock). + * ep->wq.lock). * * 2) We also need to ensure we do not miss _past_ events * when calling f_op->poll(). This barrier also @@ -1593,7 +1586,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { - spin_lock_irq(&ep->lock); + spin_lock_irq(&ep->wq.lock); if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); @@ -1604,7 +1597,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, if (waitqueue_active(&ep->poll_wait)) pwake++; } - spin_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->wq.lock); } /* We have to call this outside the lock */ @@ -1756,7 +1749,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, * caller specified a non blocking operation. */ timed_out = 1; - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irq(&ep->wq.lock); goto check_events; } @@ -1765,7 +1758,7 @@ fetch_events: if (!ep_events_available(ep)) ep_busy_loop(ep, timed_out); - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irq(&ep->wq.lock); if (!ep_events_available(ep)) { /* @@ -1807,11 +1800,11 @@ fetch_events: break; } - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irq(&ep->wq.lock); if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) timed_out = 1; - spin_lock_irqsave(&ep->lock, flags); + spin_lock_irq(&ep->wq.lock); } __remove_wait_queue(&ep->wq, &wait); @@ -1821,7 +1814,7 @@ check_events: /* Is it worth to try to dig for events ? */ eavail = ep_events_available(ep); - spin_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irq(&ep->wq.lock); /* * Try to transfer events to user space. In case we get 0 events and -- cgit v1.2.3-59-g8ed1b From c430d1e848ff1240d126e79780f3c26208b8aed9 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 21 Aug 2018 21:56:30 -0700 Subject: userfaultfd: use fault_wqh lock The userfaultfd code currently uses the unlocked waitqueue helpers for managing fault_wqh, but instead of holding the waitqueue lock for this waitqueue around these calls, it uses the waitqueue lock of fault_pending_wqh, which is a different waitqueue instance. Given that the waitqueue is not exposed to the rest of the kernel this actually works ok at the moment, but prevents the userfaultfd locking rules from being enforced using lockdep. Switch to the internally locked waitqueue helpers instead. This means that the lock inside fault_wqh now nests inside the fault_pending_wqh lock, but that's not a problem since it was entirely unused before.
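For context, fault_wqh is where faulting tasks wait while a userfaultfd monitor resolves their fault, and the UFFDIO_COPY completion is what wakes them through the path this patch relocks. A heavily trimmed sketch of that user-space side, adapted from the userfaultfd(2) interface with most error handling omitted (and assuming the kernel permits unprivileged userfaultfd and glibc defines SYS_userfaultfd), is:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

static long page_size;

static void *fault_handler(void *arg)
{
        int uffd = (int)(long)arg;
        struct uffd_msg msg;
        char *staging = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        /* The faulting thread sleeps on the kernel-side wait queues
         * (fault_pending_wqh, then fault_wqh) while we work. */
        if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
                exit(1);
        memset(staging, 'A', page_size);

        struct uffdio_copy copy = {
                .src = (unsigned long)staging,
                .dst = (unsigned long)msg.arg.pagefault.address
                       & ~(page_size - 1),
                .len = page_size,
        };
        /* Resolving the fault is what wakes the sleeper on fault_wqh. */
        if (ioctl(uffd, UFFDIO_COPY, &copy) == -1)
                exit(1);
        return NULL;
}

int main(void)
{
        page_size = sysconf(_SC_PAGE_SIZE);

        int uffd = syscall(SYS_userfaultfd, O_CLOEXEC);
        struct uffdio_api api = { .api = UFFD_API };
        char *region = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        struct uffdio_register reg = {
                .range = { .start = (unsigned long)region, .len = page_size },
                .mode = UFFDIO_REGISTER_MODE_MISSING,
        };
        pthread_t thr;

        if (uffd == -1 || ioctl(uffd, UFFDIO_API, &api) == -1 ||
            ioctl(uffd, UFFDIO_REGISTER, &reg) == -1)
                return 1;
        pthread_create(&thr, NULL, fault_handler, (void *)(long)uffd);

        printf("first byte: %c\n", region[0]);  /* triggers the fault */
        pthread_join(thr, NULL);
        return 0;
}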
[hch@lst.de: slight changelog updates] [rppt@linux.vnet.ibm.com: spotted changelog spellos] Link: http://lkml.kernel.org/r/20171214152344.6880-3-hch@lst.de Signed-off-by: Matthew Wilcox Signed-off-by: Christoph Hellwig Reviewed-by: Mike Rapoport Cc: Al Viro Cc: Andrea Arcangeli Cc: Ingo Molnar Cc: Jason Baron Cc: Peter Zijlstra Cc: Davidlohr Bueso Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 15c265d450bf..f649023b19b5 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -910,7 +910,7 @@ wakeup: */ spin_lock(&ctx->fault_pending_wqh.lock); __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range); - __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range); + __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range); spin_unlock(&ctx->fault_pending_wqh.lock); /* Flush pending events that may still wait on event_wqh */ @@ -1066,7 +1066,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, * anyway. */ list_del(&uwq->wq.entry); - __add_wait_queue(&ctx->fault_wqh, &uwq->wq); + add_wait_queue(&ctx->fault_wqh, &uwq->wq); write_seqcount_end(&ctx->refile_seq); @@ -1215,7 +1215,7 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx, __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, range); if (waitqueue_active(&ctx->fault_wqh)) - __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range); + __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, range); spin_unlock(&ctx->fault_pending_wqh.lock); } -- cgit v1.2.3-59-g8ed1b From 002b343669c474151954266e7fcf727bf7faa851 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Aug 2018 21:56:38 -0700 Subject: fs/epoll: loosen irq safety in ep_scan_ready_list() Patch series "fs/epoll: loosen irq safety when possible". Both patches replace saving+restoring interrupts when taking the ep->lock (now the waitqueue lock), with just disabling local irqs. This shows immediate performance benefits in patch 1 for an epoll workload running on Xen. The main concern we need to have with this sort of changes in epoll is the ep_poll_callback() which is passed to the wait queue wakeup and is done very often under irq context, this patch does not touch this call. Patches have been tested pretty heavily with the customer workload, microbenchmarks, ltp testcases and two high level workloads that use epoll under the hood: nginx and libevent benchmarks. This patch (of 2): Saving and restoring interrupts in ep_scan_ready_list() is an overkill as it is never called with interrupts disabled. Loosen this to simply disabling local irqs such that archs where managing irqs is expensive or virtual environments. This patch yields some throughput improvements on a workload that is epoll intensive running on a single Xen DomU. 1 Job 7500 --> 8800 enq/s (+17%) 2 Jobs 14000 --> 15200 enq/s (+8%) 3 Jobs 20500 --> 22300 enq/s (+8%) 4 Jobs 25000 --> 28000 enq/s (+8-12)% On bare metal: For a 2-socket 40-core (ht) IvyBridge on a few workloads, unfortunately I don't have a xen environment and the results for Xen I do have (which numbers are in patch 1) I don't have the actual workload, so cannot compare them directly. 
1) Different configurations were used for a epoll_wait (pipes io) microbench (http://linux-scalability.org/epoll/epoll-test.c) and shows around a 7-10% improvement in overall total number of times the epoll_wait() loops when using both regular and nested epolls, so very raw numbers, but measurable nonetheless. # threads vanilla dirty 1 1677717 1805587 2 1660510 1854064 4 1610184 1805484 8 1577696 1751222 16 1568837 1725299 32 1291532 1378463 64 752584 787368 Note that stddev is pretty small. 2) Another pipe test, which shows no real measurable improvement. (http://www.xmailserver.org/linux-patches/pipetest.c) Link: http://lkml.kernel.org/r/20180720172956.2883-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Jason Baron Cc: Al Viro Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2737ef591b3e..2247769eb941 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -667,7 +667,6 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, { __poll_t res; int pwake = 0; - unsigned long flags; struct epitem *epi, *nepi; LIST_HEAD(txlist); @@ -687,17 +686,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * because we want the "sproc" callback to be able to do it * in a lockless way. */ - spin_lock_irqsave(&ep->wq.lock, flags); + spin_lock_irq(&ep->wq.lock); list_splice_init(&ep->rdllist, &txlist); ep->ovflist = NULL; - spin_unlock_irqrestore(&ep->wq.lock, flags); + spin_unlock_irq(&ep->wq.lock); /* * Now call the callback function. */ res = (*sproc)(ep, &txlist, priv); - spin_lock_irqsave(&ep->wq.lock, flags); + spin_lock_irq(&ep->wq.lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. @@ -739,7 +738,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, if (waitqueue_active(&ep->poll_wait)) pwake++; } - spin_unlock_irqrestore(&ep->wq.lock, flags); + spin_unlock_irq(&ep->wq.lock); if (!ep_locked) mutex_unlock(&ep->mtx); -- cgit v1.2.3-59-g8ed1b From 304b18b8d6af796c8ece221d34c92aeb1559789b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Aug 2018 21:56:41 -0700 Subject: fs/epoll: loosen irq safety in epoll_insert() and epoll_remove() Both functions are similar to the context of ep_modify(), called via epoll_ctl(2). Just like ep_modify(), saving and restoring interrupts is an overkill in these calls as it will never be called with irqs disabled. While ep_remove() can be called directly from EPOLL_CTL_DEL, it can also be called when releasing the file, but this also complies with the above. 
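The pipe-based microbenchmarks cited in this series are external, so for readers who want to reproduce their flavour, a toy epoll_wait() loop counter might look like the sketch below (the five-second window and single-pipe layout are invented for the example; the real tests drive many descriptors and threads):

#include <stdio.h>
#include <sys/epoll.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
        int epfd = epoll_create1(0);
        int fds[2];
        struct epoll_event ev = { .events = EPOLLIN }, out;
        char c = 'x';
        long loops = 0;
        struct timespec start, now;

        if (epfd < 0 || pipe(fds) < 0)
                return 1;
        ev.data.fd = fds[0];
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, fds[0], &ev) < 0)
                return 1;

        clock_gettime(CLOCK_MONOTONIC, &start);
        do {
                if (write(fds[1], &c, 1) != 1)          /* arm the event */
                        return 1;
                epoll_wait(epfd, &out, 1, -1);          /* exercises ep_scan_ready_list() */
                if (read(fds[0], &c, 1) != 1)           /* disarm again */
                        return 1;
                loops++;
                clock_gettime(CLOCK_MONOTONIC, &now);
        } while (now.tv_sec - start.tv_sec < 5);
        printf("%ld epoll_wait() loops in ~5s\n", loops);
        return 0;
}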
Link: http://lkml.kernel.org/r/20180720172956.2883-3-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Jason Baron Cc: Al Viro Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2247769eb941..1b1abc461fc0 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -762,7 +762,6 @@ static void epi_rcu_free(struct rcu_head *head) */ static int ep_remove(struct eventpoll *ep, struct epitem *epi) { - unsigned long flags; struct file *file = epi->ffd.file; /* @@ -777,10 +776,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) rb_erase_cached(&epi->rbn, &ep->rbr); - spin_lock_irqsave(&ep->wq.lock, flags); + spin_lock_irq(&ep->wq.lock); if (ep_is_linked(&epi->rdllink)) list_del_init(&epi->rdllink); - spin_unlock_irqrestore(&ep->wq.lock, flags); + spin_unlock_irq(&ep->wq.lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* @@ -1409,7 +1408,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, { int error, pwake = 0; __poll_t revents; - unsigned long flags; long user_watches; struct epitem *epi; struct ep_pqueue epq; @@ -1476,7 +1474,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, goto error_remove_epi; /* We have to drop the new item inside our item list to keep track of it */ - spin_lock_irqsave(&ep->wq.lock, flags); + spin_lock_irq(&ep->wq.lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); @@ -1493,7 +1491,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, pwake++; } - spin_unlock_irqrestore(&ep->wq.lock, flags); + spin_unlock_irq(&ep->wq.lock); atomic_long_inc(&ep->user->epoll_watches); @@ -1519,10 +1517,10 @@ error_unregister: * list, since that is used/cleaned only inside a section bound by "mtx". * And ep_insert() is called with "mtx" held. */ - spin_lock_irqsave(&ep->wq.lock, flags); + spin_lock_irq(&ep->wq.lock); if (ep_is_linked(&epi->rdllink)) list_del_init(&epi->rdllink); - spin_unlock_irqrestore(&ep->wq.lock, flags); + spin_unlock_irq(&ep->wq.lock); wakeup_source_unregister(ep_wakeup_source(epi)); -- cgit v1.2.3-59-g8ed1b From 92e641784055998879942d39c74d4f84fa750968 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Aug 2018 21:56:45 -0700 Subject: s/epoll: robustify irq safety with lockdep_assert_irqs_enabled() Sprinkle lockdep_assert_irqs_enabled() checks in the functions that do not save and restore interrupts when dealing with the ep->wq.lock. These are ep_scan_ready_list() and those called by epoll_ctl(): ep_insert, ep_modify and ep_remove. [akpm@linux-foundation.org: remove too-obvious comments] Link: http://lkml.kernel.org/r/20180721183127.3busfa335zlcjeox@linux-r8p5 Signed-off-by: Davidlohr Bueso Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1b1abc461fc0..58b96d8a898a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -670,6 +670,8 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, struct epitem *epi, *nepi; LIST_HEAD(txlist); + lockdep_assert_irqs_enabled(); + /* * We need to lock this because we could be hit by * eventpoll_release_file() and epoll_ctl(). 
@@ -764,6 +766,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) { struct file *file = epi->ffd.file; + lockdep_assert_irqs_enabled(); + /* * Removes poll wait queue hooks. */ @@ -1412,6 +1416,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, struct epitem *epi; struct ep_pqueue epq; + lockdep_assert_irqs_enabled(); + user_watches = atomic_long_read(&ep->user->epoll_watches); if (unlikely(user_watches >= max_user_watches)) return -ENOSPC; @@ -1540,6 +1546,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, int pwake = 0; poll_table pt; + lockdep_assert_irqs_enabled(); + init_poll_funcptr(&pt, NULL); /* -- cgit v1.2.3-59-g8ed1b From 514056d506e44084369f5ce1c8186e4253901a05 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Aug 2018 21:58:19 -0700 Subject: fs/eventpoll.c: simply CONFIG_NET_RX_BUSY_POLL ifdefery ... 'tis easier on the eye. [akpm@linux-foundation.org: use inlines rather than macros] Link: http://lkml.kernel.org/r/20180725185620.11020-1-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Cc: Jason Baron Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 58b96d8a898a..b5e43e11f1e3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -391,7 +391,6 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time) return ep_events_available(ep) || busy_loop_timeout(start_time); } -#endif /* CONFIG_NET_RX_BUSY_POLL */ /* * Busy poll if globally on and supporting sockets found && no events, @@ -401,20 +400,16 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time) */ static void ep_busy_loop(struct eventpoll *ep, int nonblock) { -#ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(ep->napi_id); if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep); -#endif } static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) { -#ifdef CONFIG_NET_RX_BUSY_POLL if (ep->napi_id) ep->napi_id = 0; -#endif } /* @@ -422,7 +417,6 @@ static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) */ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { -#ifdef CONFIG_NET_RX_BUSY_POLL struct eventpoll *ep; unsigned int napi_id; struct socket *sock; @@ -452,9 +446,24 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) /* record NAPI ID for use in next busy poll */ ep->napi_id = napi_id; -#endif } +#else + +static inline void ep_busy_loop(struct eventpoll *ep, int nonblock) +{ +} + +static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) +{ +} + +static inline void ep_set_busy_poll_napi_id(struct epitem *epi) +{ +} + +#endif /* CONFIG_NET_RX_BUSY_POLL */ + /** * ep_call_nested - Perform a bound (possibly) nested call, by checking * that the recursion limit is not exceeded, and that -- cgit v1.2.3-59-g8ed1b From 679abf381a18e945457b01921f667cee9e656a7f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Aug 2018 21:58:23 -0700 Subject: fs/eventpoll.c: loosen irq safety in ep_poll() Similar to other calls, ep_poll() is not called with interrupts disabled, and we can therefore avoid the irq save/restore dance and just disable local irqs. In fact, the call should never be called in irq context at all, considering that the only path is epoll_wait(2) -> do_epoll_wait() -> ep_poll(). 
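A userspace analogue of that kind of calling-context assertion, using plain assert() and an invented thread-local flag in place of real interrupt state (lockdep has no direct userspace equivalent, so this is illustrative only), would be:

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static _Thread_local bool irqs_disabled_sim;    /* stand-in for local irq state */

/* Like lockdep_assert_irqs_enabled(): state the precondition in code
 * so debug builds catch a caller in the wrong context immediately,
 * instead of deadlocking much later. */
static void poll_like_path(void)
{
        assert(!irqs_disabled_sim);
        pthread_mutex_lock(&lock);
        /* ... work that is only safe with "irqs" enabled ... */
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        poll_like_path();       /* fine: the precondition holds */
        return 0;
}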
When running on a 2 socket 40-core (ht) IvyBridge a common pipe based epoll_wait(2) microbenchmark, the following performance improvements are seen: # threads vanilla dirty 1 1805587 2106412 2 1854064 2090762 4 1805484 2017436 8 1751222 1974475 16 1725299 1962104 32 1378463 1571233 64 787368 900784 Which is a pretty constantly near 15%. Also add a lockdep check such that we detect any mischief before deadlocking. Link: http://lkml.kernel.org/r/20180727053432.16679-2-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Peter Zijlstra Cc: Jason Baron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index b5e43e11f1e3..88473e6271ef 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1746,11 +1746,12 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, long timeout) { int res = 0, eavail, timed_out = 0; - unsigned long flags; u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; + lockdep_assert_irqs_enabled(); + if (timeout > 0) { struct timespec64 end_time = ep_set_mstimeout(timeout); @@ -1763,7 +1764,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, * caller specified a non blocking operation. */ timed_out = 1; - spin_lock_irqsave(&ep->wq.lock, flags); + spin_lock_irq(&ep->wq.lock); goto check_events; } @@ -1772,7 +1773,7 @@ fetch_events: if (!ep_events_available(ep)) ep_busy_loop(ep, timed_out); - spin_lock_irqsave(&ep->wq.lock, flags); + spin_lock_irq(&ep->wq.lock); if (!ep_events_available(ep)) { /* @@ -1814,11 +1815,11 @@ fetch_events: break; } - spin_unlock_irqrestore(&ep->wq.lock, flags); + spin_unlock_irq(&ep->wq.lock); if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) timed_out = 1; - spin_lock_irqsave(&ep->wq.lock, flags); + spin_lock_irq(&ep->wq.lock); } __remove_wait_queue(&ep->wq, &wait); @@ -1828,7 +1829,7 @@ check_events: /* Is it worth to try to dig for events ? */ eavail = ep_events_available(ep); - spin_unlock_irqrestore(&ep->wq.lock, flags); + spin_unlock_irq(&ep->wq.lock); /* * Try to transfer events to user space. In case we get 0 events and -- cgit v1.2.3-59-g8ed1b From 992991c03ca033f779aae1afb5b0c27bcb7139c3 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 21 Aug 2018 21:58:26 -0700 Subject: fs/eventpoll.c: simplify ep_is_linked() callers Instead of having each caller pass the rdllink explicitly, just have ep_is_linked() pass it while the callers just need the epi pointer. This helper is all about the rdllink, and this change, furthermore, improves the function's self documentation. 
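The pattern generalizes to any intrusive list: have the linked test take the containing object so callers stop reaching into its members. A freestanding toy version, where list_node and item are invented stand-ins for the kernel's list_head and epitem, is:

#include <stdbool.h>
#include <stdio.h>

/* Toy intrusive list node, in the style of the kernel's list_head. */
struct list_node {
        struct list_node *next, *prev;
};

struct item {
        int value;
        struct list_node link;          /* plays the role of epi->rdllink */
};

static void node_init(struct list_node *n)
{
        n->next = n->prev = n;          /* unlinked == points to itself */
}

/* Taking the container rather than the embedded node mirrors the
 * reworked ep_is_linked(epi): callers no longer need to know which
 * member encodes "on the ready list". */
static bool item_is_linked(const struct item *it)
{
        return it->link.next != &it->link;
}

int main(void)
{
        struct item it = { .value = 42 };

        node_init(&it.link);
        printf("linked: %d\n", item_is_linked(&it));    /* prints 0 */
        return 0;
}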
Link: http://lkml.kernel.org/r/20180727053432.16679-3-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Reviewed-by: Andrew Morton Cc: Al Viro Cc: Jason Baron Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 88473e6271ef..42bbe6824b4b 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -336,9 +336,9 @@ static inline int ep_cmp_ffd(struct epoll_filefd *p1, } /* Tells us if the item is currently linked */ -static inline int ep_is_linked(struct list_head *p) +static inline int ep_is_linked(struct epitem *epi) { - return !list_empty(p); + return !list_empty(&epi->rdllink); } static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) @@ -721,7 +721,7 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. */ - if (!ep_is_linked(&epi->rdllink)) { + if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } @@ -790,7 +790,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) rb_erase_cached(&epi->rbn, &ep->rbr); spin_lock_irq(&ep->wq.lock); - if (ep_is_linked(&epi->rdllink)) + if (ep_is_linked(epi)) list_del_init(&epi->rdllink); spin_unlock_irq(&ep->wq.lock); @@ -1171,7 +1171,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v } /* If this file is already in the ready list we exit soon */ - if (!ep_is_linked(&epi->rdllink)) { + if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake_rcu(epi); } @@ -1495,7 +1495,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, ep_set_busy_poll_napi_id(epi); /* If the file is already "ready" we drop it inside the ready list */ - if (revents && !ep_is_linked(&epi->rdllink)) { + if (revents && !ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); @@ -1533,7 +1533,7 @@ error_unregister: * And ep_insert() is called with "mtx" held. */ spin_lock_irq(&ep->wq.lock); - if (ep_is_linked(&epi->rdllink)) + if (ep_is_linked(epi)) list_del_init(&epi->rdllink); spin_unlock_irq(&ep->wq.lock); @@ -1601,7 +1601,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, */ if (ep_item_poll(epi, &pt, 1)) { spin_lock_irq(&ep->wq.lock); - if (!ep_is_linked(&epi->rdllink)) { + if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); -- cgit v1.2.3-59-g8ed1b From d4d79b8195bfc6d5d8f82f9189c1bc828cc7e03a Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 21 Aug 2018 21:58:41 -0700 Subject: autofs: fix directory and symlink access Depending on how it is configured the autofs user space daemon can leave in use mounts mounted at exit and re-connect to them at start up. But for this to work best the state of the autofs file system needs to be left intact over the restart. Also, at system shutdown, mounts in an autofs file system might be umounted exposing a mount point trigger for which subsequent access can lead to a hang. So recent versions of automount(8) now does its best to set autofs file system mounts catatonic at shutdown. When autofs file system mounts are catatonic it's currently possible to create and remove directories and symlinks which can be a problem at restart, as described above. 
So return EACCES in the directory, symlink and unlink methods if the autofs file system is catatonic. Link: http://lkml.kernel.org/r/152902119090.4144.9561910674530214291.stgit@pluto.themaw.net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/root.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/autofs/root.c b/fs/autofs/root.c index a3d414150578..782e57b911ab 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -559,6 +559,13 @@ static int autofs_dir_symlink(struct inode *dir, if (!autofs_oz_mode(sbi)) return -EACCES; + /* autofs_oz_mode() needs to allow path walks when the + * autofs mount is catatonic but the state of an autofs + * file system needs to be preserved over restarts. + */ + if (sbi->catatonic) + return -EACCES; + BUG_ON(!ino); autofs_clean_ino(ino); @@ -612,9 +619,15 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; - /* This allows root to remove symlinks */ - if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) - return -EPERM; + if (!autofs_oz_mode(sbi)) + return -EACCES; + + /* autofs_oz_mode() needs to allow path walks when the + * autofs mount is catatonic but the state of an autofs + * file system needs to be preserved over restarts. + */ + if (sbi->catatonic) + return -EACCES; if (atomic_dec_and_test(&ino->count)) { p_ino = autofs_dentry_ino(dentry->d_parent); @@ -697,6 +710,13 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) if (!autofs_oz_mode(sbi)) return -EACCES; + /* autofs_oz_mode() needs to allow path walks when the + * autofs mount is catatonic but the state of an autofs + * file system needs to be preserved over restarts. + */ + if (sbi->catatonic) + return -EACCES; + spin_lock(&sbi->lookup_lock); if (!simple_empty(dentry)) { spin_unlock(&sbi->lookup_lock); @@ -735,6 +755,13 @@ static int autofs_dir_mkdir(struct inode *dir, if (!autofs_oz_mode(sbi)) return -EACCES; + /* autofs_oz_mode() needs to allow path walks when the + * autofs mount is catatonic but the state of an autofs + * file system needs to be preserved over restarts. + */ + if (sbi->catatonic) + return -EACCES; + pr_debug("dentry %p, creating %pd\n", dentry, dentry); BUG_ON(!ino); -- cgit v1.2.3-59-g8ed1b From 2fd9944f0fd41f7bc2f590169a9a758e1186b345 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 21 Aug 2018 21:58:44 -0700 Subject: autofs: fix inconsistent use of now variable The global variable "now" in fs/autofs/expire.c is used in an inconsistent way, sometimes using jiffies directly, and sometimes using the "now" variable, and setting it isn't done consistently either. But the autofs dentry info last_used field is only updated during path walks or during expire so jiffies can be used directly and the global variable "now" removed. 
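As a minimal sketch of the idiom the patch moves to (time_after() is the wraparound-safe comparison from <linux/jiffies.h>; the helper below is illustrative, not part of the patch):

	#include <linux/types.h>
	#include <linux/jiffies.h>

	/* Not yet expirable: no timeout is configured, or a full
	 * timeout has not elapsed since last_used. Comparing against
	 * jiffies directly avoids a stale cached "now". */
	static bool example_too_young(unsigned long last_used,
				      unsigned long timeout)
	{
		return !timeout || time_after(last_used + timeout, jiffies);
	}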
Link: http://lkml.kernel.org/r/152937731702.21213.7371321165189170865.stgit@pluto.themaw.net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/expire.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index b332d3f6e730..295feec10ea6 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -10,8 +10,6 @@ #include "autofs_i.h" -static unsigned long now; - /* Check if a dentry can be expired */ static inline int autofs_can_expire(struct dentry *dentry, unsigned long timeout, int do_now) @@ -24,7 +22,7 @@ static inline int autofs_can_expire(struct dentry *dentry, if (!do_now) { /* Too young to die */ - if (!timeout || time_after(ino->last_used + timeout, now)) + if (!timeout || time_after(ino->last_used + timeout, jiffies)) return 0; } return 1; @@ -307,7 +305,6 @@ struct dentry *autofs_expire_direct(struct super_block *sb, if (!root) return NULL; - now = jiffies; timeout = sbi->exp_timeout; if (!autofs_direct_busy(mnt, root, timeout, do_now)) { @@ -442,7 +439,6 @@ struct dentry *autofs_expire_indirect(struct super_block *sb, if (!root) return NULL; - now = jiffies; timeout = sbi->exp_timeout; dentry = NULL; @@ -575,7 +571,7 @@ int autofs_expire_run(struct super_block *sb, spin_lock(&sbi->fs_lock); ino = autofs_dentry_ino(dentry); /* avoid rapid-fire expire attempts if expiry fails */ - ino->last_used = now; + ino->last_used = jiffies; ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); @@ -605,7 +601,7 @@ int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, spin_lock(&sbi->fs_lock); /* avoid rapid-fire expire attempts if expiry fails */ - ino->last_used = now; + ino->last_used = jiffies; ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); -- cgit v1.2.3-59-g8ed1b From d1055565bdc27fd4dc90a91988b938b949662025 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 21 Aug 2018 21:58:48 -0700 Subject: autofs: fix clearing AUTOFS_EXP_LEAVES in autofs_expire_indirect() The expire flag AUTOFS_EXP_LEAVES is cleared before the second call to should_expire() in autofs_expire_indirect() but the parameter passed in the second call is incorrect. Fortunately AUTOFS_EXP_LEAVES expire flag has not been used for a long time but might be needed in the future so fix it rather than remove the expire leaves functionality. Link: http://lkml.kernel.org/r/152937732410.21213.7447294898147765076.stgit@pluto.themaw.net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/expire.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 295feec10ea6..41855cdc5630 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -467,7 +467,7 @@ struct dentry *autofs_expire_indirect(struct super_block *sb, * things have changed. 
*/ flags &= ~AUTOFS_EXP_LEAVES; - found = should_expire(expired, mnt, timeout, how); + found = should_expire(expired, mnt, timeout, flags); if (!found || found != expired) /* Something has changed, continue */ goto next; -- cgit v1.2.3-59-g8ed1b From 5d30517d67e349164838ad039f0f2d09dde15f59 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 21 Aug 2018 21:58:51 -0700 Subject: autofs: make autofs_expire_direct() static autofs_expire_direct() isn't used outside of fs/autofs/expire.c so make it static. Link: http://lkml.kernel.org/r/152937732944.21213.11821977712410930973.stgit@pluto.themaw.net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/autofs_i.h | 3 --- fs/autofs/expire.c | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 5057b9f0f846..fe1b62cfc769 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -156,9 +156,6 @@ int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int when); int autofs_expire_multi(struct super_block *, struct vfsmount *, struct autofs_sb_info *, int __user *); -struct dentry *autofs_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, int how); struct dentry *autofs_expire_indirect(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int how); diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 41855cdc5630..64e6eba2c628 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -292,10 +292,10 @@ static struct dentry *autofs_check_leaves(struct vfsmount *mnt, } /* Check if we can expire a direct mount (possibly a tree) */ -struct dentry *autofs_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +static struct dentry *autofs_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = dget(sb->s_root); -- cgit v1.2.3-59-g8ed1b From 571bc35c42f3455bc55393f22cb97f7407a5a6d1 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 21 Aug 2018 21:58:54 -0700 Subject: autofs: make autofs_expire_indirect() static autofs_expire_indirect() isn't used outside of fs/autofs/expire.c so make it static. 
Link: http://lkml.kernel.org/r/152937733512.21213.10509996499623738446.stgit@pluto.themaw.net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/autofs_i.h | 3 --- fs/autofs/expire.c | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index fe1b62cfc769..f53ca4bd5365 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -156,9 +156,6 @@ int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int when); int autofs_expire_multi(struct super_block *, struct vfsmount *, struct autofs_sb_info *, int __user *); -struct dentry *autofs_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, int how); /* Device node initialization */ diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 64e6eba2c628..5fb47a4ca48c 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -424,10 +424,10 @@ static struct dentry *should_expire(struct dentry *dentry, * - it is unused by any user process * - it has been unused for exp_timeout time */ -struct dentry *autofs_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +static struct dentry *autofs_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = sb->s_root; -- cgit v1.2.3-59-g8ed1b From e5c85e1fe19c03777cbacf4b5a5167b2f5ff90fb Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 21 Aug 2018 21:58:58 -0700 Subject: autofs: make expire flags usage consistent with v5 params Make the expire flag usage consistent by naming the flags the same way they are named in the version 5 miscellaneous ioctl parameters, and only check the bit flags when needed.
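The style change, in a minimal sketch (flag names are from include/uapi/linux/auto_fs.h; the real conversions are in the diff below):

	/* Before: flags decoded into throwaway locals up front. */
	int do_now = how & AUTOFS_EXP_IMMEDIATE;
	int exp_leaves = how & AUTOFS_EXP_LEAVES;

	/* After: pass "how" through and test the bit where it matters. */
	if (!(how & AUTOFS_EXP_IMMEDIATE)) {
		/* too young to die ... */
	}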
Link: http://lkml.kernel.org/r/152937734046.21213.9454131988766280028.stgit@pluto.themaw.net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/autofs_i.h | 2 +- fs/autofs/expire.c | 61 ++++++++++++++++++++++++---------------------------- 2 files changed, 29 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index f53ca4bd5365..633986a6a93a 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -153,7 +153,7 @@ int autofs_expire_run(struct super_block *, struct vfsmount *, struct autofs_sb_info *, struct autofs_packet_expire __user *); int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int when); + struct autofs_sb_info *sbi, unsigned int how); int autofs_expire_multi(struct super_block *, struct vfsmount *, struct autofs_sb_info *, int __user *); diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index 5fb47a4ca48c..dfb666c5b8a2 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -12,7 +12,7 @@ /* Check if a dentry can be expired */ static inline int autofs_can_expire(struct dentry *dentry, - unsigned long timeout, int do_now) + unsigned long timeout, unsigned int how) { struct autofs_info *ino = autofs_dentry_ino(dentry); @@ -20,7 +20,7 @@ static inline int autofs_can_expire(struct dentry *dentry, if (ino == NULL) return 0; - if (!do_now) { + if (!(how & AUTOFS_EXP_IMMEDIATE)) { /* Too young to die */ if (!timeout || time_after(ino->last_used + timeout, jiffies)) return 0; @@ -185,7 +185,7 @@ again: static int autofs_direct_busy(struct vfsmount *mnt, struct dentry *top, unsigned long timeout, - int do_now) + unsigned int how) { pr_debug("top %p %pd\n", top, top); @@ -200,7 +200,7 @@ static int autofs_direct_busy(struct vfsmount *mnt, } /* Timeout of a direct mount is determined by its top dentry */ - if (!autofs_can_expire(top, timeout, do_now)) + if (!autofs_can_expire(top, timeout, how)) return 1; return 0; @@ -213,7 +213,7 @@ static int autofs_direct_busy(struct vfsmount *mnt, static int autofs_tree_busy(struct vfsmount *mnt, struct dentry *top, unsigned long timeout, - int do_now) + unsigned int how) { struct autofs_info *top_ino = autofs_dentry_ino(top); struct dentry *p; @@ -259,7 +259,7 @@ static int autofs_tree_busy(struct vfsmount *mnt, } /* Timeout of a tree mount is ultimately determined by its top dentry */ - if (!autofs_can_expire(top, timeout, do_now)) + if (!autofs_can_expire(top, timeout, how)) return 1; return 0; @@ -268,7 +268,7 @@ static int autofs_tree_busy(struct vfsmount *mnt, static struct dentry *autofs_check_leaves(struct vfsmount *mnt, struct dentry *parent, unsigned long timeout, - int do_now) + unsigned int how) { struct dentry *p; @@ -284,7 +284,7 @@ static struct dentry *autofs_check_leaves(struct vfsmount *mnt, continue; /* Can we expire this guy */ - if (autofs_can_expire(p, timeout, do_now)) + if (autofs_can_expire(p, timeout, how)) return p; } } @@ -295,19 +295,18 @@ static struct dentry *autofs_check_leaves(struct vfsmount *mnt, static struct dentry *autofs_expire_direct(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, - int how) + unsigned int how) { - unsigned long timeout; struct dentry *root = dget(sb->s_root); - int do_now = how & AUTOFS_EXP_IMMEDIATE; struct autofs_info *ino; + unsigned long timeout; if (!root) return NULL; timeout = sbi->exp_timeout; - if (!autofs_direct_busy(mnt, root, timeout, do_now)) { + if (!autofs_direct_busy(mnt, root, timeout, 
how)) { spin_lock(&sbi->fs_lock); ino = autofs_dentry_ino(root); /* No point expiring a pending mount */ @@ -318,7 +317,7 @@ static struct dentry *autofs_expire_direct(struct super_block *sb, ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - if (!autofs_direct_busy(mnt, root, timeout, do_now)) { + if (!autofs_direct_busy(mnt, root, timeout, how)) { spin_lock(&sbi->fs_lock); ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); @@ -343,10 +342,8 @@ out: static struct dentry *should_expire(struct dentry *dentry, struct vfsmount *mnt, unsigned long timeout, - int how) + unsigned int how) { - int do_now = how & AUTOFS_EXP_IMMEDIATE; - int exp_leaves = how & AUTOFS_EXP_LEAVES; struct autofs_info *ino = autofs_dentry_ino(dentry); unsigned int ino_count; @@ -368,7 +365,7 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; /* Can we expire this guy */ - if (autofs_can_expire(dentry, timeout, do_now)) + if (autofs_can_expire(dentry, timeout, how)) return dentry; return NULL; } @@ -379,7 +376,7 @@ static struct dentry *should_expire(struct dentry *dentry, * A symlink can't be "busy" in the usual sense so * just check last used for expire timeout. */ - if (autofs_can_expire(dentry, timeout, do_now)) + if (autofs_can_expire(dentry, timeout, how)) return dentry; return NULL; } @@ -388,13 +385,13 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; /* Case 2: tree mount, expire iff entire tree is not busy */ - if (!exp_leaves) { + if (!(how & AUTOFS_EXP_LEAVES)) { /* Path walk currently on this dentry? */ ino_count = atomic_read(&ino->count) + 1; if (d_count(dentry) > ino_count) return NULL; - if (!autofs_tree_busy(mnt, dentry, timeout, do_now)) + if (!autofs_tree_busy(mnt, dentry, timeout, how)) return dentry; /* * Case 3: pseudo direct mount, expire individual leaves @@ -408,7 +405,7 @@ static struct dentry *should_expire(struct dentry *dentry, if (d_count(dentry) > ino_count) return NULL; - expired = autofs_check_leaves(mnt, dentry, timeout, do_now); + expired = autofs_check_leaves(mnt, dentry, timeout, how); if (expired) { if (expired == dentry) dput(dentry); @@ -427,7 +424,7 @@ static struct dentry *should_expire(struct dentry *dentry, static struct dentry *autofs_expire_indirect(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, - int how) + unsigned int how) { unsigned long timeout; struct dentry *root = sb->s_root; @@ -443,8 +440,6 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb, dentry = NULL; while ((dentry = get_next_positive_subdir(dentry, root))) { - int flags = how; - spin_lock(&sbi->fs_lock); ino = autofs_dentry_ino(dentry); if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { @@ -453,7 +448,7 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb, } spin_unlock(&sbi->fs_lock); - expired = should_expire(dentry, mnt, timeout, flags); + expired = should_expire(dentry, mnt, timeout, how); if (!expired) continue; @@ -466,8 +461,8 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb, /* Make sure a reference is not taken on found if * things have changed. 
*/ - flags &= ~AUTOFS_EXP_LEAVES; - found = should_expire(expired, mnt, timeout, flags); + how &= ~AUTOFS_EXP_LEAVES; + found = should_expire(expired, mnt, timeout, how); if (!found || found != expired) /* Something has changed, continue */ goto next; @@ -580,15 +575,15 @@ int autofs_expire_run(struct super_block *sb, } int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int when) + struct autofs_sb_info *sbi, unsigned int how) { struct dentry *dentry; int ret = -EAGAIN; if (autofs_type_trigger(sbi->type)) - dentry = autofs_expire_direct(sb, mnt, sbi, when); + dentry = autofs_expire_direct(sb, mnt, sbi, how); else - dentry = autofs_expire_indirect(sb, mnt, sbi, when); + dentry = autofs_expire_indirect(sb, mnt, sbi, how); if (dentry) { struct autofs_info *ino = autofs_dentry_ino(dentry); @@ -618,10 +613,10 @@ int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, int autofs_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int __user *arg) { - int do_now = 0; + unsigned int how = 0; - if (arg && get_user(do_now, arg)) + if (arg && get_user(how, arg)) return -EFAULT; - return autofs_do_expire_multi(sb, mnt, sbi, do_now); + return autofs_do_expire_multi(sb, mnt, sbi, how); } -- cgit v1.2.3-59-g8ed1b From cbf6898fd69455092c43cd573b38d42c86ddb1e0 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 21 Aug 2018 21:59:01 -0700 Subject: autofs: add AUTOFS_EXP_FORCED flag The userspace automount(8) daemon is meant to perform a forced expire when sent a SIGUSR2. But since the expiration is routed through the kernel, and the kernel doesn't send an expire request if the mount is busy, this hasn't worked at least since autofs version 5. Add an AUTOFS_EXP_FORCED flag to allow implementation of the feature, and bump the protocol version so user space can check whether it's implemented if needed. Link: http://lkml.kernel.org/r/152937734715.21213.6594007182776598970.stgit@pluto.themaw.net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/expire.c | 62 +++++++++++++++++++++++++++++++++++--------- include/uapi/linux/auto_fs.h | 8 +++--- 2 files changed, 55 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index dfb666c5b8a2..d441244b79df 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -29,7 +29,8 @@ static inline int autofs_can_expire(struct dentry *dentry, } /* Check a mount point for busyness */ -static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry) +static int autofs_mount_busy(struct vfsmount *mnt, + struct dentry *dentry, unsigned int how) { struct dentry *top = dentry; struct path path = {.mnt = mnt, .dentry = dentry}; @@ -50,6 +51,12 @@ static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry) goto done; } + /* Not a submount, has a forced expire been requested */ + if (how & AUTOFS_EXP_FORCED) { + status = 0; + goto done; + } + /* Update the expiry counter if fs is busy */ if (!may_umount_tree(path.mnt)) { struct autofs_info *ino; @@ -189,6 +196,10 @@ static int autofs_direct_busy(struct vfsmount *mnt, { pr_debug("top %p %pd\n", top, top); + /* Forced expire, user space handles busy mounts */ + if (how & AUTOFS_EXP_FORCED) + return 0; + /* If it's busy update the expiry counters */ if (!may_umount_tree(mnt)) { struct autofs_info *ino; @@ -235,7 +246,7 @@ static int autofs_tree_busy(struct vfsmount *mnt, * If the fs is busy update the expiry counter.
*/ if (d_mountpoint(p)) { - if (autofs_mount_busy(mnt, p)) { + if (autofs_mount_busy(mnt, p, how)) { top_ino->last_used = jiffies; dput(p); return 1; @@ -258,6 +269,10 @@ static int autofs_tree_busy(struct vfsmount *mnt, } } + /* Forced expire, user space handles busy mounts */ + if (how & AUTOFS_EXP_FORCED) + return 0; + /* Timeout of a tree mount is ultimately determined by its top dentry */ if (!autofs_can_expire(top, timeout, how)) return 1; @@ -280,9 +295,15 @@ static struct dentry *autofs_check_leaves(struct vfsmount *mnt, if (d_mountpoint(p)) { /* Can we umount this guy */ - if (autofs_mount_busy(mnt, p)) + if (autofs_mount_busy(mnt, p, how)) continue; + /* This isn't a submount so if a forced expire + * has been requested, user space handles busy + * mounts */ + if (how & AUTOFS_EXP_FORCED) + return p; + /* Can we expire this guy */ if (autofs_can_expire(p, timeout, how)) return p; @@ -361,9 +382,15 @@ static struct dentry *should_expire(struct dentry *dentry, pr_debug("checking mountpoint %p %pd\n", dentry, dentry); /* Can we umount this guy */ - if (autofs_mount_busy(mnt, dentry)) + if (autofs_mount_busy(mnt, dentry, how)) return NULL; + /* This isn't a submount so if a forced expire + * has been requested, user space handles busy + * mounts */ + if (how & AUTOFS_EXP_FORCED) + return dentry; + /* Can we expire this guy */ if (autofs_can_expire(dentry, timeout, how)) return dentry; @@ -372,6 +399,11 @@ static struct dentry *should_expire(struct dentry *dentry, if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { pr_debug("checking symlink %p %pd\n", dentry, dentry); + + /* Forced expire, user space handles busy mounts */ + if (how & AUTOFS_EXP_FORCED) + return dentry; + /* * A symlink can't be "busy" in the usual sense so * just check last used for expire timeout. @@ -386,10 +418,13 @@ static struct dentry *should_expire(struct dentry *dentry, /* Case 2: tree mount, expire iff entire tree is not busy */ if (!(how & AUTOFS_EXP_LEAVES)) { - /* Path walk currently on this dentry? */ - ino_count = atomic_read(&ino->count) + 1; - if (d_count(dentry) > ino_count) - return NULL; + /* Not a forced expire? */ + if (!(how & AUTOFS_EXP_FORCED)) { + /* ref-walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (d_count(dentry) > ino_count) + return NULL; + } if (!autofs_tree_busy(mnt, dentry, timeout, how)) return dentry; @@ -398,12 +433,15 @@ static struct dentry *should_expire(struct dentry *dentry, * (autofs-4.1). */ } else { - /* Path walk currently on this dentry? */ struct dentry *expired; - ino_count = atomic_read(&ino->count) + 1; - if (d_count(dentry) > ino_count) - return NULL; + /* Not a forced expire? */ + if (!(how & AUTOFS_EXP_FORCED)) { + /* ref-walk currently on this dentry? 
*/ + ino_count = atomic_read(&ino->count) + 1; + if (d_count(dentry) > ino_count) + return NULL; + } expired = autofs_check_leaves(mnt, dentry, timeout, how); if (expired) { diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index e13eec3dfb2f..df31aa9c9a8c 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -23,7 +23,7 @@ #define AUTOFS_MIN_PROTO_VERSION 3 #define AUTOFS_MAX_PROTO_VERSION 5 -#define AUTOFS_PROTO_SUBVERSION 2 +#define AUTOFS_PROTO_SUBVERSION 3 /* * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed @@ -90,8 +90,10 @@ enum { /* autofs version 4 and later definitions */ /* Mask for expire behaviour */ -#define AUTOFS_EXP_IMMEDIATE 1 -#define AUTOFS_EXP_LEAVES 2 +#define AUTOFS_EXP_NORMAL 0x00 +#define AUTOFS_EXP_IMMEDIATE 0x01 +#define AUTOFS_EXP_LEAVES 0x02 +#define AUTOFS_EXP_FORCED 0x04 #define AUTOFS_TYPE_ANY 0U #define AUTOFS_TYPE_INDIRECT 1U -- cgit v1.2.3-59-g8ed1b From 21a1a52dbdd5c9dc17c546bbdb95038f53515d2c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 21 Aug 2018 21:59:05 -0700 Subject: nilfs2: use 64-bit superblock timstamps The mount time field in the superblock uses a 64-bit timestamp, but calling get_seconds() may truncate the current time to 32 bits. This changes it to ktime_get_real_seconds() to avoid the potential overflow. Link: http://lkml.kernel.org/r/20180620075041.4154396-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Ryusuke Konishi Cc: David Howells Cc: Jeff Layton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 6ffeca84d7c3..1b9067cf4511 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -834,7 +834,7 @@ static int nilfs_setup_super(struct super_block *sb, int is_mount) sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); - sbp[0]->s_mtime = cpu_to_le64(get_seconds()); + sbp[0]->s_mtime = cpu_to_le64(ktime_get_real_seconds()); skip_mount_setup: sbp[0]->s_state = -- cgit v1.2.3-59-g8ed1b From c8ed98cd88a2b39f504e1ce6af42423c71ebe5b7 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Tue, 21 Aug 2018 21:59:08 -0700 Subject: fs/nilfs2/file.c: use new return type vm_fault_t Use new return type vm_fault_t for page_mkwrite handler. 
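For context, the shape of a page_mkwrite handler under the new return type (a hypothetical minimal handler, not the nilfs2 one; the actual change below is the signature alone):

	static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
	{
		struct page *page = vmf->page;

		lock_page(page);
		if (page->mapping !=
		    file_inode(vmf->vma->vm_file)->i_mapping) {
			unlock_page(page);
			return VM_FAULT_NOPAGE;	/* raced with truncate */
		}
		/* ... mark the page dirty/writable here ... */
		return VM_FAULT_LOCKED;
	}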
Link: http://lkml.kernel.org/r/1529555928-2411-1-git-send-email-konishi.ryusuke@lab.ntt.co.jp Signed-off-by: Souptick Joarder Signed-off-by: Ryusuke Konishi Reviewed-by: Matthew Wilcox Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index c5fa3dee72fc..7da0fac71dc2 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -51,7 +51,7 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) return err; } -static int nilfs_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; -- cgit v1.2.3-59-g8ed1b From 7464726cb5998846306ed0a7d6714afb2e37b25d Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Tue, 21 Aug 2018 21:59:12 -0700 Subject: hfsplus: don't return 0 when fill_super() failed MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzbot is reporting NULL pointer dereference at mount_fs() [1]. This is because hfsplus_fill_super() is by error returning 0 when hfsplus_fill_super() detected invalid filesystem image, and mount_bdev() is returning NULL because dget(s->s_root) == NULL if s->s_root == NULL, and mount_fs() is accessing root->d_sb because IS_ERR(root) == false if root == NULL. Fix this by returning -EINVAL when hfsplus_fill_super() detected invalid filesystem image. [1] https://syzkaller.appspot.com/bug?id=21acb6850cecbc960c927229e597158cf35f33d0 Link: http://lkml.kernel.org/r/d83ce31a-874c-dd5b-f790-41405983a5be@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Reported-by: syzbot Reviewed-by: Ernesto A. Fernández Reviewed-by: Andrew Morton Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index a6c0f54c48c3..80abba550bfa 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -524,8 +524,10 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) goto out_put_root; if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); - if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) + if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) { + err = -EINVAL; goto out_put_root; + } inode = hfsplus_iget(sb, be32_to_cpu(entry.folder.id)); if (IS_ERR(inode)) { err = PTR_ERR(inode); -- cgit v1.2.3-59-g8ed1b From 31651c607151f1034cfb57e5a78678bea54c362b Mon Sep 17 00:00:00 2001 From: "Ernesto A. Fernández" Date: Tue, 21 Aug 2018 21:59:16 -0700 Subject: hfsplus: avoid deadlock on file truncation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After an extent is removed from the extent tree, the corresponding bits are also cleared from the block allocation file. This is currently done without releasing the tree lock. The problem is that the allocation file has extents of its own; if it is fragmented enough, some of them may be in the extent tree as well, and hfsplus_get_block() will try to take the lock again. To avoid deadlock, only hold the extent tree lock during the actual tree operations. Link: http://lkml.kernel.org/r/20180709202549.auxwkb6memlegb4a@eaf Signed-off-by: Ernesto A. 
Fernández Reported-by: Anatoly Trosinenko Cc: Viacheslav Dubeyko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/extents.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index e8770935ce6d..8e0f59767694 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -336,6 +336,9 @@ static int hfsplus_free_extents(struct super_block *sb, int i; int err = 0; + /* Mapping the allocation file may lock the extent tree */ + WARN_ON(mutex_is_locked(&HFSPLUS_SB(sb)->ext_tree->tree_lock)); + hfsplus_dump_extent(extent); for (i = 0; i < 8; extent++, i++) { count = be32_to_cpu(extent->block_count); @@ -415,11 +418,13 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, if (res) break; start = be32_to_cpu(fd.key->ext.start_block); - hfsplus_free_extents(sb, ext_entry, - total_blocks - start, - total_blocks); hfs_brec_remove(&fd); + + mutex_unlock(&fd.tree->tree_lock); + hfsplus_free_extents(sb, ext_entry, total_blocks - start, + total_blocks); total_blocks = start; + mutex_lock(&fd.tree->tree_lock); } while (total_blocks > blocks); hfs_find_exit(&fd); @@ -576,15 +581,20 @@ void hfsplus_file_truncate(struct inode *inode) } while (1) { if (alloc_cnt == hip->first_blocks) { + mutex_unlock(&fd.tree->tree_lock); hfsplus_free_extents(sb, hip->first_extents, alloc_cnt, alloc_cnt - blk_cnt); hfsplus_dump_extent(hip->first_extents); hip->first_blocks = blk_cnt; + mutex_lock(&fd.tree->tree_lock); break; } res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); if (res) break; + hfs_brec_remove(&fd); + + mutex_unlock(&fd.tree->tree_lock); start = hip->cached_start; hfsplus_free_extents(sb, hip->cached_extents, alloc_cnt - start, alloc_cnt - blk_cnt); @@ -596,7 +606,7 @@ void hfsplus_file_truncate(struct inode *inode) alloc_cnt = start; hip->cached_start = hip->cached_blocks = 0; hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); - hfs_brec_remove(&fd); + mutex_lock(&fd.tree->tree_lock); } hfs_find_exit(&fd); -- cgit v1.2.3-59-g8ed1b From afd6c9e1f5287ad236adcf56db8c42fef65561fa Mon Sep 17 00:00:00 2001 From: "Ernesto A. Fernández" Date: Tue, 21 Aug 2018 21:59:19 -0700 Subject: hfsplus: fix decomposition of Hangul characters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Files created under macOS cannot be opened under linux if their names contain Korean characters, and vice versa. The Korean alphabet is special because its normalization is done without a table. The module deals with it correctly when composing, but forgets about it for the decomposition. Fix this using the Hangul decomposition function provided in the Unicode Standard. The code fits a bit awkwardly because it requires a buffer, while all the other normalizations are returned as pointers to the decomposition table. This is actually also a bug because reordering may still be needed, but for now leave it as it is. The patch will cause trouble for Hangul filenames already created by the module in the past. This shouldn't really be concern because its main purpose was always sharing with macOS. If a user actually needs to access such a file the nodecompose mount option should be enough. Link: http://lkml.kernel.org/r/20180717220951.p6qqrgautc4pxvzu@eaf Signed-off-by: Ernesto A. 
Fernández Reported-by: Ting-Chang Hou Tested-by: Ting-Chang Hou Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/unicode.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c index dfa90c21948f..c8d1b2be7854 100644 --- a/fs/hfsplus/unicode.c +++ b/fs/hfsplus/unicode.c @@ -272,8 +272,8 @@ static inline int asc2unichar(struct super_block *sb, const char *astr, int len, return size; } -/* Decomposes a single unicode character. */ -static inline u16 *decompose_unichar(wchar_t uc, int *size) +/* Decomposes a non-Hangul unicode character. */ +static u16 *hfsplus_decompose_nonhangul(wchar_t uc, int *size) { int off; @@ -296,6 +296,51 @@ static inline u16 *decompose_unichar(wchar_t uc, int *size) return hfsplus_decompose_table + (off / 4); } +/* + * Try to decompose a unicode character as Hangul. Return 0 if @uc is not + * precomposed Hangul, otherwise return the length of the decomposition. + * + * This function was adapted from sample code from the Unicode Standard + * Annex #15: Unicode Normalization Forms, version 3.2.0. + * + * Copyright (C) 1991-2018 Unicode, Inc. All rights reserved. Distributed + * under the Terms of Use in http://www.unicode.org/copyright.html. + */ +static int hfsplus_try_decompose_hangul(wchar_t uc, u16 *result) +{ + int index; + int l, v, t; + + index = uc - Hangul_SBase; + if (index < 0 || index >= Hangul_SCount) + return 0; + + l = Hangul_LBase + index / Hangul_NCount; + v = Hangul_VBase + (index % Hangul_NCount) / Hangul_TCount; + t = Hangul_TBase + index % Hangul_TCount; + + result[0] = l; + result[1] = v; + if (t != Hangul_TBase) { + result[2] = t; + return 3; + } + return 2; +} + +/* Decomposes a single unicode character. 
*/ +static u16 *decompose_unichar(wchar_t uc, int *size, u16 *hangul_buffer) +{ + u16 *result; + + /* Hangul is handled separately */ + result = hangul_buffer; + *size = hfsplus_try_decompose_hangul(uc, result); + if (*size == 0) + result = hfsplus_decompose_nonhangul(uc, size); + return result; +} + int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr, int max_unistr_len, const char *astr, int len) @@ -303,13 +348,14 @@ int hfsplus_asc2uni(struct super_block *sb, int size, dsize, decompose; u16 *dstr, outlen = 0; wchar_t c; + u16 dhangul[3]; decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); while (outlen < max_unistr_len && len > 0) { size = asc2unichar(sb, astr, len, &c); if (decompose) - dstr = decompose_unichar(c, &dsize); + dstr = decompose_unichar(c, &dsize, dhangul); else dstr = NULL; if (dstr) { @@ -344,6 +390,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str) unsigned long hash; wchar_t c; u16 c2; + u16 dhangul[3]; casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); @@ -357,7 +404,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str) len -= size; if (decompose) - dstr = decompose_unichar(c, &dsize); + dstr = decompose_unichar(c, &dsize, dhangul); else dstr = NULL; if (dstr) { @@ -396,6 +443,7 @@ int hfsplus_compare_dentry(const struct dentry *dentry, const char *astr1, *astr2; u16 c1, c2; wchar_t c; + u16 dhangul_1[3], dhangul_2[3]; casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags); decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags); @@ -413,7 +461,8 @@ int hfsplus_compare_dentry(const struct dentry *dentry, len1 -= size; if (decompose) - dstr1 = decompose_unichar(c, &dsize1); + dstr1 = decompose_unichar(c, &dsize1, + dhangul_1); if (!decompose || !dstr1) { c1 = c; dstr1 = &c1; @@ -427,7 +476,8 @@ int hfsplus_compare_dentry(const struct dentry *dentry, len2 -= size; if (decompose) - dstr2 = decompose_unichar(c, &dsize2); + dstr2 = decompose_unichar(c, &dsize2, + dhangul_2); if (!decompose || !dstr2) { c2 = c; dstr2 = &c2; -- cgit v1.2.3-59-g8ed1b From f168d9fd634a4612d308d7dbe0a4d2a9b366c045 Mon Sep 17 00:00:00 2001 From: "Ernesto A. Fernández" Date: Tue, 21 Aug 2018 21:59:23 -0700 Subject: hfsplus: drop ACL support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HFS+ Access Control Lists have not worked at all for the past five years, and nobody seems to have noticed. Besides, POSIX draft ACLs are not compatible with MacOS. Drop the feature entirely. Link: http://lkml.kernel.org/r/20180714190608.wtnmmtjqeyladkut@eaf Signed-off-by: Ernesto A. 
Fernández Acked-by: Christoph Hellwig Cc: Viacheslav Dubeyko Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/Kconfig | 15 ----- fs/hfsplus/Makefile | 2 - fs/hfsplus/acl.h | 28 --------- fs/hfsplus/dir.c | 9 +-- fs/hfsplus/hfsplus_fs.h | 1 - fs/hfsplus/inode.c | 11 ---- fs/hfsplus/posix_acl.c | 144 -------------------------------------------- fs/hfsplus/super.c | 4 +- fs/hfsplus/xattr.c | 6 -- fs/hfsplus/xattr.h | 3 - fs/hfsplus/xattr_security.c | 13 ---- 11 files changed, 4 insertions(+), 232 deletions(-) delete mode 100644 fs/hfsplus/acl.h delete mode 100644 fs/hfsplus/posix_acl.c (limited to 'fs') diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig index 7cc8b4acf66a..a63371815aab 100644 --- a/fs/hfsplus/Kconfig +++ b/fs/hfsplus/Kconfig @@ -11,18 +11,3 @@ config HFSPLUS_FS MacOS 8. It includes all Mac specific filesystem data such as data forks and creator codes, but it also has several UNIX style features such as file ownership and permissions. - -config HFSPLUS_FS_POSIX_ACL - bool "HFS+ POSIX Access Control Lists" - depends on HFSPLUS_FS - select FS_POSIX_ACL - help - POSIX Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. - - It needs to understand that POSIX ACLs are treated only under - Linux. POSIX ACLs doesn't mean something under Mac OS X. - Mac OS X beginning with version 10.4 ("Tiger") support NFSv4 ACLs, - which are part of the NFSv4 standard. - - If you don't know what Access Control Lists are, say N diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile index f6a56542f8d7..9ed20e64b983 100644 --- a/fs/hfsplus/Makefile +++ b/fs/hfsplus/Makefile @@ -8,5 +8,3 @@ obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \ bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \ attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o - -hfsplus-$(CONFIG_HFSPLUS_FS_POSIX_ACL) += posix_acl.o diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h deleted file mode 100644 index 488c2b75cf41..000000000000 --- a/fs/hfsplus/acl.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * linux/fs/hfsplus/acl.h - * - * Vyacheslav Dubeyko - * - * Handler for Posix Access Control Lists (ACLs) support. - */ - -#include - -#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - -/* posix_acl.c */ -struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type); -int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, - int type); -extern int hfsplus_init_posix_acl(struct inode *, struct inode *); - -#else /* CONFIG_HFSPLUS_FS_POSIX_ACL */ -#define hfsplus_get_posix_acl NULL -#define hfsplus_set_posix_acl NULL - -static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) -{ - return 0; -} -#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */ diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index b5254378f011..c5a70f83dbe7 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -18,7 +18,6 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" #include "xattr.h" -#include "acl.h" static inline void hfsplus_instantiate(struct dentry *dentry, struct inode *inode, u32 cnid) @@ -455,7 +454,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, if (res) goto out_err; - res = hfsplus_init_inode_security(inode, dir, &dentry->d_name); + res = hfsplus_init_security(inode, dir, &dentry->d_name); if (res == -EOPNOTSUPP) res = 0; /* Operation is not supported. 
*/ else if (res) { @@ -496,7 +495,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, if (res) goto failed_mknod; - res = hfsplus_init_inode_security(inode, dir, &dentry->d_name); + res = hfsplus_init_security(inode, dir, &dentry->d_name); if (res == -EOPNOTSUPP) res = 0; /* Operation is not supported. */ else if (res) { @@ -567,10 +566,6 @@ const struct inode_operations hfsplus_dir_inode_operations = { .mknod = hfsplus_mknod, .rename = hfsplus_rename, .listxattr = hfsplus_listxattr, -#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - .get_acl = hfsplus_get_posix_acl, - .set_acl = hfsplus_set_posix_acl, -#endif }; const struct file_operations hfsplus_dir_operations = { diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index d9255abafb81..8e039435958a 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -31,7 +31,6 @@ #define DBG_EXTENT 0x00000020 #define DBG_BITMAP 0x00000040 #define DBG_ATTR_MOD 0x00000080 -#define DBG_ACL_MOD 0x00000100 #if 0 #define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index c824f702feec..8e9427a42b81 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -21,7 +21,6 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" #include "xattr.h" -#include "acl.h" static int hfsplus_readpage(struct file *file, struct page *page) { @@ -267,12 +266,6 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) setattr_copy(inode, attr); mark_inode_dirty(inode); - if (attr->ia_valid & ATTR_MODE) { - error = posix_acl_chmod(inode, inode->i_mode); - if (unlikely(error)) - return error; - } - return 0; } @@ -336,10 +329,6 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, static const struct inode_operations hfsplus_file_inode_operations = { .setattr = hfsplus_setattr, .listxattr = hfsplus_listxattr, -#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - .get_acl = hfsplus_get_posix_acl, - .set_acl = hfsplus_set_posix_acl, -#endif }; static const struct file_operations hfsplus_file_operations = { diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c deleted file mode 100644 index 066114dcc3a2..000000000000 --- a/fs/hfsplus/posix_acl.c +++ /dev/null @@ -1,144 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/hfsplus/posix_acl.c - * - * Vyacheslav Dubeyko - * - * Handler for Posix Access Control Lists (ACLs) support. 
- */ - -#include "hfsplus_fs.h" -#include "xattr.h" -#include "acl.h" - -struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) -{ - struct posix_acl *acl; - char *xattr_name; - char *value = NULL; - ssize_t size; - - hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); - - switch (type) { - case ACL_TYPE_ACCESS: - xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; - break; - case ACL_TYPE_DEFAULT: - xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; - break; - default: - return ERR_PTR(-EINVAL); - } - - size = __hfsplus_getxattr(inode, xattr_name, NULL, 0); - - if (size > 0) { - value = (char *)hfsplus_alloc_attr_entry(); - if (unlikely(!value)) - return ERR_PTR(-ENOMEM); - size = __hfsplus_getxattr(inode, xattr_name, value, size); - } - - if (size > 0) - acl = posix_acl_from_xattr(&init_user_ns, value, size); - else if (size == -ENODATA) - acl = NULL; - else - acl = ERR_PTR(size); - - hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value); - - return acl; -} - -static int __hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, - int type) -{ - int err; - char *xattr_name; - size_t size = 0; - char *value = NULL; - - hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); - - switch (type) { - case ACL_TYPE_ACCESS: - xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; - break; - - case ACL_TYPE_DEFAULT: - xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; - if (!S_ISDIR(inode->i_mode)) - return acl ? -EACCES : 0; - break; - - default: - return -EINVAL; - } - - if (acl) { - size = posix_acl_xattr_size(acl->a_count); - if (unlikely(size > HFSPLUS_MAX_INLINE_DATA_SIZE)) - return -ENOMEM; - value = (char *)hfsplus_alloc_attr_entry(); - if (unlikely(!value)) - return -ENOMEM; - err = posix_acl_to_xattr(&init_user_ns, acl, value, size); - if (unlikely(err < 0)) - goto end_set_acl; - } - - err = __hfsplus_setxattr(inode, xattr_name, value, size, 0); - -end_set_acl: - hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value); - - if (!err) - set_cached_acl(inode, type, acl); - - return err; -} - -int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, int type) -{ - int err; - - if (type == ACL_TYPE_ACCESS && acl) { - err = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (err) - return err; - } - return __hfsplus_set_posix_acl(inode, acl, type); -} - -int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) -{ - int err = 0; - struct posix_acl *default_acl, *acl; - - hfs_dbg(ACL_MOD, - "[%s]: ino %lu, dir->ino %lu\n", - __func__, inode->i_ino, dir->i_ino); - - if (S_ISLNK(inode->i_mode)) - return 0; - - err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (err) - return err; - - if (default_acl) { - err = __hfsplus_set_posix_acl(inode, default_acl, - ACL_TYPE_DEFAULT); - posix_acl_release(default_acl); - } - - if (acl) { - if (!err) - err = __hfsplus_set_posix_acl(inode, acl, - ACL_TYPE_ACCESS); - posix_acl_release(acl); - } - return err; -} diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 80abba550bfa..eb4535eba95d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -564,8 +564,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) goto out_put_hidden_dir; } - err = hfsplus_init_inode_security(sbi->hidden_dir, - root, &str); + err = hfsplus_init_security(sbi->hidden_dir, + root, &str); if (err == -EOPNOTSUPP) err = 0; /* Operation is not supported. 
*/ else if (err) { diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index e538b758c448..d5403b4004c9 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -8,10 +8,8 @@ */ #include "hfsplus_fs.h" -#include #include #include "xattr.h" -#include "acl.h" static int hfsplus_removexattr(struct inode *inode, const char *name); @@ -19,10 +17,6 @@ const struct xattr_handler *hfsplus_xattr_handlers[] = { &hfsplus_xattr_osx_handler, &hfsplus_xattr_user_handler, &hfsplus_xattr_trusted_handler, -#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif &hfsplus_xattr_security_handler, NULL }; diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index a4e611d69710..d14e362b3eba 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -38,7 +38,4 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); int hfsplus_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr); -int hfsplus_init_inode_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr); - #endif diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index f5550b006e88..cfbe6a3bfb1e 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -12,7 +12,6 @@ #include "hfsplus_fs.h" #include "xattr.h" -#include "acl.h" static int hfsplus_security_getxattr(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, @@ -72,18 +71,6 @@ int hfsplus_init_security(struct inode *inode, struct inode *dir, &hfsplus_initxattrs, NULL); } -int hfsplus_init_inode_security(struct inode *inode, - struct inode *dir, - const struct qstr *qstr) -{ - int err; - - err = hfsplus_init_posix_acl(inode, dir); - if (!err) - err = hfsplus_init_security(inode, dir, qstr); - return err; -} - const struct xattr_handler hfsplus_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = hfsplus_security_getxattr, -- cgit v1.2.3-59-g8ed1b From 34d082604a7c8856051ff441fef0e22d93afd848 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 21 Aug 2018 21:59:26 -0700 Subject: reiserfs: use monotonic time for j_trans_start_time Using CLOCK_REALTIME time_t timestamps breaks on 32-bit systems in 2038, and gives surprising results with a concurrent settimeofday(). This changes the reiserfs journal timestamps to use ktime_get_seconds() instead, which makes it use a 64-bit CLOCK_MONOTONIC stamp. In the procfs output, the monotonic timestamp needs to be converted back to CLOCK_REALTIME to keep the existing ABI. 
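The underlying idiom, as a minimal sketch (ktime_get_seconds() returns a 64-bit CLOCK_MONOTONIC value; this snippet is illustrative, not from the patch):

	#include <linux/timekeeping.h>
	#include <linux/printk.h>

	static void example_measure(void)
	{
		time64_t start = ktime_get_seconds();

		/* ... replay transactions ... */

		/* 64-bit monotonic delta: unaffected by settimeofday()
		 * and immune to the 32-bit truncation that get_seconds()
		 * hits in 2038. */
		pr_info("replay took %lld seconds\n",
			ktime_get_seconds() - start);
	}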
Link: http://lkml.kernel.org/r/20180620142522.27639-2-arnd@arndb.de Signed-off-by: Arnd Bergmann Reviewed-by: Jan Kara Cc: Al Viro Cc: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 22 +++++++++++----------- fs/reiserfs/procfs.c | 11 +++++++++-- fs/reiserfs/reiserfs.h | 2 +- 3 files changed, 21 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 52eb5d293a34..8a76f9d14bc6 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2381,7 +2381,7 @@ static int journal_read(struct super_block *sb) struct reiserfs_journal_desc *desc; unsigned int oldest_trans_id = 0; unsigned int oldest_invalid_trans_id = 0; - time_t start; + time64_t start; unsigned long oldest_start = 0; unsigned long cur_dblock = 0; unsigned long newest_mount_id = 9; @@ -2395,7 +2395,7 @@ static int journal_read(struct super_block *sb) cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); reiserfs_info(sb, "checking transaction log (%pg)\n", journal->j_dev_bd); - start = get_seconds(); + start = ktime_get_seconds(); /* * step 1, read in the journal header block. Check the transaction @@ -2556,7 +2556,7 @@ start_log_replay: if (replay_count > 0) { reiserfs_info(sb, "replayed %d transactions in %lu seconds\n", - replay_count, get_seconds() - start); + replay_count, ktime_get_seconds() - start); } /* needed to satisfy the locking in _update_journal_header_block */ reiserfs_write_lock(sb); @@ -2914,7 +2914,7 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) { struct reiserfs_journal *journal = SB_JOURNAL(th->t_super); - time_t now = get_seconds(); + time64_t now = ktime_get_seconds(); /* cannot restart while nested */ BUG_ON(!th->t_trans_id); if (th->t_refcount > 1) @@ -3023,7 +3023,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block *sb, unsigned long nblocks, int join) { - time_t now = get_seconds(); + time64_t now = ktime_get_seconds(); unsigned int old_trans_id; struct reiserfs_journal *journal = SB_JOURNAL(sb); struct reiserfs_transaction_handle myth; @@ -3056,7 +3056,7 @@ relock: PROC_INFO_INC(sb, journal.journal_relock_writers); goto relock; } - now = get_seconds(); + now = ktime_get_seconds(); /* * if there is no room in the journal OR @@ -3119,7 +3119,7 @@ relock: } /* we are the first writer, set trans_id */ if (journal->j_trans_start_time == 0) { - journal->j_trans_start_time = get_seconds(); + journal->j_trans_start_time = ktime_get_seconds(); } atomic_inc(&journal->j_wcount); journal->j_len_alloc += nblocks; @@ -3559,11 +3559,11 @@ static void flush_async_commits(struct work_struct *work) */ void reiserfs_flush_old_commits(struct super_block *sb) { - time_t now; + time64_t now; struct reiserfs_transaction_handle th; struct reiserfs_journal *journal = SB_JOURNAL(sb); - now = get_seconds(); + now = ktime_get_seconds(); /* * safety check so we don't flush while we are replaying the log during * mount @@ -3613,7 +3613,7 @@ void reiserfs_flush_old_commits(struct super_block *sb) static int check_journal_end(struct reiserfs_transaction_handle *th, int flags) { - time_t now; + time64_t now; int flush = flags & FLUSH_ALL; int commit_now = flags & COMMIT_NOW; int wait_on_commit = flags & WAIT; @@ -3694,7 +3694,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, int flags) } /* deal with old transactions where we are the last writers */ - now = get_seconds(); + now = ktime_get_seconds(); if ((now - 
journal->j_trans_start_time) > journal->j_max_trans_age) { commit_now = 1; journal->j_next_async_flush = 1; diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index e39b3910d24d..f2cf3441fdfc 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -297,6 +297,13 @@ static int show_oidmap(struct seq_file *m, void *unused) return 0; } +static time64_t ktime_mono_to_real_seconds(time64_t mono) +{ + ktime_t kt = ktime_set(mono, NSEC_PER_SEC/2); + + return ktime_divns(ktime_mono_to_real(kt), NSEC_PER_SEC); +} + static int show_journal(struct seq_file *m, void *unused) { struct super_block *sb = m->private; @@ -325,7 +332,7 @@ static int show_journal(struct seq_file *m, void *unused) "j_bcount: \t%lu\n" "j_first_unflushed_offset: \t%lu\n" "j_last_flush_trans_id: \t%u\n" - "j_trans_start_time: \t%li\n" + "j_trans_start_time: \t%lli\n" "j_list_bitmap_index: \t%i\n" "j_must_wait: \t%i\n" "j_next_full_flush: \t%i\n" @@ -366,7 +373,7 @@ static int show_journal(struct seq_file *m, void *unused) JF(j_bcount), JF(j_first_unflushed_offset), JF(j_last_flush_trans_id), - JF(j_trans_start_time), + ktime_mono_to_real_seconds(JF(j_trans_start_time)), JF(j_list_bitmap_index), JF(j_must_wait), JF(j_next_full_flush), diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index ae4811fecc1f..621b9a07080a 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -331,7 +331,7 @@ struct reiserfs_journal { struct buffer_head *j_header_bh; - time_t j_trans_start_time; /* time this transaction started */ + time64_t j_trans_start_time; /* time this transaction started */ struct mutex j_mutex; struct mutex j_flush_mutex; -- cgit v1.2.3-59-g8ed1b From 5b1d149c895b3229d70c0d4c69e14bdbe5fe8226 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 21 Aug 2018 21:59:30 -0700 Subject: reiserfs: remove obsolete print_time function Before linux-2.4.6, print_time() was used to pretty-print an inode time when running reiserfs in user space, after that it has become obsolete and is still a bit incorrect: It behaves differently on 32-bit and 64-bit machines, and uses a static buffer to hold a string, which could lead to undefined behavior if we ever called this from multiple places simultaneously. Since we always want to treat the timestamps as 'unsigned' anyway, simply printing them as an integer is both simpler and safer while avoiding the deprecated time_t type. 
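A minimal sketch of why the static buffer was unsafe (a hypothetical call; mtime and ctime stand in for any two time_t values):

	/* Both print_time() invocations run before printk() formats its
	 * arguments, and both return the same static buffer, so the
	 * second sprintf() overwrites the first and one value is
	 * printed twice. */
	printk("mtime %s ctime %s\n", print_time(mtime), print_time(ctime));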
Link: http://lkml.kernel.org/r/20180620142522.27639-3-arnd@arndb.de Signed-off-by: Arnd Bergmann Reviewed-by: Jan Kara Cc: Al Viro Cc: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/item_ops.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index e3c558d1b78c..3a5a752d96c7 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -33,30 +33,22 @@ static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) return 0; } -static char *print_time(time_t t) -{ - static char timebuf[256]; - - sprintf(timebuf, "%ld", t); - return timebuf; -} - static void sd_print_item(struct item_head *ih, char *item) { printk("\tmode | size | nlinks | first direct | mtime\n"); if (stat_data_v1(ih)) { struct stat_data_v1 *sd = (struct stat_data_v1 *)item; - printk("\t0%-6o | %6u | %2u | %d | %s\n", sd_v1_mode(sd), + printk("\t0%-6o | %6u | %2u | %d | %u\n", sd_v1_mode(sd), sd_v1_size(sd), sd_v1_nlink(sd), sd_v1_first_direct_byte(sd), - print_time(sd_v1_mtime(sd))); + sd_v1_mtime(sd)); } else { struct stat_data *sd = (struct stat_data *)item; - printk("\t0%-6o | %6llu | %2u | %d | %s\n", sd_v2_mode(sd), + printk("\t0%-6o | %6llu | %2u | %d | %u\n", sd_v2_mode(sd), (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), - sd_v2_rdev(sd), print_time(sd_v2_mtime(sd))); + sd_v2_rdev(sd), sd_v2_mtime(sd)); } } -- cgit v1.2.3-59-g8ed1b From 8b73ce6a4bae4fe12bcb2c361c0da4183c2e1b6f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 21 Aug 2018 21:59:34 -0700 Subject: reiserfs: change j_timestamp type to time64_t This uses the deprecated time_t type but is write-only, and could be removed; but as Jeff explains, having a timestamp can be useful for post-mortem analysis in crash dumps. In order to remove one of the last instances of time_t, this changes the type to time64_t, same as j_trans_start_time. Link: http://lkml.kernel.org/r/20180622133315.221210-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Jan Kara Cc: Jeff Mahoney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/reiserfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 621b9a07080a..e5ca9ed79e54 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -271,7 +271,7 @@ struct reiserfs_journal_list { struct mutex j_commit_mutex; unsigned int j_trans_id; - time_t j_timestamp; + time64_t j_timestamp; /* write-only but useful for crash dump analysis */ struct reiserfs_list_bitmap *j_list_bitmap; struct buffer_head *j_commit_bh; /* commit buffer head */ struct reiserfs_journal_cnode *j_realblock; -- cgit v1.2.3-59-g8ed1b From a13f085d111e90469faf2d9965eb39b11c114d7e Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 21 Aug 2018 21:59:37 -0700 Subject: reiserfs: fix broken xattr handling (heap corruption, bad retval) This fixes the following issues: - When a buffer size is supplied to reiserfs_listxattr() such that each individual name fits, but the concatenation of all names doesn't fit, reiserfs_listxattr() overflows the supplied buffer. This leads to a kernel heap overflow (verified using KASAN) followed by an out-of-bounds usercopy and is therefore a security bug. - When a buffer size is supplied to reiserfs_listxattr() such that a name doesn't fit, -ERANGE should be returned.
But reiserfs instead just truncates the list of names; I have verified that if the only xattr on a file has a longer name than the supplied buffer length, listxattr() incorrectly returns zero. With my patch applied, -ERANGE is returned in both cases and the memory corruption doesn't happen anymore. Credit for making me clean this code up a bit goes to Al Viro, who pointed out that the ->actor calling convention is suboptimal and should be changed. Link: http://lkml.kernel.org/r/20180802151539.5373-1-jannh@google.com Fixes: 48b32a3553a5 ("reiserfs: use generic xattr handlers") Signed-off-by: Jann Horn Acked-by: Jeff Mahoney Cc: Eric Biggers Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index ff94fad477e4..48cdfc81fe10 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -792,8 +792,10 @@ static int listxattr_filler(struct dir_context *ctx, const char *name, return 0; size = namelen + 1; if (b->buf) { - if (size > b->size) + if (b->pos + size > b->size) { + b->pos = -ERANGE; return -ERANGE; + } memcpy(b->buf + b->pos, name, namelen); b->buf[b->pos + namelen] = 0; } -- cgit v1.2.3-59-g8ed1b From f663b5b38fffeb31841f8bfaf0ef87a445b0ffee Mon Sep 17 00:00:00 2001 From: Wentao Wang Date: Tue, 21 Aug 2018 21:59:41 -0700 Subject: fat: add FITRIM ioctl for FAT file system Add FITRIM ioctl for FAT file system [witallwang@gmail.com: use u64s] Link: http://lkml.kernel.org/r/87h8l37hub.fsf@mail.parknet.co.jp [hirofumi@mail.parknet.co.jp: bug fixes, coding style fixes, add signal check] Link: http://lkml.kernel.org/r/87fu10anhj.fsf@mail.parknet.co.jp Signed-off-by: Wentao Wang Signed-off-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/fat.h | 1 + fs/fat/fatent.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/fat/file.c | 33 ++++++++++++++++++ 3 files changed, 136 insertions(+) (limited to 'fs') diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 8fc1093da47d..154ae54a6b3a 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -357,6 +357,7 @@ extern int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster); extern int fat_free_clusters(struct inode *inode, int cluster); extern int fat_count_free_clusters(struct super_block *sb); +extern int fat_trim_fs(struct inode *inode, struct fstrim_range *range); /* fat/file.c */ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index bac10de678cc..25d43a5e8a4d 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -4,6 +4,7 @@ */ #include +#include #include "fat.h" struct fatent_operations { @@ -690,3 +691,104 @@ out: unlock_fat(sbi); return err; } + +static int fat_trim_clusters(struct super_block *sb, u32 clus, u32 nr_clus) +{ + struct msdos_sb_info *sbi = MSDOS_SB(sb); + return sb_issue_discard(sb, fat_clus_to_blknr(sbi, clus), + nr_clus * sbi->sec_per_clus, GFP_NOFS, 0); +} + +int fat_trim_fs(struct inode *inode, struct fstrim_range *range) +{ + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + const struct fatent_operations *ops = sbi->fatent_ops; + struct fat_entry fatent; + u64 ent_start, ent_end, minlen, trimmed = 0; + u32 free = 0; + unsigned long reada_blocks, reada_mask, cur_block = 0; + int err = 0; + + /* + * FAT data is organized as clusters, trim at the granularity of clusters.
+ * + * fstrim_range is in bytes, convert values to cluster index. + * Treat sectors before data region as all used, not to trim them. + */ + ent_start = max_t(u64, range->start>>sbi->cluster_bits, FAT_START_ENT); + ent_end = ent_start + (range->len >> sbi->cluster_bits) - 1; + minlen = range->minlen >> sbi->cluster_bits; + + if (ent_start >= sbi->max_cluster || range->len < sbi->cluster_size) + return -EINVAL; + if (ent_end >= sbi->max_cluster) + ent_end = sbi->max_cluster - 1; + + reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits; + reada_mask = reada_blocks - 1; + + fatent_init(&fatent); + lock_fat(sbi); + fatent_set_entry(&fatent, ent_start); + while (fatent.entry <= ent_end) { + /* readahead of fat blocks */ + if ((cur_block & reada_mask) == 0) { + unsigned long rest = sbi->fat_length - cur_block; + fat_ent_reada(sb, &fatent, min(reada_blocks, rest)); + } + cur_block++; + + err = fat_ent_read_block(sb, &fatent); + if (err) + goto error; + do { + if (ops->ent_get(&fatent) == FAT_ENT_FREE) { + free++; + } else if (free) { + if (free >= minlen) { + u32 clus = fatent.entry - free; + + err = fat_trim_clusters(sb, clus, free); + if (err && err != -EOPNOTSUPP) + goto error; + if (!err) + trimmed += free; + err = 0; + } + free = 0; + } + } while (fat_ent_next(sbi, &fatent) && fatent.entry <= ent_end); + + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto error; + } + + if (need_resched()) { + fatent_brelse(&fatent); + unlock_fat(sbi); + cond_resched(); + lock_fat(sbi); + } + } + /* handle scenario when tail entries are all free */ + if (free && free >= minlen) { + u32 clus = fatent.entry - free; + + err = fat_trim_clusters(sb, clus, free); + if (err && err != -EOPNOTSUPP) + goto error; + if (!err) + trimmed += free; + err = 0; + } + +error: + fatent_brelse(&fatent); + unlock_fat(sbi); + + range->len = trimmed << sbi->cluster_bits; + + return err; +} diff --git a/fs/fat/file.c b/fs/fat/file.c index 4724cc9ad650..4f3d72fb1e60 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -121,6 +121,37 @@ static int fat_ioctl_get_volume_id(struct inode *inode, u32 __user *user_attr) return put_user(sbi->vol_id, user_attr); } +static int fat_ioctl_fitrim(struct inode *inode, unsigned long arg) +{ + struct super_block *sb = inode->i_sb; + struct fstrim_range __user *user_range; + struct fstrim_range range; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + user_range = (struct fstrim_range __user *)arg; + if (copy_from_user(&range, user_range, sizeof(range))) + return -EFAULT; + + range.minlen = max_t(unsigned int, range.minlen, + q->limits.discard_granularity); + + err = fat_trim_fs(inode, &range); + if (err < 0) + return err; + + if (copy_to_user(user_range, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -133,6 +164,8 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return fat_ioctl_set_attributes(filp, user_attr); case FAT_IOCTL_GET_VOLUME_ID: return fat_ioctl_get_volume_id(inode, user_attr); + case FITRIM: + return fat_ioctl_fitrim(inode, arg); default: return -ENOTTY; /* Inappropriate ioctl for device */ } -- cgit v1.2.3-59-g8ed1b From 0afa9626667c3659ef8bd82d42a11e39fedf235c Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Tue, 21 Aug 2018 21:59:44 -0700 Subject: fat: validate ->i_start before
using A corrupted FAT filesystem may have an invalid ->i_start. To handle this, check ->i_start before use and return a proper error code. Link: http://lkml.kernel.org/r/87o9f8y1t5.fsf_-_@mail.parknet.co.jp Signed-off-by: OGAWA Hirofumi Reported-by: Anatoly Trosinenko Tested-by: Anatoly Trosinenko Cc: Alan Cox Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/cache.c | 19 ++++++++++++------- fs/fat/fat.h | 5 +++++ fs/fat/fatent.c | 6 +++--- 3 files changed, 20 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/fat/cache.c b/fs/fat/cache.c index e9bed49df6b7..78d501c1fb65 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -225,7 +225,8 @@ static inline void cache_init(struct fat_cache_id *cid, int fclus, int dclus) int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) { struct super_block *sb = inode->i_sb; - const int limit = sb->s_maxbytes >> MSDOS_SB(sb)->cluster_bits; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + const int limit = sb->s_maxbytes >> sbi->cluster_bits; struct fat_entry fatent; struct fat_cache_id cid; int nr; @@ -234,6 +235,12 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) *fclus = 0; *dclus = MSDOS_I(inode)->i_start; + if (!fat_valid_entry(sbi, *dclus)) { + fat_fs_error_ratelimit(sb, + "%s: invalid start cluster (i_pos %lld, start %08x)", + __func__, MSDOS_I(inode)->i_pos, *dclus); + return -EIO; + } if (cluster == 0) return 0; @@ -250,9 +257,8 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) /* prevent the infinite loop of cluster chain */ if (*fclus > limit) { fat_fs_error_ratelimit(sb, - "%s: detected the cluster chain loop" " (i_pos %lld)", __func__, - MSDOS_I(inode)->i_pos); + "%s: detected the cluster chain loop (i_pos %lld)", + __func__, MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } @@ -262,9 +268,8 @@ goto out; else if (nr == FAT_ENT_FREE) { fat_fs_error_ratelimit(sb, - "%s: invalid cluster chain (i_pos %lld)", - __func__, - MSDOS_I(inode)->i_pos); + "%s: invalid cluster chain (i_pos %lld)", + __func__, MSDOS_I(inode)->i_pos); nr = -EIO; goto out; } else if (nr == FAT_ENT_EOF) { diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 154ae54a6b3a..df84d5710b59 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -348,6 +348,11 @@ static inline void fatent_brelse(struct fat_entry *fatent) fatent->fat_inode = NULL; } +static inline bool fat_valid_entry(struct msdos_sb_info *sbi, int entry) +{ + return FAT_START_ENT <= entry && entry < sbi->max_cluster; +} + extern void fat_ent_access_init(struct super_block *sb); extern int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 25d43a5e8a4d..defc2168de91 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -24,7 +24,7 @@ static void fat12_ent_blocknr(struct super_block *sb, int entry, { struct msdos_sb_info *sbi = MSDOS_SB(sb); int bytes = entry + (entry >> 1); - WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry); + WARN_ON(!fat_valid_entry(sbi, entry)); *offset = bytes & (sb->s_blocksize - 1); *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); } @@ -34,7 +34,7 @@ static void fat_ent_blocknr(struct super_block *sb, int entry, { struct msdos_sb_info *sbi = MSDOS_SB(sb); int bytes = (entry << sbi->fatent_shift); - WARN_ON(entry < FAT_START_ENT || sbi->max_cluster <= entry); + WARN_ON(!fat_valid_entry(sbi, entry)); *offset = bytes & 
(sb->s_blocksize - 1); *blocknr = sbi->fat_start + (bytes >> sb->s_blocksize_bits); } @@ -354,7 +354,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) int err, offset; sector_t blocknr; - if (entry < FAT_START_ENT || sbi->max_cluster <= entry) { + if (!fat_valid_entry(sbi, entry)) { fatent_brelse(fatent); fat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", entry); return -EIO; -- cgit v1.2.3-59-g8ed1b From f423420c23899469a3ba4e100def43ab26f2e0bf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 21 Aug 2018 21:59:48 -0700 Subject: fat: propagate 64-bit inode timestamps Now that we pass down 64-bit timestamps from VFS, we just need to convert them correctly into on-disk timestamps. To make that work correctly, this changes the last use of time_to_tm() in the kernel to time64_to_tm(), which also lets us remove that deprecated interface. Similarly, the time_t use in fat_time_fat2unix() truncates the timestamp on the way in, which can be avoided by using types that are wide enough to hold the intermediate values during the conversion. [hirofumi@mail.parknet.co.jp: remove useless temporary variable, needless long long] Link: http://lkml.kernel.org/r/20180619153646.3637529-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: OGAWA Hirofumi Cc: Jeff Layton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/dir.c | 2 +- fs/fat/fat.h | 6 +++--- fs/fat/inode.c | 20 ++++++-------------- fs/fat/misc.c | 13 +++++++------ fs/fat/namei_msdos.c | 17 ++++++----------- fs/fat/namei_vfat.c | 20 +++++++------------- 6 files changed, 30 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 8e100c3bf72c..7f5f3699fc6c 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -1130,7 +1130,7 @@ error: return err; } -int fat_alloc_new_dir(struct inode *dir, struct timespec *ts) +int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) { struct super_block *sb = dir->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); diff --git a/fs/fat/fat.h b/fs/fat/fat.h index df84d5710b59..9d7d2d5da28b 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -304,7 +304,7 @@ extern int fat_scan_logstart(struct inode *dir, int i_logstart, struct fat_slot_info *sinfo); extern int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, struct msdos_dir_entry **de); -extern int fat_alloc_new_dir(struct inode *dir, struct timespec *ts); +extern int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts); extern int fat_add_entries(struct inode *dir, void *slots, int nr_slots, struct fat_slot_info *sinfo); extern int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo); @@ -412,9 +412,9 @@ void fat_msg(struct super_block *sb, const char *level, const char *fmt, ...); } while (0) extern int fat_clusters_flush(struct super_block *sb); extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster); -extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, +extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs); -extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts, +extern void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs); extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index bfd589ea74c0..d6b81e31f9f5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -508,7 +508,6 @@
static int fat_validate_dir(struct inode *dir) /* doesn't deal with root inode */ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) { - struct timespec ts; struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int error; @@ -559,14 +558,11 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; - fat_time_fat2unix(sbi, &ts, de->time, de->date, 0); - inode->i_mtime = timespec_to_timespec64(ts); + fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); if (sbi->options.isvfat) { - fat_time_fat2unix(sbi, &ts, de->ctime, + fat_time_fat2unix(sbi, &inode->i_ctime, de->ctime, de->cdate, de->ctime_cs); - inode->i_ctime = timespec_to_timespec64(ts); - fat_time_fat2unix(sbi, &ts, 0, de->adate, 0); - inode->i_atime = timespec_to_timespec64(ts); + fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); } else inode->i_ctime = inode->i_atime = inode->i_mtime; @@ -843,7 +839,6 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) static int __fat_write_inode(struct inode *inode, int wait) { - struct timespec ts; struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; @@ -881,16 +876,13 @@ retry: raw_entry->size = cpu_to_le32(inode->i_size); raw_entry->attr = fat_make_attrs(inode); fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart); - ts = timespec64_to_timespec(inode->i_mtime); - fat_time_unix2fat(sbi, &ts, &raw_entry->time, + fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time, &raw_entry->date, NULL); if (sbi->options.isvfat) { __le16 atime; - ts = timespec64_to_timespec(inode->i_ctime); - fat_time_unix2fat(sbi, &ts, &raw_entry->ctime, + fat_time_unix2fat(sbi, &inode->i_ctime, &raw_entry->ctime, &raw_entry->cdate, &raw_entry->ctime_cs); - ts = timespec64_to_timespec(inode->i_atime); - fat_time_unix2fat(sbi, &ts, &atime, + fat_time_unix2fat(sbi, &inode->i_atime, &atime, &raw_entry->adate, NULL); } spin_unlock(&sbi->inode_hash_lock); diff --git a/fs/fat/misc.c b/fs/fat/misc.c index f9bdc1e01c98..573836dcaefc 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -180,17 +180,18 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) #define IS_LEAP_YEAR(y) (!((y) & 3) && (y) != YEAR_2100) /* Linear day numbers of the respective 1sts in non-leap years. */ -static time_t days_in_year[] = { +static long days_in_year[] = { /* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */ 0, 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, }; /* Convert a FAT time/date pair to a UNIX date (seconds since 1 1 70). 
*/ -void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, +void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 __time, __le16 __date, u8 time_cs) { u16 time = le16_to_cpu(__time), date = le16_to_cpu(__date); - time_t second, day, leap_day, month, year; + time64_t second; + long day, leap_day, month, year; year = date >> 9; month = max(1, (date >> 5) & 0xf); @@ -205,7 +206,7 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, second = (time & 0x1f) << 1; second += ((time >> 5) & 0x3f) * SECS_PER_MIN; second += (time >> 11) * SECS_PER_HOUR; - second += (year * 365 + leap_day + second += (time64_t)(year * 365 + leap_day + days_in_year[month] + day + DAYS_DELTA) * SECS_PER_DAY; @@ -224,11 +225,11 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts, } /* Convert linear UNIX date to a FAT time/date pair. */ -void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec *ts, +void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs) { struct tm tm; - time_to_tm(ts->tv_sec, + time64_to_tm(ts->tv_sec, (sbi->options.tz_set ? sbi->options.time_offset : -sys_tz.tz_minuteswest) * SECS_PER_MIN, &tm); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 16a832c37d66..efb8c40c9d27 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -225,7 +225,7 @@ static struct dentry *msdos_lookup(struct inode *dir, struct dentry *dentry, /***** Creates a directory entry (name is already formatted). */ static int msdos_add_entry(struct inode *dir, const unsigned char *name, int is_dir, int is_hid, int cluster, - struct timespec *ts, struct fat_slot_info *sinfo) + struct timespec64 *ts, struct fat_slot_info *sinfo) { struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); struct msdos_dir_entry de; @@ -250,7 +250,7 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, if (err) return err; - dir->i_ctime = dir->i_mtime = timespec_to_timespec64(*ts); + dir->i_ctime = dir->i_mtime = *ts; if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else @@ -267,7 +267,6 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct inode *inode = NULL; struct fat_slot_info sinfo; struct timespec64 ts; - struct timespec t; unsigned char msdos_name[MSDOS_NAME]; int err, is_hid; @@ -286,8 +285,7 @@ static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, } ts = current_time(dir); - t = timespec64_to_timespec(ts); - err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &t, &sinfo); + err = msdos_add_entry(dir, msdos_name, 0, is_hid, 0, &ts, &sinfo); if (err) goto out; inode = fat_build_inode(sb, sinfo.de, sinfo.i_pos); @@ -347,7 +345,6 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; unsigned char msdos_name[MSDOS_NAME]; struct timespec64 ts; - struct timespec t; int err, is_hid, cluster; mutex_lock(&MSDOS_SB(sb)->s_lock); @@ -365,13 +362,12 @@ static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } ts = current_time(dir); - t = timespec64_to_timespec(ts); - cluster = fat_alloc_new_dir(dir, &t); + cluster = fat_alloc_new_dir(dir, &ts); if (cluster < 0) { err = cluster; goto out; } - err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &t, &sinfo); + err = msdos_add_entry(dir, msdos_name, 1, is_hid, cluster, &ts, &sinfo); if (err) goto out_free; inc_nlink(dir); @@ -503,9 +499,8 @@ static int do_msdos_rename(struct inode *old_dir, 
unsigned char *old_name, new_i_pos = MSDOS_I(new_inode)->i_pos; fat_detach(new_inode); } else { - struct timespec t = timespec64_to_timespec(ts); err = msdos_add_entry(new_dir, new_name, is_dir, is_hid, 0, - &t, &sinfo); + &ts, &sinfo); if (err) goto out; new_i_pos = sinfo.i_pos; diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 9a5469120caa..82cd1e69cbdf 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -577,7 +577,7 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, static int vfat_build_slots(struct inode *dir, const unsigned char *name, int len, int is_dir, int cluster, - struct timespec *ts, + struct timespec64 *ts, struct msdos_dir_slot *slots, int *nr_slots) { struct msdos_sb_info *sbi = MSDOS_SB(dir->i_sb); @@ -653,7 +653,7 @@ out_free: } static int vfat_add_entry(struct inode *dir, const struct qstr *qname, - int is_dir, int cluster, struct timespec *ts, + int is_dir, int cluster, struct timespec64 *ts, struct fat_slot_info *sinfo) { struct msdos_dir_slot *slots; @@ -678,7 +678,7 @@ static int vfat_add_entry(struct inode *dir, const struct qstr *qname, goto cleanup; /* update timestamp */ - dir->i_ctime = dir->i_mtime = dir->i_atime = timespec_to_timespec64(*ts); + dir->i_ctime = dir->i_mtime = dir->i_atime = *ts; if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else @@ -762,14 +762,12 @@ static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct inode *inode; struct fat_slot_info sinfo; struct timespec64 ts; - struct timespec t; int err; mutex_lock(&MSDOS_SB(sb)->s_lock); ts = current_time(dir); - t = timespec64_to_timespec(ts); - err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &t, &sinfo); + err = vfat_add_entry(dir, &dentry->d_name, 0, 0, &ts, &sinfo); if (err) goto out; inode_inc_iversion(dir); @@ -853,19 +851,17 @@ static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; struct fat_slot_info sinfo; struct timespec64 ts; - struct timespec t; int err, cluster; mutex_lock(&MSDOS_SB(sb)->s_lock); ts = current_time(dir); - t = timespec64_to_timespec(ts); - cluster = fat_alloc_new_dir(dir, &t); + cluster = fat_alloc_new_dir(dir, &ts); if (cluster < 0) { err = cluster; goto out; } - err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &t, &sinfo); + err = vfat_add_entry(dir, &dentry->d_name, 1, cluster, &ts, &sinfo); if (err) goto out_free; inode_inc_iversion(dir); @@ -904,7 +900,6 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode, *new_inode; struct fat_slot_info old_sinfo, sinfo; struct timespec64 ts; - struct timespec t; loff_t new_i_pos; int err, is_dir, update_dotdot, corrupt = 0; struct super_block *sb = old_dir->i_sb; @@ -939,9 +934,8 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, new_i_pos = MSDOS_I(new_inode)->i_pos; fat_detach(new_inode); } else { - t = timespec64_to_timespec(ts); err = vfat_add_entry(new_dir, &new_dentry->d_name, is_dir, 0, - &t, &sinfo); + &ts, &sinfo); if (err) goto out; new_i_pos = sinfo.i_pos; -- cgit v1.2.3-59-g8ed1b From d9edcbc42c77b719e03dedb2aff719ae19659a0f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 21 Aug 2018 22:01:09 -0700 Subject: adfs: use timespec64 for time conversion We just truncate the seconds to 32-bit in one place now, so this can trivially be converted over to using timespec64 consistently. 
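For background, here is a small user-space sketch of the arithmetic behind the conversion. It assumes the usual interpretation of ADFS stamps (a 40-bit centisecond counter since 1900-01-01, split across the load and exec addresses); the struct and helper names are illustrative, not the driver's own:

#include <stdio.h>
#include <stdint.h>

/* seconds between the RISC OS epoch (1900-01-01) and the Unix epoch
 * (1970-01-01): 70 years including 17 leap days; the driver keeps the
 * equivalent difference in nanoseconds */
#define RISC_OS_TO_UNIX_SECS 2208988800ULL

struct ts64 {			/* stand-in for struct timespec64 */
	int64_t tv_sec;
	long tv_nsec;
};

/* assumed on-disk layout: the low byte of the load address holds the
 * top 8 bits of a 40-bit centisecond counter, the exec address the
 * low 32 bits */
static struct ts64 adfs_stamp_to_unix(uint32_t loadaddr, uint32_t execaddr)
{
	uint64_t cs = ((uint64_t)(loadaddr & 0xff) << 32) | execaddr;
	struct ts64 ts;

	ts.tv_sec = (int64_t)(cs / 100) - RISC_OS_TO_UNIX_SECS;
	ts.tv_nsec = (long)(cs % 100) * 10000000;	/* cs -> ns */
	return ts;
}

int main(void)
{
	/* 0x336e996a00 centiseconds == 2208988800 s == 1970-01-01,
	 * so this should print 0.000000000 */
	struct ts64 ts = adfs_stamp_to_unix(0xfffffd33, 0x6e996a00);

	printf("%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	return 0;
}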
Link: http://lkml.kernel.org/r/20180620100133.4035614-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/adfs/inode.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index e91028d4340a..66621e96f9af 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -167,7 +167,7 @@ adfs_mode2atts(struct super_block *sb, struct inode *inode) * of time to convert from RISC OS epoch to Unix epoch. */ static void -adfs_adfs2unix_time(struct timespec *tv, struct inode *inode) +adfs_adfs2unix_time(struct timespec64 *tv, struct inode *inode) { unsigned int high, low; /* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since @@ -195,11 +195,11 @@ adfs_adfs2unix_time(struct timespec *tv, struct inode *inode) /* convert from RISC OS to Unix epoch */ nsec -= nsec_unix_epoch_diff_risc_os_epoch; - *tv = ns_to_timespec(nsec); + *tv = ns_to_timespec64(nsec); return; cur_time: - *tv = timespec64_to_timespec(current_time(inode)); + *tv = current_time(inode); return; too_early: @@ -242,7 +242,6 @@ adfs_unix2adfs_time(struct inode *inode, unsigned int secs) struct inode * adfs_iget(struct super_block *sb, struct object_info *obj) { - struct timespec ts; struct inode *inode; inode = new_inode(sb); @@ -271,9 +270,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj) ADFS_I(inode)->stamped = ((obj->loadaddr & 0xfff00000) == 0xfff00000); inode->i_mode = adfs_atts2mode(sb, inode); - ts = timespec64_to_timespec(inode->i_mtime); - adfs_adfs2unix_time(&ts, inode); - inode->i_mtime = timespec_to_timespec64(ts); + adfs_adfs2unix_time(&inode->i_mtime, inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; -- cgit v1.2.3-59-g8ed1b From 3e811f053aec66e8a6d2a0ee3d031e7c988e3d15 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 21 Aug 2018 22:01:13 -0700 Subject: fs/sysv/inode.c: use ktime_get_real_seconds() for superblock stamp get_seconds() is deprecated in favor of ktime_get_real_seconds(), which returns a 64-bit timestamp. In the SYSV file system, the superblock timestamp is only 32 bits wide, and it is used to check whether a file system is clean, so the best solution seems to be to force a wraparound and explicitly convert it to an unsigned 32-bit value. This is independent of the inode timestamps that are also 32-bit wide on disk and that come from current_time(). 
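To illustrate why the forced wraparound is well defined, a small user-space sketch (the 0x7c269d38 constant is the one used in sysv_sync_fs(); the sample value is made up):

#include <stdio.h>
#include <stdint.h>

#define SYSV_CLEAN_BASE 0x7c269d38u	/* constant used by sysv_sync_fs() */

int main(void)
{
	/* 64-bit wall-clock seconds, e.g. from ktime_get_real_seconds();
	 * made-up value chosen to be wider than 32 bits */
	int64_t now = 0x100000000LL + 1534913984;

	/* explicit truncation: well-defined modulo-2^32 wraparound */
	uint32_t time = (uint32_t)now;

	/* the clean marker and the later comparison wrap the same way,
	 * so the check stays consistent on 32-bit and 64-bit kernels */
	uint32_t state = SYSV_CLEAN_BASE - time;

	printf("now=%lld time=%u state=%u clean=%d\n",
	       (long long)now, time, state,
	       state == SYSV_CLEAN_BASE - time);
	return 0;
}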
Link: http://lkml.kernel.org/r/20180713145236.3152513-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Thomas Gleixner Reviewed-by: Andrew Morton Cc: Alexander Viro Cc: Christoph Hellwig Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sysv/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index bec9f79adb25..499a20a5a010 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -35,7 +35,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait) { struct sysv_sb_info *sbi = SYSV_SB(sb); - unsigned long time = get_seconds(), old_time; + u32 time = (u32)ktime_get_real_seconds(), old_time; mutex_lock(&sbi->s_lock); @@ -46,8 +46,8 @@ static int sysv_sync_fs(struct super_block *sb, int wait) */ old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); if (sbi->s_type == FSTYPE_SYSV4) { - if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) - *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38 - time); + if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38u - old_time)) + *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38u - time); *sbi->s_sb_time = cpu_to_fs32(sbi, time); mark_buffer_dirty(sbi->s_bh2); } -- cgit v1.2.3-59-g8ed1b