From bb230c124730f21eea13deab433f9f8fc96bd5f3 Mon Sep 17 00:00:00 2001 From: Bill O'Donnell Date: Mon, 12 Oct 2015 05:15:45 +1100 Subject: xfs: create global stats and stats_clear in sysfs Currently, xfs global stats are in procfs. This patch introduces (replicates) the global stats in sysfs. Additionally a stats_clear file is introduced in sysfs. Signed-off-by: Bill O'Donnell Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_stats.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_stats.h | 2 ++ fs/xfs/xfs_super.c | 20 +++++++++---- fs/xfs/xfs_sysctl.c | 15 ++-------- fs/xfs/xfs_sysfs.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_sysfs.h | 1 + 6 files changed, 178 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index f2240383d4bb..e19b84a327d1 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -29,6 +29,88 @@ static int counter_val(int idx) return val; } +int xfs_stats_format(char *buf) +{ + int i, j; + int len = 0; + __uint64_t xs_xstrat_bytes = 0; + __uint64_t xs_write_bytes = 0; + __uint64_t xs_read_bytes = 0; + + static const struct xstats_entry { + char *desc; + int endpoint; + } xstats[] = { + { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC }, + { "abt", XFSSTAT_END_ALLOC_BTREE }, + { "blk_map", XFSSTAT_END_BLOCK_MAPPING }, + { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE }, + { "dir", XFSSTAT_END_DIRECTORY_OPS }, + { "trans", XFSSTAT_END_TRANSACTIONS }, + { "ig", XFSSTAT_END_INODE_OPS }, + { "log", XFSSTAT_END_LOG_OPS }, + { "push_ail", XFSSTAT_END_TAIL_PUSHING }, + { "xstrat", XFSSTAT_END_WRITE_CONVERT }, + { "rw", XFSSTAT_END_READ_WRITE_OPS }, + { "attr", XFSSTAT_END_ATTRIBUTE_OPS }, + { "icluster", XFSSTAT_END_INODE_CLUSTER }, + { "vnodes", XFSSTAT_END_VNODE_OPS }, + { "buf", XFSSTAT_END_BUF }, + { "abtb2", XFSSTAT_END_ABTB_V2 }, + { "abtc2", XFSSTAT_END_ABTC_V2 }, + { "bmbt2", XFSSTAT_END_BMBT_V2 }, + { "ibt2", XFSSTAT_END_IBT_V2 }, + { "fibt2", XFSSTAT_END_FIBT_V2 }, + /* we print both series of quota information together */ + { "qm", XFSSTAT_END_QM }, + }; + + /* Loop over all stats groups */ + + for (i = j = 0; i < ARRAY_SIZE(xstats); i++) { + len += snprintf(buf + len, PATH_MAX - len, "%s", + xstats[i].desc); + /* inner loop does each group */ + for (; j < xstats[i].endpoint; j++) + len += snprintf(buf + len, PATH_MAX - len, " %u", + counter_val(j)); + len += snprintf(buf + len, PATH_MAX - len, "\n"); + } + /* extra precision counters */ + for_each_possible_cpu(i) { + xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes; + xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes; + xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; + } + + len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", + xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); + len += snprintf(buf + len, PATH_MAX-len, "debug %u\n", +#if defined(DEBUG) + 1); +#else + 0); +#endif + + return len; +} + +void xfs_stats_clearall(void) +{ + int c; + __uint32_t vn_active; + + xfs_notice(NULL, "Clearing xfsstats"); + for_each_possible_cpu(c) { + preempt_disable(); + /* save vn_active, it's a universal truth! */ + vn_active = per_cpu(xfsstats, c).vn_active; + memset(&per_cpu(xfsstats, c), 0, sizeof(struct xfsstats)); + per_cpu(xfsstats, c).vn_active = vn_active; + preempt_enable(); + } +} + static int xfs_stat_proc_show(struct seq_file *m, void *v) { int i, j; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index c8f238b8299a..18807b576394 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -18,6 +18,8 @@ #ifndef __XFS_STATS_H__ #define __XFS_STATS_H__ +int xfs_stats_format(char *buf); +void xfs_stats_clearall(void); #if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 904f637cfa5f..0dfc53ba62fb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -61,6 +61,7 @@ static kmem_zone_t *xfs_ioend_zone; mempool_t *xfs_ioend_pool; static struct kset *xfs_kset; /* top-level xfs sysfs dir */ +static struct xfs_kobj xfs_stats_kobj; /* global stats sysfs attrs */ #ifdef DEBUG static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif @@ -1838,19 +1839,25 @@ init_xfs_fs(void) xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj); if (!xfs_kset) { error = -ENOMEM; - goto out_sysctl_unregister;; + goto out_sysctl_unregister; } + xfs_stats_kobj.kobject.kset = xfs_kset; + error = xfs_sysfs_init(&xfs_stats_kobj, &xfs_stats_ktype, NULL, + "stats"); + if (error) + goto out_kset_unregister; + #ifdef DEBUG xfs_dbg_kobj.kobject.kset = xfs_kset; error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug"); if (error) - goto out_kset_unregister; + goto out_remove_stats_kobj; #endif error = xfs_qm_init(); if (error) - goto out_remove_kobj; + goto out_remove_dbg_kobj; error = register_filesystem(&xfs_fs_type); if (error) @@ -1859,11 +1866,13 @@ init_xfs_fs(void) out_qm_exit: xfs_qm_exit(); - out_remove_kobj: + out_remove_dbg_kobj: #ifdef DEBUG xfs_sysfs_del(&xfs_dbg_kobj); - out_kset_unregister: + out_remove_stats_kobj: #endif + xfs_sysfs_del(&xfs_stats_kobj); + out_kset_unregister: kset_unregister(xfs_kset); out_sysctl_unregister: xfs_sysctl_unregister(); @@ -1889,6 +1898,7 @@ exit_xfs_fs(void) #ifdef DEBUG xfs_sysfs_del(&xfs_dbg_kobj); #endif + xfs_sysfs_del(&xfs_stats_kobj); kset_unregister(xfs_kset); xfs_sysctl_unregister(); xfs_cleanup_procfs(); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index a0c8067cea6f..5defabb391cb 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -19,6 +19,7 @@ #include #include #include "xfs_error.h" +#include "xfs_stats.h" static struct ctl_table_header *xfs_table_header; @@ -31,22 +32,12 @@ xfs_stats_clear_proc_handler( size_t *lenp, loff_t *ppos) { - int c, ret, *valp = ctl->data; - __uint32_t vn_active; + int ret, *valp = ctl->data; ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); if (!ret && write && *valp) { - xfs_notice(NULL, "Clearing xfsstats"); - for_each_possible_cpu(c) { - preempt_disable(); - /* save vn_active, it's a universal truth! */ - vn_active = per_cpu(xfsstats, c).vn_active; - memset(&per_cpu(xfsstats, c), 0, - sizeof(struct xfsstats)); - per_cpu(xfsstats, c).vn_active = vn_active; - preempt_enable(); - } + xfs_stats_clearall(); xfs_stats_clear = 0; } diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index aa03670851d8..a094e20f567b 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -21,6 +21,7 @@ #include "xfs_log_format.h" #include "xfs_log.h" #include "xfs_log_priv.h" +#include "xfs_stats.h" struct xfs_sysfs_attr { struct attribute attr; @@ -38,6 +39,8 @@ to_attr(struct attribute *attr) static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name) #define XFS_SYSFS_ATTR_RO(name) \ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name) +#define XFS_SYSFS_ATTR_WO(name) \ + static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_WO(name) #define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr @@ -125,6 +128,78 @@ struct kobj_type xfs_dbg_ktype = { #endif /* DEBUG */ + +/* stats */ + +STATIC ssize_t +stats_show( + char *buf, + void *data) +{ + return xfs_stats_format(buf); +} +XFS_SYSFS_ATTR_RO(stats); + +STATIC ssize_t +stats_clear_store( + const char *buf, + size_t count, + void *data) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val != 1) + return -EINVAL; + xfs_stats_clearall(); + return count; +} +XFS_SYSFS_ATTR_WO(stats_clear); + +static struct attribute *xfs_stats_attrs[] = { + ATTR_LIST(stats), + ATTR_LIST(stats_clear), + NULL, +}; + +STATIC ssize_t +xfs_stats_show( + struct kobject *kobject, + struct attribute *attr, + char *buf) +{ + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0; +} + +STATIC ssize_t +xfs_stats_store( + struct kobject *kobject, + struct attribute *attr, + const char *buf, + size_t count) +{ + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0; +} + +static struct sysfs_ops xfs_stats_ops = { + .show = xfs_stats_show, + .store = xfs_stats_store, +}; + +struct kobj_type xfs_stats_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_stats_ops, + .default_attrs = xfs_stats_attrs, +}; + /* xlog */ STATIC ssize_t diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 240eee35f342..be692e59938d 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -22,6 +22,7 @@ extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ extern struct kobj_type xfs_dbg_ktype; /* debug */ extern struct kobj_type xfs_log_ktype; /* xlog */ +extern struct kobj_type xfs_stats_ktype; /* stats */ static inline struct xfs_kobj * to_kobj(struct kobject *kobject) -- cgit v1.2.3-59-g8ed1b From 32f0ea05219e5212cafcbeaa255e627314f87e7e Mon Sep 17 00:00:00 2001 From: Bill O'Donnell Date: Mon, 12 Oct 2015 05:16:45 +1100 Subject: xfs: create symlink proc/fs/xfs/stat to sys/fs/xfs/stats As a part of the work to move xfs global stats from procfs to sysfs, this patch creates the symlink from proc/fs/xfs/stat to sys/fs/xfs/stats. Signed-off-by: Bill O'Donnell Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_stats.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index e19b84a327d1..e6efebbdb793 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -243,9 +243,10 @@ xfs_init_procfs(void) if (!proc_mkdir("fs/xfs", NULL)) goto out; - if (!proc_create("fs/xfs/stat", 0, NULL, - &xfs_stat_proc_fops)) + if (!proc_symlink("fs/xfs/stat", NULL, + "/sys/fs/xfs/stats/stats")) goto out_remove_xfs_dir; + #ifdef CONFIG_XFS_QUOTA if (!proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops)) -- cgit v1.2.3-59-g8ed1b From 50cf5b7402633265fe11430fd63af82c401fad46 Mon Sep 17 00:00:00 2001 From: Bill O'Donnell Date: Mon, 12 Oct 2015 05:17:45 +1100 Subject: xfs: remove unused procfs code As a part of the work to move xfs global stats from procfs to sysfs, this patch removes the now unused procfs code that was xfs stat specific. Signed-off-by: Bill O'Donnell Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_stats.c | 74 ------------------------------------------------------ 1 file changed, 74 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index e6efebbdb793..f481da09c388 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -111,80 +111,6 @@ void xfs_stats_clearall(void) } } -static int xfs_stat_proc_show(struct seq_file *m, void *v) -{ - int i, j; - __uint64_t xs_xstrat_bytes = 0; - __uint64_t xs_write_bytes = 0; - __uint64_t xs_read_bytes = 0; - - static const struct xstats_entry { - char *desc; - int endpoint; - } xstats[] = { - { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC }, - { "abt", XFSSTAT_END_ALLOC_BTREE }, - { "blk_map", XFSSTAT_END_BLOCK_MAPPING }, - { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE }, - { "dir", XFSSTAT_END_DIRECTORY_OPS }, - { "trans", XFSSTAT_END_TRANSACTIONS }, - { "ig", XFSSTAT_END_INODE_OPS }, - { "log", XFSSTAT_END_LOG_OPS }, - { "push_ail", XFSSTAT_END_TAIL_PUSHING }, - { "xstrat", XFSSTAT_END_WRITE_CONVERT }, - { "rw", XFSSTAT_END_READ_WRITE_OPS }, - { "attr", XFSSTAT_END_ATTRIBUTE_OPS }, - { "icluster", XFSSTAT_END_INODE_CLUSTER }, - { "vnodes", XFSSTAT_END_VNODE_OPS }, - { "buf", XFSSTAT_END_BUF }, - { "abtb2", XFSSTAT_END_ABTB_V2 }, - { "abtc2", XFSSTAT_END_ABTC_V2 }, - { "bmbt2", XFSSTAT_END_BMBT_V2 }, - { "ibt2", XFSSTAT_END_IBT_V2 }, - { "fibt2", XFSSTAT_END_FIBT_V2 }, - /* we print both series of quota information together */ - { "qm", XFSSTAT_END_QM }, - }; - - /* Loop over all stats groups */ - for (i = j = 0; i < ARRAY_SIZE(xstats); i++) { - seq_printf(m, "%s", xstats[i].desc); - /* inner loop does each group */ - for (; j < xstats[i].endpoint; j++) - seq_printf(m, " %u", counter_val(j)); - seq_putc(m, '\n'); - } - /* extra precision counters */ - for_each_possible_cpu(i) { - xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes; - xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes; - xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; - } - - seq_printf(m, "xpc %Lu %Lu %Lu\n", - xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); - seq_printf(m, "debug %u\n", -#if defined(DEBUG) - 1); -#else - 0); -#endif - return 0; -} - -static int xfs_stat_proc_open(struct inode *inode, struct file *file) -{ - return single_open(file, xfs_stat_proc_show, NULL); -} - -static const struct file_operations xfs_stat_proc_fops = { - .owner = THIS_MODULE, - .open = xfs_stat_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - /* legacy quota interfaces */ #ifdef CONFIG_XFS_QUOTA static int xqm_proc_show(struct seq_file *m, void *v) -- cgit v1.2.3-59-g8ed1b From a27c264009cb236dc209875043a0c237af46a1af Mon Sep 17 00:00:00 2001 From: Bill O'Donnell Date: Mon, 12 Oct 2015 05:18:45 +1100 Subject: xfs: consolidate sysfs ops As a part of the series to move xfs global stats from procfs to sysfs, this patch consolidates the sysfs ops functions and removes redundancy. Signed-off-by: Bill O'Donnell Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_sysfs.c | 182 +++++++++++++++++++---------------------------------- 1 file changed, 63 insertions(+), 119 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index a094e20f567b..fa3dc66d82f1 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -25,8 +25,9 @@ struct xfs_sysfs_attr { struct attribute attr; - ssize_t (*show)(char *buf, void *data); - ssize_t (*store)(const char *buf, size_t count, void *data); + ssize_t (*show)(struct kobject *kobject, char *buf); + ssize_t (*store)(struct kobject *kobject, const char *buf, + size_t count); }; static inline struct xfs_sysfs_attr * @@ -54,14 +55,42 @@ struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, }; +STATIC ssize_t +xfs_sysfs_object_show( + struct kobject *kobject, + struct attribute *attr, + char *buf) +{ + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->show ? xfs_attr->show(kobject, buf) : 0; +} + +STATIC ssize_t +xfs_sysfs_object_store( + struct kobject *kobject, + struct attribute *attr, + const char *buf, + size_t count) +{ + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->store ? xfs_attr->store(kobject, buf, count) : 0; +} + +static const struct sysfs_ops xfs_sysfs_ops = { + .show = xfs_sysfs_object_show, + .store = xfs_sysfs_object_store, +}; + #ifdef DEBUG /* debug */ STATIC ssize_t log_recovery_delay_store( + struct kobject *kobject, const char *buf, - size_t count, - void *data) + size_t count) { int ret; int val; @@ -80,8 +109,8 @@ log_recovery_delay_store( STATIC ssize_t log_recovery_delay_show( - char *buf, - void *data) + struct kobject *kobject, + char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay); } @@ -92,49 +121,20 @@ static struct attribute *xfs_dbg_attrs[] = { NULL, }; -STATIC ssize_t -xfs_dbg_show( - struct kobject *kobject, - struct attribute *attr, - char *buf) -{ - struct xfs_sysfs_attr *xfs_attr = to_attr(attr); - - return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0; -} - -STATIC ssize_t -xfs_dbg_store( - struct kobject *kobject, - struct attribute *attr, - const char *buf, - size_t count) -{ - struct xfs_sysfs_attr *xfs_attr = to_attr(attr); - - return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0; -} - -static struct sysfs_ops xfs_dbg_ops = { - .show = xfs_dbg_show, - .store = xfs_dbg_store, -}; - struct kobj_type xfs_dbg_ktype = { .release = xfs_sysfs_release, - .sysfs_ops = &xfs_dbg_ops, + .sysfs_ops = &xfs_sysfs_ops, .default_attrs = xfs_dbg_attrs, }; #endif /* DEBUG */ - /* stats */ STATIC ssize_t stats_show( - char *buf, - void *data) + struct kobject *kobject, + char *buf) { return xfs_stats_format(buf); } @@ -142,9 +142,9 @@ XFS_SYSFS_ATTR_RO(stats); STATIC ssize_t stats_clear_store( + struct kobject *kobject, const char *buf, - size_t count, - void *data) + size_t count) { int ret; int val; @@ -166,50 +166,30 @@ static struct attribute *xfs_stats_attrs[] = { NULL, }; -STATIC ssize_t -xfs_stats_show( - struct kobject *kobject, - struct attribute *attr, - char *buf) -{ - struct xfs_sysfs_attr *xfs_attr = to_attr(attr); - - return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0; -} - -STATIC ssize_t -xfs_stats_store( - struct kobject *kobject, - struct attribute *attr, - const char *buf, - size_t count) -{ - struct xfs_sysfs_attr *xfs_attr = to_attr(attr); - - return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0; -} - -static struct sysfs_ops xfs_stats_ops = { - .show = xfs_stats_show, - .store = xfs_stats_store, -}; - struct kobj_type xfs_stats_ktype = { .release = xfs_sysfs_release, - .sysfs_ops = &xfs_stats_ops, + .sysfs_ops = &xfs_sysfs_ops, .default_attrs = xfs_stats_attrs, }; /* xlog */ +static inline struct xlog * +to_xlog(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + + return container_of(kobj, struct xlog, l_kobj); +} + STATIC ssize_t log_head_lsn_show( - char *buf, - void *data) + struct kobject *kobject, + char *buf) { - struct xlog *log = data; int cycle; int block; + struct xlog *log = to_xlog(kobject); spin_lock(&log->l_icloglock); cycle = log->l_curr_cycle; @@ -222,12 +202,12 @@ XFS_SYSFS_ATTR_RO(log_head_lsn); STATIC ssize_t log_tail_lsn_show( - char *buf, - void *data) + struct kobject *kobject, + char *buf) { - struct xlog *log = data; int cycle; int block; + struct xlog *log = to_xlog(kobject); xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block); return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block); @@ -236,12 +216,13 @@ XFS_SYSFS_ATTR_RO(log_tail_lsn); STATIC ssize_t reserve_grant_head_show( - char *buf, - void *data) + struct kobject *kobject, + char *buf) + { - struct xlog *log = data; int cycle; int bytes; + struct xlog *log = to_xlog(kobject); xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes); return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes); @@ -250,12 +231,12 @@ XFS_SYSFS_ATTR_RO(reserve_grant_head); STATIC ssize_t write_grant_head_show( - char *buf, - void *data) + struct kobject *kobject, + char *buf) { - struct xlog *log = data; int cycle; int bytes; + struct xlog *log = to_xlog(kobject); xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes); return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes); @@ -270,45 +251,8 @@ static struct attribute *xfs_log_attrs[] = { NULL, }; -static inline struct xlog * -to_xlog(struct kobject *kobject) -{ - struct xfs_kobj *kobj = to_kobj(kobject); - return container_of(kobj, struct xlog, l_kobj); -} - -STATIC ssize_t -xfs_log_show( - struct kobject *kobject, - struct attribute *attr, - char *buf) -{ - struct xlog *log = to_xlog(kobject); - struct xfs_sysfs_attr *xfs_attr = to_attr(attr); - - return xfs_attr->show ? xfs_attr->show(buf, log) : 0; -} - -STATIC ssize_t -xfs_log_store( - struct kobject *kobject, - struct attribute *attr, - const char *buf, - size_t count) -{ - struct xlog *log = to_xlog(kobject); - struct xfs_sysfs_attr *xfs_attr = to_attr(attr); - - return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0; -} - -static struct sysfs_ops xfs_log_ops = { - .show = xfs_log_show, - .store = xfs_log_store, -}; - struct kobj_type xfs_log_ktype = { .release = xfs_sysfs_release, - .sysfs_ops = &xfs_log_ops, + .sysfs_ops = &xfs_sysfs_ops, .default_attrs = xfs_log_attrs, }; -- cgit v1.2.3-59-g8ed1b From 80529c45ab66188a0b3814ba31409b31f5fcb53d Mon Sep 17 00:00:00 2001 From: Bill O'Donnell Date: Mon, 12 Oct 2015 05:19:45 +1100 Subject: xfs: pass xfsstats structures to handlers and macros This patch is the next step toward per-fs xfs stats. The patch makes the show and clear routines able to handle any stats structure associated with a kobject. Instead of a single global xfsstats structure, add kobject and a pointer to a per-cpu struct xfsstats. Modify the macros that manipulate the stats accordingly: XFS_STATS_INC, XFS_STATS_DEC, and XFS_STATS_ADD now access xfsstats->xs_stats. The sysfs functions need to get from the kobject back to the xfsstats structure which contains it, and pass the pointer to the ->xs_stats percpu structure into the show & clear routines. Signed-off-by: Bill O'Donnell Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_linux.h | 7 +++++++ fs/xfs/xfs_stats.c | 32 +++++++++++++++----------------- fs/xfs/xfs_stats.h | 22 +++++++++++----------- fs/xfs/xfs_super.c | 21 +++++++++++++++------ fs/xfs/xfs_sysctl.c | 2 +- fs/xfs/xfs_sysfs.c | 16 ++++++++++++++-- 6 files changed, 63 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 85f883dd6207..ec0e239a0fa9 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -171,6 +171,13 @@ struct xfs_kobj { struct completion complete; }; +struct xstats { + struct xfsstats __percpu *xs_stats; + struct xfs_kobj xs_kobj; +}; + +extern struct xstats xfsstats; + /* Kernel uid/gid conversion. These are used to convert to/from the on disk * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally. * The conversion here is type only, the value will remain the same since we diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index f481da09c388..3348b0b6177d 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -18,18 +18,18 @@ #include "xfs.h" #include -DEFINE_PER_CPU(struct xfsstats, xfsstats); +struct xstats xfsstats; -static int counter_val(int idx) +static int counter_val(struct xfsstats __percpu *stats, int idx) { int val = 0, cpu; for_each_possible_cpu(cpu) - val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx)); + val += *(((__u32 *)per_cpu_ptr(stats, cpu) + idx)); return val; } -int xfs_stats_format(char *buf) +int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) { int i, j; int len = 0; @@ -73,14 +73,14 @@ int xfs_stats_format(char *buf) /* inner loop does each group */ for (; j < xstats[i].endpoint; j++) len += snprintf(buf + len, PATH_MAX - len, " %u", - counter_val(j)); + counter_val(stats, j)); len += snprintf(buf + len, PATH_MAX - len, "\n"); } /* extra precision counters */ for_each_possible_cpu(i) { - xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes; - xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes; - xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; + xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes; + xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes; + xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes; } len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", @@ -95,7 +95,7 @@ int xfs_stats_format(char *buf) return len; } -void xfs_stats_clearall(void) +void xfs_stats_clearall(struct xfsstats __percpu *stats) { int c; __uint32_t vn_active; @@ -104,9 +104,9 @@ void xfs_stats_clearall(void) for_each_possible_cpu(c) { preempt_disable(); /* save vn_active, it's a universal truth! */ - vn_active = per_cpu(xfsstats, c).vn_active; - memset(&per_cpu(xfsstats, c), 0, sizeof(struct xfsstats)); - per_cpu(xfsstats, c).vn_active = vn_active; + vn_active = per_cpu_ptr(stats, c)->vn_active; + memset(per_cpu_ptr(stats, c), 0, sizeof(*stats)); + per_cpu_ptr(stats, c)->vn_active = vn_active; preempt_enable(); } } @@ -117,10 +117,8 @@ static int xqm_proc_show(struct seq_file *m, void *v) { /* maximum; incore; ratio free to inuse; freelist */ seq_printf(m, "%d\t%d\t%d\t%u\n", - 0, - counter_val(XFSSTAT_END_XQMSTAT), - 0, - counter_val(XFSSTAT_END_XQMSTAT + 1)); + 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT), + 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1)); return 0; } @@ -144,7 +142,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v) seq_printf(m, "qm"); for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++) - seq_printf(m, " %u", counter_val(j)); + seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j)); seq_putc(m, '\n'); return 0; } diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 18807b576394..54f2260299d1 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -18,9 +18,6 @@ #ifndef __XFS_STATS_H__ #define __XFS_STATS_H__ -int xfs_stats_format(char *buf); -void xfs_stats_clearall(void); - #if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) #include @@ -217,15 +214,18 @@ struct xfsstats { __uint64_t xs_read_bytes; }; -DECLARE_PER_CPU(struct xfsstats, xfsstats); +int xfs_stats_format(struct xfsstats __percpu *stats, char *buf); +void xfs_stats_clearall(struct xfsstats __percpu *stats); +extern struct xstats xfsstats; -/* - * We don't disable preempt, not too worried about poking the - * wrong CPU's stat for now (also aggregated before reporting). - */ -#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++) -#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--) -#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc)) +#define XFS_STATS_INC(v) \ + (per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++) + +#define XFS_STATS_DEC(v) \ + (per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--) + +#define XFS_STATS_ADD(v, inc) \ + (per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc)) extern int xfs_init_procfs(void); extern void xfs_cleanup_procfs(void); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0dfc53ba62fb..e1a35a5104e8 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -61,7 +61,6 @@ static kmem_zone_t *xfs_ioend_zone; mempool_t *xfs_ioend_pool; static struct kset *xfs_kset; /* top-level xfs sysfs dir */ -static struct xfs_kobj xfs_stats_kobj; /* global stats sysfs attrs */ #ifdef DEBUG static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif @@ -1842,11 +1841,18 @@ init_xfs_fs(void) goto out_sysctl_unregister; } - xfs_stats_kobj.kobject.kset = xfs_kset; - error = xfs_sysfs_init(&xfs_stats_kobj, &xfs_stats_ktype, NULL, + xfsstats.xs_kobj.kobject.kset = xfs_kset; + + xfsstats.xs_stats = alloc_percpu(struct xfsstats); + if (!xfsstats.xs_stats) { + error = -ENOMEM; + goto out_kset_unregister; + } + + error = xfs_sysfs_init(&xfsstats.xs_kobj, &xfs_stats_ktype, NULL, "stats"); if (error) - goto out_kset_unregister; + goto out_free_stats; #ifdef DEBUG xfs_dbg_kobj.kobject.kset = xfs_kset; @@ -1871,7 +1877,9 @@ init_xfs_fs(void) xfs_sysfs_del(&xfs_dbg_kobj); out_remove_stats_kobj: #endif - xfs_sysfs_del(&xfs_stats_kobj); + xfs_sysfs_del(&xfsstats.xs_kobj); + out_free_stats: + free_percpu(xfsstats.xs_stats); out_kset_unregister: kset_unregister(xfs_kset); out_sysctl_unregister: @@ -1898,7 +1906,8 @@ exit_xfs_fs(void) #ifdef DEBUG xfs_sysfs_del(&xfs_dbg_kobj); #endif - xfs_sysfs_del(&xfs_stats_kobj); + xfs_sysfs_del(&xfsstats.xs_kobj); + free_percpu(xfsstats.xs_stats); kset_unregister(xfs_kset); xfs_sysctl_unregister(); xfs_cleanup_procfs(); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 5defabb391cb..aed74d3f8da9 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -37,7 +37,7 @@ xfs_stats_clear_proc_handler( ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); if (!ret && write && *valp) { - xfs_stats_clearall(); + xfs_stats_clearall(xfsstats.xs_stats); xfs_stats_clear = 0; } diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index fa3dc66d82f1..ee70f5dec9dc 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -131,12 +131,22 @@ struct kobj_type xfs_dbg_ktype = { /* stats */ +static inline struct xstats * +to_xstats(struct kobject *kobject) +{ + struct xfs_kobj *kobj = to_kobj(kobject); + + return container_of(kobj, struct xstats, xs_kobj); +} + STATIC ssize_t stats_show( struct kobject *kobject, char *buf) { - return xfs_stats_format(buf); + struct xstats *stats = to_xstats(kobject); + + return xfs_stats_format(stats->xs_stats, buf); } XFS_SYSFS_ATTR_RO(stats); @@ -148,6 +158,7 @@ stats_clear_store( { int ret; int val; + struct xstats *stats = to_xstats(kobject); ret = kstrtoint(buf, 0, &val); if (ret) @@ -155,7 +166,8 @@ stats_clear_store( if (val != 1) return -EINVAL; - xfs_stats_clearall(); + + xfs_stats_clearall(stats->xs_stats); return count; } XFS_SYSFS_ATTR_WO(stats_clear); -- cgit v1.2.3-59-g8ed1b From 5cb13dcd0fac071b45c4bebe1801a08ff0d89cad Mon Sep 17 00:00:00 2001 From: Zhaohongjiang Date: Mon, 12 Oct 2015 15:28:39 +1100 Subject: cancel the setfilesize transation when io error happen When I ran xfstest/073 case, the remount process was blocked to wait transactions to be zero. I found there was a io error happened, and the setfilesize transaction was not released properly. We should add the changes to cancel the io error in this case. Reproduction steps: 1. dd if=/dev/zero of=xfs1.img bs=1M count=2048 2. mkfs.xfs xfs1.img 3. losetup -f ./xfs1.img /dev/loop0 4. mount -t xfs /dev/loop0 /home/test_dir/ 5. mkdir /home/test_dir/test 6. mkfs.xfs -dfile,name=image,size=2g 7. mount -t xfs -o loop image /home/test_dir/test 8. cp a file bigger than 2g to /home/test_dir/test 9. mount -t xfs -o remount,ro /home/test_dir/test [ dchinner: moved io error detection to xfs_setfilesize_ioend() after transaction context restoration. ] Signed-off-by: Zhao Hongjiang Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 50ab2879b9da..e485e31813fa 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -172,6 +172,12 @@ xfs_setfilesize_ioend( current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); + /* we abort the update if there was an IO error */ + if (ioend->io_error) { + xfs_trans_cancel(tp); + return ioend->io_error; + } + return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } @@ -212,14 +218,17 @@ xfs_end_io( ioend->io_error = -EIO; goto done; } - if (ioend->io_error) - goto done; /* * For unwritten extents we need to issue transactions to convert a * range to normal written extens after the data I/O has finished. + * Detecting and handling completion IO errors is done individually + * for each case as different cleanup operations need to be performed + * on error. */ if (ioend->io_type == XFS_IO_UNWRITTEN) { + if (ioend->io_error) + goto done; error = xfs_iomap_write_unwritten(ip, ioend->io_offset, ioend->io_size); } else if (ioend->io_append_trans) { -- cgit v1.2.3-59-g8ed1b From 009c6e871e98aa23bc2e58474c3d9feb05dd1ae6 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 12 Oct 2015 15:34:20 +1100 Subject: xfs: add missing ilock around dio write last extent alignment The iomap codepath (via get_blocks()) acquires and release the inode lock in the case of a direct write that requires block allocation. This is because xfs_iomap_write_direct() allocates a transaction, which means the ilock must be dropped and reacquired after the transaction is allocated and reserved. xfs_iomap_write_direct() invokes xfs_iomap_eof_align_last_fsb() before the transaction is created and thus before the ilock is reacquired. This can lead to calls to xfs_iread_extents() and reads of the in-core extent list without any synchronization (via xfs_bmap_eof() and xfs_bmap_last_extent()). xfs_iread_extents() assert fails if the ilock is not held, but this is not currently seen in practice as the current callers had already invoked xfs_bmapi_read(). What has been seen in practice are reports of crashes down in the xfs_bmap_eof() codepath on direct writes due to seemingly bogus pointer references from xfs_iext_get_ext(). While an explicit reproducer is not currently available to confirm the cause of the problem, crash analysis and code inspection from David Jeffrey had identified the insufficient locking. xfs_iomap_eof_align_last_fsb() is called from other contexts with the inode lock already held, so we cannot acquire it therein. __xfs_get_blocks() acquires and drops the ilock with variable flags to cover the event that the extent list must be read in. The common case is that __xfs_get_blocks() acquires the shared ilock. To provide locking around the last extent alignment call without adding more lock cycles to the dio path, update xfs_iomap_write_direct() to expect the shared ilock held on entry and do the extent alignment under its protection. Demote the lock, if necessary, from __xfs_get_blocks() and push the xfs_qm_dqattach() call outside of the shared lock critical section. Also, add an assert to document that the extent list is always expected to be present in this path. Otherwise, we risk a call to xfs_iread_extents() while under the shared ilock. This is safe as all current callers have executed an xfs_bmapi_read() call under the current iolock context. Reported-by: David Jeffery Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 10 +++++----- fs/xfs/xfs_iomap.c | 33 ++++++++++++++++++++++++++------- fs/xfs/xfs_pnfs.c | 5 +++++ 3 files changed, 36 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e485e31813fa..e4fff5898c1c 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1408,12 +1408,12 @@ __xfs_get_blocks( imap.br_startblock == DELAYSTARTBLOCK))) { if (direct || xfs_get_extsz_hint(ip)) { /* - * Drop the ilock in preparation for starting the block - * allocation transaction. It will be retaken - * exclusively inside xfs_iomap_write_direct for the - * actual allocation. + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. */ - xfs_iunlock(ip, lockmode); + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); + error = xfs_iomap_write_direct(ip, offset, size, &imap, nimaps); if (error) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1f86033171c8..1beda331d8a7 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -131,20 +131,29 @@ xfs_iomap_write_direct( uint qblocks, resblks, resrtextents; int committed; int error; - - error = xfs_qm_dqattach(ip, 0); - if (error) - return error; + int lockmode; rt = XFS_IS_REALTIME_INODE(ip); extsz = xfs_get_extsz_hint(ip); + lockmode = XFS_ILOCK_SHARED; /* locked by caller */ + + ASSERT(xfs_isilocked(ip, lockmode)); offset_fsb = XFS_B_TO_FSBT(mp, offset); last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count))); if ((offset + count) > XFS_ISIZE(ip)) { + /* + * Assert that the in-core extent list is present since this can + * call xfs_iread_extents() and we only have the ilock shared. + * This should be safe because the lock was held around a bmapi + * call in the caller and we only need it to access the in-core + * list. + */ + ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags & + XFS_IFEXTENTS); error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb); if (error) - return error; + goto out_unlock; } else { if (nmaps && (imap->br_startblock == HOLESTARTBLOCK)) last_fsb = MIN(last_fsb, (xfs_fileoff_t) @@ -173,6 +182,15 @@ xfs_iomap_write_direct( quota_flag = XFS_QMOPT_RES_REGBLKS; } + /* + * Drop the shared lock acquired by the caller, attach the dquot if + * necessary and move on to transaction setup. + */ + xfs_iunlock(ip, lockmode); + error = xfs_qm_dqattach(ip, 0); + if (error) + return error; + /* * Allocate and setup the transaction */ @@ -187,7 +205,8 @@ xfs_iomap_write_direct( return error; } - xfs_ilock(ip, XFS_ILOCK_EXCL); + lockmode = XFS_ILOCK_EXCL; + xfs_ilock(ip, lockmode); error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag); if (error) @@ -229,7 +248,7 @@ xfs_iomap_write_direct( error = xfs_alert_fsblock_zero(ip, imap); out_unlock: - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(ip, lockmode); return error; out_bmap_cancel: diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index ab4a6066f7ca..dc6221942b85 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -181,6 +181,11 @@ xfs_fs_map_blocks( ASSERT(imap.br_startblock != DELAYSTARTBLOCK); if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) { + /* + * xfs_iomap_write_direct() expects to take ownership of + * the shared ilock. + */ + xfs_ilock(ip, XFS_ILOCK_SHARED); error = xfs_iomap_write_direct(ip, offset, length, &imap, nimaps); if (error) -- cgit v1.2.3-59-g8ed1b From b7cdc66be54b64daef593894d12ecc405f117829 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 12 Oct 2015 15:40:24 +1100 Subject: xfs: log local to remote symlink conversions correctly on v5 supers A local format symlink inode is converted to extent format when an extended attribute is set on an inode as part of the attribute fork creation. This means a block is allocated, the local symlink target name is copied to the block and the block is logged. Currently, xfs_bmap_local_to_extents() handles logging the remote block data based on the size of the data fork prior to the conversion. This is not correct on v5 superblock filesystems, which add an additional header to remote symlink blocks that is nonexistent in local format inodes. As a result, the full length of the remote symlink block content is not logged. This can lead to corruption should a crash occur and log recovery replay this transaction. Since a callout is already used to initialize the new remote symlink block, update the local-to-extents conversion mechanism to make the callout also responsible for logging the block. It is already required to set the log buffer type and format the block appropriately based on the superblock version. This ensures the remote symlink is always logged correctly. Note that xfs_bmap_local_to_extents() is only called for symlinks so there are no other callouts that require modification. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_bmap.c | 10 ++++++---- fs/xfs/libxfs/xfs_symlink_remote.c | 3 +++ 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 8e2010d53b07..b5cb61ecd5bb 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -948,14 +948,16 @@ xfs_bmap_local_to_extents( bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); /* - * Initialise the block and copy the data + * Initialize the block, copy the data and log the remote buffer. * - * Note: init_fn must set the buffer log item type correctly! + * The callout is responsible for logging because the remote format + * might differ from the local format and thus we don't know how much to + * log here. Note that init_fn must also set the buffer log item type + * correctly. */ init_fn(tp, bp, ip, ifp); - /* account for the change in fork size and log everything */ - xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); + /* account for the change in fork size */ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); xfs_bmap_local_to_extents_empty(ip, whichfork); flags |= XFS_ILOG_CORE; diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 8f8af05b3f13..b9884aab46d3 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -183,6 +183,7 @@ xfs_symlink_local_to_remote( if (!xfs_sb_version_hascrc(&mp->m_sb)) { bp->b_ops = NULL; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); + xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); return; } @@ -198,4 +199,6 @@ xfs_symlink_local_to_remote( buf = bp->b_addr; buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp); memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes); + xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) + + ifp->if_bytes - 1); } -- cgit v1.2.3-59-g8ed1b From 5bf97b1cb430a3a6da4341ae913299706ebc52f5 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 12 Oct 2015 15:41:29 +1100 Subject: xfs: Print name and pid when memory allocation loops This patch adds comm name and pid to warning messages printed by kmem_alloc(), kmem_zone_alloc() and xfs_buf_allocate_memory(). This will help telling which memory allocations (e.g. kernel worker threads, OOM victim tasks, neither) are stalling because these functions are passing __GFP_NOWARN which suppresses not only backtrace but comm name and pid. Signed-off-by: Tetsuo Handa Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/kmem.c | 10 ++++++---- fs/xfs/xfs_buf.c | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index a7a3a63bb360..535c13677e7a 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -55,8 +55,9 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) return ptr; if (!(++retries % 100)) xfs_err(NULL, - "possible memory allocation deadlock in %s (mode:0x%x)", - __func__, lflags); + "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)", + current->comm, current->pid, + __func__, lflags); congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } @@ -120,8 +121,9 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags) return ptr; if (!(++retries % 100)) xfs_err(NULL, - "possible memory allocation deadlock in %s (mode:0x%x)", - __func__, lflags); + "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)", + current->comm, current->pid, + __func__, lflags); congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8ecffb35935b..cac62e14d73a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -354,7 +354,8 @@ retry: */ if (!(++retries % 100)) xfs_err(NULL, - "possible memory allocation deadlock in %s (mode:0x%x)", + "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)", + current->comm, current->pid, __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); -- cgit v1.2.3-59-g8ed1b From a45086e27dfa21a4b39134f7505c8f60a3ecdec4 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 12 Oct 2015 15:59:25 +1100 Subject: xfs: validate metadata LSNs against log on v5 superblocks Since the onset of v5 superblocks, the LSN of the last modification has been included in a variety of on-disk data structures. This LSN is used to provide log recovery ordering guarantees (e.g., to ensure an older log recovery item is not replayed over a newer target data structure). While this works correctly from the point a filesystem is formatted and mounted, userspace tools have some problematic behaviors that defeat this mechanism. For example, xfs_repair historically zeroes out the log unconditionally (regardless of whether corruption is detected). If this occurs, the LSN of the filesystem is reset and the log is now in a problematic state with respect to on-disk metadata structures that might have a larger LSN. Until either the log catches up to the highest previously used metadata LSN or each affected data structure is modified and written out without incident (which resets the metadata LSN), log recovery is susceptible to filesystem corruption. This problem is ultimately addressed and repaired in the associated userspace tools. The kernel is still responsible to detect the problem and notify the user that something is wrong. Check the superblock LSN at mount time and fail the mount if it is invalid. From that point on, trigger verifier failure on any metadata I/O where an invalid LSN is detected. This results in a filesystem shutdown and guarantees that we do not log metadata changes with invalid LSNs on disk. Since this is a known issue with a known recovery path, present a warning to instruct the user how to recover. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc.c | 12 ++++++--- fs/xfs/libxfs/xfs_attr_leaf.c | 3 +++ fs/xfs/libxfs/xfs_btree.c | 17 ++++++++++-- fs/xfs/libxfs/xfs_da_btree.c | 4 +++ fs/xfs/libxfs/xfs_dir2_block.c | 3 +++ fs/xfs/libxfs/xfs_dir2_data.c | 3 +++ fs/xfs/libxfs/xfs_dir2_leaf.c | 3 +++ fs/xfs/libxfs/xfs_dir2_node.c | 3 +++ fs/xfs/libxfs/xfs_ialloc.c | 10 +++++-- fs/xfs/libxfs/xfs_sb.c | 10 +++++++ fs/xfs/libxfs/xfs_symlink_remote.c | 4 +++ fs/xfs/xfs_log.c | 54 ++++++++++++++++++++++++++++++++++++-- fs/xfs/xfs_log.h | 1 + fs/xfs/xfs_log_priv.h | 51 +++++++++++++++++++++++++++++++++++ fs/xfs/xfs_log_recover.c | 12 ++++++++- 15 files changed, 180 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index ffad7f20342f..d39e6d8a4949 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -482,7 +482,9 @@ xfs_agfl_verify( be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) return false; } - return true; + + return xfs_log_check_lsn(mp, + be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)); } static void @@ -2259,9 +2261,13 @@ xfs_agf_verify( { struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); - if (xfs_sb_version_hascrc(&mp->m_sb) && - !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) return false; + if (!xfs_log_check_lsn(mp, + be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn))) + return false; + } if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 33df52d97ec7..aa187f7ba2dd 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -41,6 +41,7 @@ #include "xfs_buf_item.h" #include "xfs_cksum.h" #include "xfs_dir2.h" +#include "xfs_log.h" /* @@ -266,6 +267,8 @@ xfs_attr3_leaf_verify( return false; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) return false; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) + return false; } else { if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) return false; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index f7d7ee7a2607..235d026c7f9c 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -32,6 +32,7 @@ #include "xfs_trace.h" #include "xfs_cksum.h" #include "xfs_alloc.h" +#include "xfs_log.h" /* * Cursor allocation zone. @@ -243,8 +244,14 @@ bool xfs_btree_lblock_verify_crc( struct xfs_buf *bp) { - if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn))) + return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); + } return true; } @@ -275,8 +282,14 @@ bool xfs_btree_sblock_verify_crc( struct xfs_buf *bp) { - if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb)) + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) + return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); + } return true; } diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index be43248a5822..e89a0f8f827c 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -39,6 +39,7 @@ #include "xfs_trace.h" #include "xfs_cksum.h" #include "xfs_buf_item.h" +#include "xfs_log.h" /* * xfs_da_btree.c @@ -150,6 +151,8 @@ xfs_da3_node_verify( return false; if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn) return false; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn))) + return false; } else { if (ichdr.magic != XFS_DA_NODE_MAGIC) return false; @@ -322,6 +325,7 @@ xfs_da3_node_create( if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_node_hdr *hdr3 = bp->b_addr; + memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr)); ichdr.magic = XFS_DA3_NODE_MAGIC; hdr3->info.blkno = cpu_to_be64(bp->b_bn); hdr3->info.owner = cpu_to_be64(args->dp->i_ino); diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 4778d1dd511a..9c10e2b8cfcb 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -34,6 +34,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_cksum.h" +#include "xfs_log.h" /* * Local function prototypes. @@ -71,6 +72,8 @@ xfs_dir3_block_verify( return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) + return false; } else { if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) return false; diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 824131e71bc5..af71a84f343c 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -31,6 +31,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" +#include "xfs_log.h" /* * Check the consistency of the data block. @@ -224,6 +225,8 @@ xfs_dir3_data_verify( return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) + return false; } else { if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC)) return false; diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index f300240ebb8d..3923e1f94697 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -33,6 +33,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" +#include "xfs_log.h" /* * Local function declarations. @@ -164,6 +165,8 @@ xfs_dir3_leaf_verify( return false; if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) return false; + if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn))) + return false; } else { if (leaf->hdr.info.magic != cpu_to_be16(magic)) return false; diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index cc28e924545b..70b0cb2fd556 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -33,6 +33,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_cksum.h" +#include "xfs_log.h" /* * Function declarations. @@ -97,6 +98,8 @@ xfs_dir3_free_verify( return false; if (be64_to_cpu(hdr3->blkno) != bp->b_bn) return false; + if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) + return false; } else { if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC)) return false; diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 54deb2d12ac6..70c1db99f6a7 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -38,6 +38,7 @@ #include "xfs_icreate_item.h" #include "xfs_icache.h" #include "xfs_trace.h" +#include "xfs_log.h" /* @@ -2500,9 +2501,14 @@ xfs_agi_verify( struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); - if (xfs_sb_version_hascrc(&mp->m_sb) && - !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) + return false; + if (!xfs_log_check_lsn(mp, + be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn))) return false; + } + /* * Validate the magic number of the agi block. */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 47425140f343..a0b071d881a0 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -35,6 +35,7 @@ #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_ialloc_btree.h" +#include "xfs_log.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. @@ -163,6 +164,15 @@ xfs_mount_validate_sb( "Filesystem can not be safely mounted by this kernel."); return -EINVAL; } + } else if (xfs_sb_version_hascrc(sbp)) { + /* + * We can't read verify the sb LSN because the read verifier is + * called before the log is allocated and processed. We know the + * log is set up before write verifier (!check_version) calls, + * so just check it here. + */ + if (!xfs_log_check_lsn(mp, sbp->sb_lsn)) + return -EFSCORRUPTED; } if (xfs_sb_version_has_pquotino(sbp)) { diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index b9884aab46d3..cb6fd20a4d3d 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -31,6 +31,7 @@ #include "xfs_cksum.h" #include "xfs_trans.h" #include "xfs_buf_item.h" +#include "xfs_log.h" /* @@ -60,6 +61,7 @@ xfs_symlink_hdr_set( if (!xfs_sb_version_hascrc(&mp->m_sb)) return 0; + memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr)); dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC); dsl->sl_offset = cpu_to_be32(offset); dsl->sl_bytes = cpu_to_be32(size); @@ -116,6 +118,8 @@ xfs_symlink_verify( return false; if (dsl->sl_owner == 0) return false; + if (!xfs_log_check_lsn(mp, be64_to_cpu(dsl->sl_lsn))) + return false; return true; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index aaadee0969c9..0c8ef767c3a9 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3165,11 +3165,19 @@ xlog_state_switch_iclogs( } if (log->l_curr_block >= log->l_logBBsize) { + /* + * Rewind the current block before the cycle is bumped to make + * sure that the combined LSN never transiently moves forward + * when the log wraps to the next cycle. This is to support the + * unlocked sample of these fields from xlog_valid_lsn(). Most + * other cases should acquire l_icloglock. + */ + log->l_curr_block -= log->l_logBBsize; + ASSERT(log->l_curr_block >= 0); + smp_wmb(); log->l_curr_cycle++; if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM) log->l_curr_cycle++; - log->l_curr_block -= log->l_logBBsize; - ASSERT(log->l_curr_block >= 0); } ASSERT(iclog == log->l_iclog); log->l_iclog = iclog->ic_next; @@ -4023,3 +4031,45 @@ xlog_iclogs_empty( return 1; } +/* + * Verify that an LSN stamped into a piece of metadata is valid. This is + * intended for use in read verifiers on v5 superblocks. + */ +bool +xfs_log_check_lsn( + struct xfs_mount *mp, + xfs_lsn_t lsn) +{ + struct xlog *log = mp->m_log; + bool valid; + + /* + * norecovery mode skips mount-time log processing and unconditionally + * resets the in-core LSN. We can't validate in this mode, but + * modifications are not allowed anyways so just return true. + */ + if (mp->m_flags & XFS_MOUNT_NORECOVERY) + return true; + + /* + * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is + * handled by recovery and thus safe to ignore here. + */ + if (lsn == NULLCOMMITLSN) + return true; + + valid = xlog_valid_lsn(mp->m_log, lsn); + + /* warn the user about what's gone wrong before verifier failure */ + if (!valid) { + spin_lock(&log->l_icloglock); + xfs_warn(mp, +"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). " +"Please unmount and run xfs_repair (>= v4.3) to resolve.", + CYCLE_LSN(lsn), BLOCK_LSN(lsn), + log->l_curr_cycle, log->l_curr_block); + spin_unlock(&log->l_icloglock); + } + + return valid; +} diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 09d91d3166cd..aa533a7d50f2 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -181,5 +181,6 @@ bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); void xfs_log_worker(struct work_struct *work); void xfs_log_quiesce(struct xfs_mount *mp); +bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 950f3f94720c..8daba7491b13 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -560,4 +560,55 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) remove_wait_queue(wq, &wait); } +/* + * The LSN is valid so long as it is behind the current LSN. If it isn't, this + * means that the next log record that includes this metadata could have a + * smaller LSN. In turn, this means that the modification in the log would not + * replay. + */ +static inline bool +xlog_valid_lsn( + struct xlog *log, + xfs_lsn_t lsn) +{ + int cur_cycle; + int cur_block; + bool valid = true; + + /* + * First, sample the current lsn without locking to avoid added + * contention from metadata I/O. The current cycle and block are updated + * (in xlog_state_switch_iclogs()) and read here in a particular order + * to avoid false negatives (e.g., thinking the metadata LSN is valid + * when it is not). + * + * The current block is always rewound before the cycle is bumped in + * xlog_state_switch_iclogs() to ensure the current LSN is never seen in + * a transiently forward state. Instead, we can see the LSN in a + * transiently behind state if we happen to race with a cycle wrap. + */ + cur_cycle = ACCESS_ONCE(log->l_curr_cycle); + smp_rmb(); + cur_block = ACCESS_ONCE(log->l_curr_block); + + if ((CYCLE_LSN(lsn) > cur_cycle) || + (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) { + /* + * If the metadata LSN appears invalid, it's possible the check + * above raced with a wrap to the next log cycle. Grab the lock + * to check for sure. + */ + spin_lock(&log->l_icloglock); + cur_cycle = log->l_curr_cycle; + cur_block = log->l_curr_block; + spin_unlock(&log->l_icloglock); + + if ((CYCLE_LSN(lsn) > cur_cycle) || + (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) + valid = false; + } + + return valid; +} + #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 512a0945d52a..f8f1363dc045 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -4609,9 +4609,19 @@ xlog_recover( int error; /* find the tail of the log */ - if ((error = xlog_find_tail(log, &head_blk, &tail_blk))) + error = xlog_find_tail(log, &head_blk, &tail_blk); + if (error) return error; + /* + * The superblock was read before the log was available and thus the LSN + * could not be verified. Check the superblock LSN against the current + * LSN now that it's known. + */ + if (xfs_sb_version_hascrc(&log->l_mp->m_sb) && + !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn)) + return -EINVAL; + if (tail_blk != head_blk) { /* There used to be a comment here: * -- cgit v1.2.3-59-g8ed1b From 3136e8bb3054d3bb68942f8f1ee6c26c05f798b0 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 12 Oct 2015 16:02:05 +1100 Subject: xfs: always drain dio before extending aio write submission XFS supports and typically allows concurrent asynchronous direct I/O submission to a single file. One exception to the rule is that file extending dio writes that start beyond the current EOF (e.g., potentially create a hole at EOF) require exclusive I/O access to the file. This is because such writes must zero any pre-existing blocks beyond EOF that are exposed by virtue of now residing within EOF as a result of the write about to be submitted. Before EOF zeroing can occur, the current file i_size must be stabilized to avoid data corruption. In this scenario, XFS upgrades the iolock to exclude any further I/O submission, waits on in-flight I/O to complete to ensure i_size is up to date (i_size is updated on dio write completion) and restarts the various checks against the state of the file. The problem is that this protection sequence is triggered only when the iolock is currently held shared. While this is true for async dio in most cases, the caller may upgrade the lock in advance based on arbitrary circumstances with respect to EOF zeroing. For example, the iolock is always acquired exclusively if the start offset is not block aligned. This means that even though the iolock is already held exclusive for such I/Os, pending I/O is not drained and thus EOF zeroing can occur based on an unstable i_size. This problem has been reproduced as guest data corruption in virtual machines with file-backed qcow2 virtual disks hosted on an XFS filesystem. The virtual disks must be configured with aio=native mode and the must not be truncated out to the maximum file size (as some virt managers will do). Update xfs_file_aio_write_checks() to unconditionally drain in-flight dio before EOF zeroing can occur. Rather than trigger the wait based on iolock state, use a new flag and upgrade the iolock when necessary. Note that this results in a full restart of the inode checks even when the iolock was already held exclusive when technically it is only required to recheck i_size. This should be a rare enough occurrence that it is preferable to keep the code simple rather than create an alternate restart jump target. Signed-off-by: Brian Foster Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e78feb400e22..347b3e07ec2b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -574,6 +574,7 @@ xfs_file_aio_write_checks( struct xfs_inode *ip = XFS_I(inode); ssize_t error = 0; size_t count = iov_iter_count(from); + bool drained_dio = false; restart: error = generic_write_checks(iocb, from); @@ -611,12 +612,13 @@ restart: bool zero = false; spin_unlock(&ip->i_flags_lock); - if (*iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, *iolock); - *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); - iov_iter_reexpand(from, count); - + if (!drained_dio) { + if (*iolock == XFS_IOLOCK_SHARED) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + iov_iter_reexpand(from, count); + } /* * We now have an IO submission barrier in place, but * AIO can do EOF updates during IO completion and hence @@ -626,6 +628,7 @@ restart: * no-op. */ inode_dio_wait(inode); + drained_dio = true; goto restart; } error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); -- cgit v1.2.3-59-g8ed1b From 0a50f162af6ddc2db02c9edc5bbb823c336100a4 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 12 Oct 2015 16:02:08 +1100 Subject: xfs: add an xfs_zero_eof() tracepoint Add a tracepoint in xfs_zero_eof() to facilitate tracking and debugging EOF zeroing events. This has proven useful in the context of other direct I/O tracepoints to ensure EOF zeroing occurs within appropriate file ranges. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 2 ++ fs/xfs/xfs_trace.h | 1 + 2 files changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 347b3e07ec2b..541dcfbc6f49 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -482,6 +482,8 @@ xfs_zero_eof( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(offset > isize); + trace_xfs_zero_eof(ip, isize, offset - isize); + /* * First handle zeroing the block on which isize resides. * diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 5ed36b1e04c1..957f5ccdd84f 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1312,6 +1312,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound); DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize); +DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof); DECLARE_EVENT_CLASS(xfs_itrunc_class, TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), -- cgit v1.2.3-59-g8ed1b From fef4ded8cb2e53a47093b334e7730d55a3badc59 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 12 Oct 2015 16:02:32 +1100 Subject: libxfs: fix two comment typos Just fix two typos in code comments. Signed-off-by: Geliang Tang Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_btree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index f7d7ee7a2607..1bd35323d793 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -222,7 +222,7 @@ xfs_btree_check_ptr( * long-form btree header. * * Prior to calculting the CRC, pull the LSN out of the buffer log item and put - * it into the buffer so recovery knows what the last modifcation was that made + * it into the buffer so recovery knows what the last modification was that made * it to disk. */ void @@ -254,7 +254,7 @@ xfs_btree_lblock_verify_crc( * short-form btree header. * * Prior to calculting the CRC, pull the LSN out of the buffer log item and put - * it into the buffer so recovery knows what the last modifcation was that made + * it into the buffer so recovery knows what the last modification was that made * it to disk. */ void -- cgit v1.2.3-59-g8ed1b From 4e247614a92852e8eb29e49be094b6b981a7895f Mon Sep 17 00:00:00 2001 From: Jan Tulak Date: Mon, 12 Oct 2015 16:02:56 +1100 Subject: xfs: prefix XATTR_LIST_MAX with XFS_ Remove a hard dependency of Linux XATTR_LIST_MAX value by using a prefixed version. This patch reflects the same change in xfsprogs. Signed-off-by: Jan Tulak Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_fs.h | 10 ++++++++++ fs/xfs/xfs_ioctl.c | 2 +- fs/xfs/xfs_ioctl32.c | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 89689c6a43e2..b2b73a998d42 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -489,6 +489,16 @@ typedef struct xfs_swapext #define XFS_FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ #define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ +/* + * ioctl limits + */ +#ifdef XATTR_LIST_MAX +# define XFS_XATTR_LIST_MAX XATTR_LIST_MAX +#else +# define XFS_XATTR_LIST_MAX 65536 +#endif + + /* * ioctl commands that are used by Linux filesystems */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ea7d85af5310..0e692a68aad5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -411,7 +411,7 @@ xfs_attrlist_by_handle( if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t))) return -EFAULT; if (al_hreq.buflen < sizeof(struct attrlist) || - al_hreq.buflen > XATTR_LIST_MAX) + al_hreq.buflen > XFS_XATTR_LIST_MAX) return -EINVAL; /* diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index b88bdc85dd3d..1a05d8ae327d 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -356,7 +356,7 @@ xfs_compat_attrlist_by_handle( sizeof(compat_xfs_fsop_attrlist_handlereq_t))) return -EFAULT; if (al_hreq.buflen < sizeof(struct attrlist) || - al_hreq.buflen > XATTR_LIST_MAX) + al_hreq.buflen > XFS_XATTR_LIST_MAX) return -EINVAL; /* -- cgit v1.2.3-59-g8ed1b From 51fcbfe7092a18a138e28110df3ab7e666bf69ee Mon Sep 17 00:00:00 2001 From: Jan Tulak Date: Mon, 12 Oct 2015 16:03:59 +1100 Subject: xfs: avoid dependency on Linux XATTR_SIZE_MAX Currently, we depends on Linux XATTR value for on disk definition. Which causes trouble on other platforms and maybe also if this value was to change. Fix it by creating a custom definition independent from those in Linux (although with the same values), so it is OK with the be16 fields used for holding these attributes. This patch reflects a change in xfsprogs. Signed-off-by: Jan Tulak Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_attr_remote.c | 2 +- fs/xfs/libxfs/xfs_format.h | 10 +++++++++- fs/xfs/xfs_ioctl.c | 4 ++-- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index f38f9bd81557..5ab95ffa4ae9 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -107,7 +107,7 @@ xfs_attr3_rmt_verify( if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) return false; if (be32_to_cpu(rmt->rm_offset) + - be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX) + be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX) return false; if (rmt->rm_owner == 0) return false; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 9590a069e556..8568de163004 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -59,6 +59,14 @@ struct xfs_ifork; #define XFS_SB_VERSION_BORGBIT 0x4000 /* ASCII only case-insens. */ #define XFS_SB_VERSION_MOREBITSBIT 0x8000 +/* + * The size of a single extended attribute on disk is limited by + * the size of index values within the attribute entries themselves. + * These are be16 fields, so we can only support attribute data + * sizes up to 2^16 bytes in length. + */ +#define XFS_XATTR_SIZE_MAX (1 << 16) + /* * Supported feature bit list is just all bits in the versionnum field because * we've used them all up and understand them all. Except, of course, for the @@ -1483,7 +1491,7 @@ struct xfs_acl { */ #define XFS_ACL_MAX_ENTRIES(mp) \ (xfs_sb_version_hascrc(&mp->m_sb) \ - ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ + ? (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ sizeof(struct xfs_acl_entry) \ : 25) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0e692a68aad5..9963f7c7fc7a 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -455,7 +455,7 @@ xfs_attrmulti_attr_get( unsigned char *kbuf; int error = -EFAULT; - if (*len > XATTR_SIZE_MAX) + if (*len > XFS_XATTR_SIZE_MAX) return -EINVAL; kbuf = kmem_zalloc_large(*len, KM_SLEEP); if (!kbuf) @@ -485,7 +485,7 @@ xfs_attrmulti_attr_set( if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; - if (len > XATTR_SIZE_MAX) + if (len > XFS_XATTR_SIZE_MAX) return -EINVAL; kbuf = memdup_user(ubuf, len); -- cgit v1.2.3-59-g8ed1b From dbd5c8c9a28899c6ca719eb21afc0afba9dd5574 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 12 Oct 2015 16:04:13 +1100 Subject: xfs: pass total block res. as total xfs_bmapi_write() parameter The total field from struct xfs_alloc_arg is a bit of an unknown commodity. It is documented as the total block requirement for the transaction and is used in this manner from most call sites by virtue of passing the total block reservation of the transaction associated with an allocation. Several xfs_bmapi_write() callers pass hardcoded values of 0 or 1 for the total block requirement, which is a historical oddity without any clear reasoning. The xfs_iomap_write_direct() caller, for example, passes 0 for the total block requirement. This has been determined to cause problems in the form of ABBA deadlocks of AGF buffers due to incorrect AG selection in the block allocator. Specifically, the xfs_alloc_space_available() function incorrectly selects an AG that doesn't actually have sufficient space for the allocation. This occurs because the args.total field is 0 and thus the remaining free space check on the AG doesn't actually consider the size of the allocation request. This locks the AGF buffer, the allocation attempt proceeds and ultimately fails (in xfs_alloc_fix_minleft()), and xfs_alloc_vexent() moves on to the next AG. In turn, this can lead to incorrect AG locking order (if the allocator wraps around, attempting to lock AG 0 after acquiring AG N) and thus deadlock if racing with another operation. This problem has been reproduced via generic/299 on smallish (1GB) ramdisk test devices. To avoid this problem, replace the undocumented hardcoded total parameters from the iomap and utility callers to pass the block reservation used for the associated transaction. This is consistent with other xfs_bmapi_write() callers throughout XFS. The assumption is that the total field allows the selection of an AG that can handle the entire operation rather than simply the allocation/range being requested (e.g., resulting btree splits, etc.). This addresses the aforementioned generic/299 hang by ensuring AG selection only occurs when the allocation can be satisfied by the AG. Reported-by: Ross Zwisler Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_bmap_util.c | 2 +- fs/xfs/xfs_iomap.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 3bf4ad0d19e4..eca325e42261 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1027,7 +1027,7 @@ xfs_alloc_file_space( xfs_bmap_init(&free_list, &firstfsb); error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb, alloc_type, &firstfsb, - 0, imapp, &nimaps, &free_list); + resblks, imapp, &nimaps, &free_list); if (error) { goto error0; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1f86033171c8..3d9fa36d472b 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -202,8 +202,8 @@ xfs_iomap_write_direct( xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, - XFS_BMAPI_PREALLOC, &firstfsb, 0, - imap, &nimaps, &free_list); + XFS_BMAPI_PREALLOC, &firstfsb, resblks, imap, + &nimaps, &free_list); if (error) goto out_bmap_cancel; @@ -750,9 +750,9 @@ xfs_iomap_write_allocate( * pointer that the caller gave to us. */ error = xfs_bmapi_write(tp, ip, map_start_fsb, - count_fsb, 0, - &first_block, 1, - imap, &nimaps, &free_list); + count_fsb, 0, &first_block, + nres, imap, &nimaps, + &free_list); if (error) goto trans_cancel; @@ -866,8 +866,8 @@ xfs_iomap_write_unwritten( xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, - XFS_BMAPI_CONVERT, &firstfsb, - 1, &imap, &nimaps, &free_list); + XFS_BMAPI_CONVERT, &firstfsb, resblks, + &imap, &nimaps, &free_list); if (error) goto error_on_bmapi_transaction; -- cgit v1.2.3-59-g8ed1b From 91f9f5fe1e7350e872b3fbc3194e8183bddce514 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 12 Oct 2015 16:04:15 +1100 Subject: xfs: avoid null *src in memcpy call in xlog_write The gcc undefined behavior sanitizer caught this; surely any sane memcpy implementation will no-op if size == 0, but behavior with a *src of NULL is technically undefined (declared nonnull), so avoid it here. We are actually in this situation frequently via xlog_commit_record(), because: struct xfs_log_iovec reg = { .i_addr = NULL, .i_len = 0, .i_type = XLOG_REG_TYPE_COMMIT, }; Reported-by: Eric Sandeen Signed-off-by: Dave Chinner Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 0c8ef767c3a9..af08326b0e41 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -2422,11 +2422,20 @@ xlog_write( &partial_copy_len); xlog_verify_dest_ptr(log, ptr); - /* copy region */ + /* + * Copy region. + * + * Unmount records just log an opheader, so can have + * empty payloads with no data region to copy. Hence we + * only copy the payload if the vector says it has data + * to copy. + */ ASSERT(copy_len >= 0); - memcpy(ptr, reg->i_addr + copy_off, copy_len); - xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len); - + if (copy_len > 0) { + memcpy(ptr, reg->i_addr + copy_off, copy_len); + xlog_write_adv_cnt(&ptr, &len, &log_offset, + copy_len); + } copy_len += start_rec_copy + sizeof(xlog_op_header_t); record_cnt++; data_cnt += contwr ? copy_len : 0; -- cgit v1.2.3-59-g8ed1b From 847f9f6875fb02b576035e3dc31f5e647b7617a7 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 12 Oct 2015 16:04:45 +1100 Subject: xfs: more info from kmem deadlocks and high-level error msgs In an effort to get more useful out of "possible memory allocation deadlock" messages, print the size of the requested allocation, and dump the stack if the xfs error level is tuned high. The stack dump is implemented in define_xfs_printk_level() for error levels >= LOGLEVEL_ERR, partly because it seems generically useful, and also because kmem.c has no knowledge of xfs error level tunables or other such bits, it's very kmem-specific. Signed-off-by: Eric Sandeen Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/kmem.c | 4 ++-- fs/xfs/xfs_message.c | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 535c13677e7a..686ba6fb20dd 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -55,9 +55,9 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) return ptr; if (!(++retries % 100)) xfs_err(NULL, - "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)", + "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)", current->comm, current->pid, - __func__, lflags); + (unsigned int)size, __func__, lflags); congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index d8b67547ab34..11792d888e4e 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -17,6 +17,7 @@ #include "xfs.h" #include "xfs_fs.h" +#include "xfs_error.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" @@ -43,6 +44,7 @@ void func(const struct xfs_mount *mp, const char *fmt, ...) \ { \ struct va_format vaf; \ va_list args; \ + int level; \ \ va_start(args, fmt); \ \ @@ -51,6 +53,11 @@ void func(const struct xfs_mount *mp, const char *fmt, ...) \ \ __xfs_printk(kern_level, mp, &vaf); \ va_end(args); \ + \ + if (!kstrtoint(kern_level, 0, &level) && \ + level <= LOGLEVEL_ERR && \ + xfs_error_level >= XFS_ERRLEVEL_HIGH) \ + xfs_stack_trace(); \ } \ define_xfs_printk_level(xfs_emerg, KERN_EMERG); -- cgit v1.2.3-59-g8ed1b From 225e4635580ce9fb12f8a2dc88473161cd64dbf6 Mon Sep 17 00:00:00 2001 From: Bill O'Donnell Date: Mon, 12 Oct 2015 18:21:19 +1100 Subject: xfs: per-filesystem stats in sysfs This patch implements per-filesystem stats objects in sysfs. It depends on the application of the previous patch series that develops the infrastructure to support both xfs global stats and xfs per-fs stats in sysfs. Stats objects are instantiated when an xfs filesystem is mounted and deleted on unmount. With this patch, the stats directory is created and populated with the familiar stats and stats_clear files. Example: /sys/fs/xfs/sda9/stats/stats /sys/fs/xfs/sda9/stats/stats_clear With this patch, the individual counts within the new per-fs stats file(s) remain at zero. Functions that use the the macros to increment, decrement, and add-to the per-fs stats counts will be covered in a separate new patch to follow this one. Note that the counts within the global stats file (/sys/fs/xfs/stats/stats) advance normally and can be cleared as it was prior to this patch. [dchinner: move setup/teardown to xfs_fs_{fill|put}_super() so it is down before/after any path that uses the per-mount stats. ] Signed-off-by: Bill O'Donnell Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/xfs_mount.c | 11 ++++++++++- fs/xfs/xfs_mount.h | 1 + fs/xfs/xfs_super.c | 14 ++++++++++++-- 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bf92e0c037c7..eb498ce39f4c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -693,10 +693,15 @@ xfs_mountfs( if (error) goto out; - error = xfs_uuid_mount(mp); + error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, + &mp->m_kobj, "stats"); if (error) goto out_remove_sysfs; + error = xfs_uuid_mount(mp); + if (error) + goto out_del_stats; + /* * Set the minimum read and write sizes */ @@ -971,6 +976,8 @@ xfs_mountfs( xfs_da_unmount(mp); out_remove_uuid: xfs_uuid_unmount(mp); + out_del_stats: + xfs_sysfs_del(&mp->m_stats.xs_kobj); out_remove_sysfs: xfs_sysfs_del(&mp->m_kobj); out: @@ -1047,6 +1054,7 @@ xfs_unmountfs( xfs_warn(mp, "Unable to update superblock counters. " "Freespace may not be correct on next mount."); + xfs_log_unmount(mp); xfs_da_unmount(mp); xfs_uuid_unmount(mp); @@ -1056,6 +1064,7 @@ xfs_unmountfs( #endif xfs_free_perag(mp); + xfs_sysfs_del(&mp->m_stats.xs_kobj); xfs_sysfs_del(&mp->m_kobj); } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7999e91cd49a..8795272cb662 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -127,6 +127,7 @@ typedef struct xfs_mount { int64_t m_low_space[XFS_LOWSP_MAX]; /* low free space thresholds */ struct xfs_kobj m_kobj; + struct xstats m_stats; /* per-fs stats */ struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_data_workqueue; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e1a35a5104e8..9d11d3ea8030 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1474,9 +1474,16 @@ xfs_fs_fill_super( if (error) goto out_destroy_workqueues; + /* Allocate stats memory before we do operations that might use it */ + mp->m_stats.xs_stats = alloc_percpu(struct xfsstats); + if (!mp->m_stats.xs_stats) { + error = PTR_ERR(mp->m_stats.xs_stats); + goto out_destroy_counters; + } + error = xfs_readsb(mp, flags); if (error) - goto out_destroy_counters; + goto out_free_stats; error = xfs_finish_flags(mp); if (error) @@ -1545,9 +1552,11 @@ xfs_fs_fill_super( xfs_filestream_unmount(mp); out_free_sb: xfs_freesb(mp); + out_free_stats: + free_percpu(mp->m_stats.xs_stats); out_destroy_counters: xfs_destroy_percpu_counters(mp); -out_destroy_workqueues: + out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); out_close_devices: xfs_close_devices(mp); @@ -1574,6 +1583,7 @@ xfs_fs_put_super( xfs_unmountfs(mp); xfs_freesb(mp); + free_percpu(mp->m_stats.xs_stats); xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); xfs_close_devices(mp); -- cgit v1.2.3-59-g8ed1b From ff6d6af2351caea7db681f4539d0d893e400557a Mon Sep 17 00:00:00 2001 From: Bill O'Donnell Date: Mon, 12 Oct 2015 18:21:22 +1100 Subject: xfs: per-filesystem stats counter implementation This patch modifies the stats counting macros and the callers to those macros to properly increment, decrement, and add-to the xfs stats counts. The counts for global and per-fs stats are correctly advanced, and cleared by writing a "1" to the corresponding clear file. global counts: /sys/fs/xfs/stats/stats per-fs counts: /sys/fs/xfs/sda*/stats/stats global clear: /sys/fs/xfs/stats/stats_clear per-fs clear: /sys/fs/xfs/sda*/stats/stats_clear [dchinner: cleaned up macro variables, removed CONFIG_FS_PROC around stats structures and macros. ] Signed-off-by: Bill O'Donnell Reviewed-by: Eric Sandeen Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc.c | 8 ++++---- fs/xfs/libxfs/xfs_attr.c | 6 +++--- fs/xfs/libxfs/xfs_bmap.c | 20 ++++++++++---------- fs/xfs/libxfs/xfs_btree.h | 39 +++++++++++++++++++++++---------------- fs/xfs/libxfs/xfs_dir2.c | 6 +++--- fs/xfs/xfs_attr_list.c | 2 +- fs/xfs/xfs_buf.c | 18 +++++++++--------- fs/xfs/xfs_dir2_readdir.c | 2 +- fs/xfs/xfs_dquot.c | 14 +++++++------- fs/xfs/xfs_file.c | 12 ++++++------ fs/xfs/xfs_icache.c | 18 +++++++++--------- fs/xfs/xfs_inode.c | 6 +++--- fs/xfs/xfs_ioctl.c | 2 +- fs/xfs/xfs_iomap.c | 4 ++-- fs/xfs/xfs_iops.c | 4 ++-- fs/xfs/xfs_log.c | 22 +++++++++++----------- fs/xfs/xfs_qm.c | 14 +++++++------- fs/xfs/xfs_stats.h | 28 +++++++++++++++++----------- fs/xfs/xfs_super.c | 6 +++--- fs/xfs/xfs_trans.c | 6 +++--- fs/xfs/xfs_trans_ail.c | 12 ++++++------ 21 files changed, 131 insertions(+), 118 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index ffad7f20342f..9b5da7e3b5c2 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -651,8 +651,8 @@ xfs_alloc_ag_vextent( -((long)(args->len))); } - XFS_STATS_INC(xs_allocx); - XFS_STATS_ADD(xs_allocb, args->len); + XFS_STATS_INC(args->mp, xs_allocx); + XFS_STATS_ADD(args->mp, xs_allocb, args->len); return error; } @@ -1808,8 +1808,8 @@ xfs_free_ag_extent( if (!isfl) xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); - XFS_STATS_INC(xs_freex); - XFS_STATS_ADD(xs_freeb, len); + XFS_STATS_INC(mp, xs_freex); + XFS_STATS_ADD(mp, xs_freeb, len); trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index ff065578969f..f949818fa1c7 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -125,7 +125,7 @@ xfs_attr_get( uint lock_mode; int error; - XFS_STATS_INC(xs_attr_get); + XFS_STATS_INC(ip->i_mount, xs_attr_get); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; @@ -209,7 +209,7 @@ xfs_attr_set( int rsvd = (flags & ATTR_ROOT) != 0; int error, err2, committed, local; - XFS_STATS_INC(xs_attr_set); + XFS_STATS_INC(mp, xs_attr_set); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; @@ -412,7 +412,7 @@ xfs_attr_remove( xfs_fsblock_t firstblock; int error; - XFS_STATS_INC(xs_attr_remove); + XFS_STATS_INC(mp, xs_attr_remove); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 8e2010d53b07..5256fe59623b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1435,7 +1435,7 @@ xfs_bmap_search_extents( xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - XFS_STATS_INC(xs_look_exlist); + XFS_STATS_INC(ip->i_mount, xs_look_exlist); ifp = XFS_IFORK_PTR(ip, fork); ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); @@ -1732,7 +1732,7 @@ xfs_bmap_add_extent_delay_real( ASSERT(!bma->cur || (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); - XFS_STATS_INC(xs_add_exlist); + XFS_STATS_INC(mp, xs_add_exlist); #define LEFT r[0] #define RIGHT r[1] @@ -2286,7 +2286,7 @@ xfs_bmap_add_extent_unwritten_real( ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); ASSERT(!isnullstartblock(new->br_startblock)); - XFS_STATS_INC(xs_add_exlist); + XFS_STATS_INC(mp, xs_add_exlist); #define LEFT r[0] #define RIGHT r[1] @@ -2946,7 +2946,7 @@ xfs_bmap_add_extent_hole_real( ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); - XFS_STATS_INC(xs_add_exlist); + XFS_STATS_INC(mp, xs_add_exlist); state = 0; if (whichfork == XFS_ATTR_FORK) @@ -4036,7 +4036,7 @@ xfs_bmapi_read( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - XFS_STATS_INC(xs_blk_mapr); + XFS_STATS_INC(mp, xs_blk_mapr); ifp = XFS_IFORK_PTR(ip, whichfork); @@ -4221,7 +4221,7 @@ xfs_bmapi_delay( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - XFS_STATS_INC(xs_blk_mapw); + XFS_STATS_INC(mp, xs_blk_mapw); if (!(ifp->if_flags & XFS_IFEXTENTS)) { error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); @@ -4525,7 +4525,7 @@ xfs_bmapi_write( ifp = XFS_IFORK_PTR(ip, whichfork); - XFS_STATS_INC(xs_blk_mapw); + XFS_STATS_INC(mp, xs_blk_mapw); if (*firstblock == NULLFSBLOCK) { if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE) @@ -4718,12 +4718,12 @@ xfs_bmap_del_extent( xfs_filblks_t temp2; /* for indirect length calculations */ int state = 0; - XFS_STATS_INC(xs_del_exlist); + mp = ip->i_mount; + XFS_STATS_INC(mp, xs_del_exlist); if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; - mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); @@ -5070,7 +5070,7 @@ xfs_bunmapi( *done = 1; return 0; } - XFS_STATS_INC(xs_blk_unmap); + XFS_STATS_INC(mp, xs_blk_unmap); isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); start = bno; bno = start + len - 1; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 8f18bab73ea5..992dec0638f3 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -84,31 +84,38 @@ union xfs_btree_rec { /* * Generic stats interface */ -#define __XFS_BTREE_STATS_INC(type, stat) \ - XFS_STATS_INC(xs_ ## type ## _2_ ## stat) -#define XFS_BTREE_STATS_INC(cur, stat) \ +#define __XFS_BTREE_STATS_INC(mp, type, stat) \ + XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat) +#define XFS_BTREE_STATS_INC(cur, stat) \ do { \ + struct xfs_mount *__mp = cur->bc_mp; \ switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \ - case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \ - case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \ - case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \ - case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \ + case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \ + case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \ + case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \ + case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ + case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) -#define __XFS_BTREE_STATS_ADD(type, stat, val) \ - XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val) +#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \ + XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val) #define XFS_BTREE_STATS_ADD(cur, stat, val) \ do { \ + struct xfs_mount *__mp = cur->bc_mp; \ switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \ - case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \ - case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \ - case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \ - case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ + case XFS_BTNUM_BNO: \ + __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \ + case XFS_BTNUM_CNT: \ + __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \ + case XFS_BTNUM_BMAP: \ + __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \ + case XFS_BTNUM_INO: \ + __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \ + case XFS_BTNUM_FINO: \ + __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ + case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ } \ } while (0) diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 9de401d297e5..2fb53a5c0a74 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -271,7 +271,7 @@ xfs_dir_createname( rval = xfs_dir_ino_validate(tp->t_mountp, inum); if (rval) return rval; - XFS_STATS_INC(xs_dir_create); + XFS_STATS_INC(dp->i_mount, xs_dir_create); } args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); @@ -365,7 +365,7 @@ xfs_dir_lookup( int lock_mode; ASSERT(S_ISDIR(dp->i_d.di_mode)); - XFS_STATS_INC(xs_dir_lookup); + XFS_STATS_INC(dp->i_mount, xs_dir_lookup); /* * We need to use KM_NOFS here so that lockdep will not throw false @@ -444,7 +444,7 @@ xfs_dir_removename( int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); - XFS_STATS_INC(xs_dir_remove); + XFS_STATS_INC(dp->i_mount, xs_dir_remove); args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); if (!args) diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 65fb37a18e92..0ef7c2ed3f8a 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -511,7 +511,7 @@ xfs_attr_list_int( xfs_inode_t *dp = context->dp; uint lock_mode; - XFS_STATS_INC(xs_attr_list); + XFS_STATS_INC(dp->i_mount, xs_attr_list); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8ecffb35935b..90815c22b22d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -201,7 +201,7 @@ _xfs_buf_alloc( atomic_set(&bp->b_pin_count, 0); init_waitqueue_head(&bp->b_waiters); - XFS_STATS_INC(xb_create); + XFS_STATS_INC(target->bt_mount, xb_create); trace_xfs_buf_init(bp, _RET_IP_); return bp; @@ -357,12 +357,12 @@ retry: "possible memory allocation deadlock in %s (mode:0x%x)", __func__, gfp_mask); - XFS_STATS_INC(xb_page_retries); + XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries); congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - XFS_STATS_INC(xb_page_found); + XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found); nbytes = min_t(size_t, size, PAGE_SIZE - offset); size -= nbytes; @@ -516,7 +516,7 @@ _xfs_buf_find( new_bp->b_pag = pag; spin_unlock(&pag->pag_buf_lock); } else { - XFS_STATS_INC(xb_miss_locked); + XFS_STATS_INC(btp->bt_mount, xb_miss_locked); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); } @@ -529,11 +529,11 @@ found: if (!xfs_buf_trylock(bp)) { if (flags & XBF_TRYLOCK) { xfs_buf_rele(bp); - XFS_STATS_INC(xb_busy_locked); + XFS_STATS_INC(btp->bt_mount, xb_busy_locked); return NULL; } xfs_buf_lock(bp); - XFS_STATS_INC(xb_get_locked_waited); + XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited); } /* @@ -549,7 +549,7 @@ found: } trace_xfs_buf_find(bp, flags, _RET_IP_); - XFS_STATS_INC(xb_get_locked); + XFS_STATS_INC(btp->bt_mount, xb_get_locked); return bp; } @@ -603,7 +603,7 @@ found: } } - XFS_STATS_INC(xb_get); + XFS_STATS_INC(target->bt_mount, xb_get); trace_xfs_buf_get(bp, flags, _RET_IP_); return bp; } @@ -643,7 +643,7 @@ xfs_buf_read_map( trace_xfs_buf_read(bp, flags, _RET_IP_); if (!XFS_BUF_ISDONE(bp)) { - XFS_STATS_INC(xb_get_read); + XFS_STATS_INC(target->bt_mount, xb_get_read); bp->b_ops = ops; _xfs_buf_read(bp, flags); } else if (flags & XBF_ASYNC) { diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index a989a9c7edb7..642d55d10075 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -666,7 +666,7 @@ xfs_readdir( return -EIO; ASSERT(S_ISDIR(dp->i_d.di_mode)); - XFS_STATS_INC(xs_dir_getdents); + XFS_STATS_INC(dp->i_mount, xs_dir_getdents); args.dp = dp; args.geo = dp->i_mount->m_dir_geo; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 30cb3afb67f0..7ac6c5c586cb 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -75,9 +75,9 @@ xfs_qm_dqdestroy( ASSERT(list_empty(&dqp->q_lru)); mutex_destroy(&dqp->q_qlock); - kmem_zone_free(xfs_qm_dqzone, dqp); - XFS_STATS_DEC(xs_qm_dquot); + XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); + kmem_zone_free(xfs_qm_dqzone, dqp); } /* @@ -605,7 +605,7 @@ xfs_qm_dqread( break; } - XFS_STATS_INC(xs_qm_dquot); + XFS_STATS_INC(mp, xs_qm_dquot); trace_xfs_dqread(dqp); @@ -747,12 +747,12 @@ restart: mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_hit(dqp); - XFS_STATS_INC(xs_qm_dqcachehits); + XFS_STATS_INC(mp, xs_qm_dqcachehits); *O_dqpp = dqp; return 0; } mutex_unlock(&qi->qi_tree_lock); - XFS_STATS_INC(xs_qm_dqcachemisses); + XFS_STATS_INC(mp, xs_qm_dqcachemisses); /* * Dquot cache miss. We don't want to keep the inode lock across @@ -806,7 +806,7 @@ restart: mutex_unlock(&qi->qi_tree_lock); trace_xfs_dqget_dup(dqp); xfs_qm_dqdestroy(dqp); - XFS_STATS_INC(xs_qm_dquot_dups); + XFS_STATS_INC(mp, xs_qm_dquot_dups); goto restart; } @@ -846,7 +846,7 @@ xfs_qm_dqput( trace_xfs_dqput_free(dqp); if (list_lru_add(&qi->qi_lru, &dqp->q_lru)) - XFS_STATS_INC(xs_qm_dquot_unused); + XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused); } xfs_dqunlock(dqp); } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e78feb400e22..088e50923830 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -287,7 +287,7 @@ xfs_file_read_iter( xfs_fsize_t n; loff_t pos = iocb->ki_pos; - XFS_STATS_INC(xs_read_calls); + XFS_STATS_INC(mp, xs_read_calls); if (unlikely(iocb->ki_flags & IOCB_DIRECT)) ioflags |= XFS_IO_ISDIRECT; @@ -365,7 +365,7 @@ xfs_file_read_iter( ret = generic_file_read_iter(iocb, to); if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); + XFS_STATS_ADD(mp, xs_read_bytes, ret); xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -383,7 +383,7 @@ xfs_file_splice_read( int ioflags = 0; ssize_t ret; - XFS_STATS_INC(xs_read_calls); + XFS_STATS_INC(ip->i_mount, xs_read_calls); if (infilp->f_mode & FMODE_NOCMTIME) ioflags |= XFS_IO_INVIS; @@ -401,7 +401,7 @@ xfs_file_splice_read( else ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) - XFS_STATS_ADD(xs_read_bytes, ret); + XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret); xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -867,7 +867,7 @@ xfs_file_write_iter( ssize_t ret; size_t ocount = iov_iter_count(from); - XFS_STATS_INC(xs_write_calls); + XFS_STATS_INC(ip->i_mount, xs_write_calls); if (ocount == 0) return 0; @@ -883,7 +883,7 @@ xfs_file_write_iter( if (ret > 0) { ssize_t err; - XFS_STATS_ADD(xs_write_bytes, ret); + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); /* Handle various SYNC-type writes */ err = generic_write_sync(file, iocb->ki_pos - ret, ret); diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0a326bd64d4e..d7a490f24ead 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -63,7 +63,7 @@ xfs_inode_alloc( return NULL; } - XFS_STATS_INC(vn_active); + XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(!xfs_isiflocked(ip)); @@ -129,7 +129,7 @@ xfs_inode_free( /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!xfs_isiflocked(ip)); - XFS_STATS_DEC(vn_active); + XFS_STATS_DEC(ip->i_mount, vn_active); call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); } @@ -159,7 +159,7 @@ xfs_iget_cache_hit( spin_lock(&ip->i_flags_lock); if (ip->i_ino != ino) { trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); + XFS_STATS_INC(mp, xs_ig_frecycle); error = -EAGAIN; goto out_error; } @@ -177,7 +177,7 @@ xfs_iget_cache_hit( */ if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { trace_xfs_iget_skip(ip); - XFS_STATS_INC(xs_ig_frecycle); + XFS_STATS_INC(mp, xs_ig_frecycle); error = -EAGAIN; goto out_error; } @@ -259,7 +259,7 @@ xfs_iget_cache_hit( xfs_ilock(ip, lock_flags); xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); - XFS_STATS_INC(xs_ig_found); + XFS_STATS_INC(mp, xs_ig_found); return 0; @@ -342,7 +342,7 @@ xfs_iget_cache_miss( error = radix_tree_insert(&pag->pag_ici_root, agino, ip); if (unlikely(error)) { WARN_ON(error != -EEXIST); - XFS_STATS_INC(xs_ig_dup); + XFS_STATS_INC(mp, xs_ig_dup); error = -EAGAIN; goto out_preload_end; } @@ -412,7 +412,7 @@ xfs_iget( if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) return -EINVAL; - XFS_STATS_INC(xs_ig_attempts); + XFS_STATS_INC(mp, xs_ig_attempts); /* get the perag structure and ensure that it's inode capable */ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); @@ -429,7 +429,7 @@ again: goto out_error_or_again; } else { rcu_read_unlock(); - XFS_STATS_INC(xs_ig_missed); + XFS_STATS_INC(mp, xs_ig_missed); error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, flags, lock_flags); @@ -965,7 +965,7 @@ reclaim: xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); - XFS_STATS_INC(xs_ig_reclaims); + XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); /* * Remove the inode from the per-AG radix tree. * diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index dc40a6d5ae0d..a0f2bae6dc1e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3271,8 +3271,8 @@ xfs_iflush_cluster( } if (clcount) { - XFS_STATS_INC(xs_icluster_flushcnt); - XFS_STATS_ADD(xs_icluster_flushinode, clcount); + XFS_STATS_INC(mp, xs_icluster_flushcnt); + XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); } out_free: @@ -3345,7 +3345,7 @@ xfs_iflush( struct xfs_dinode *dip; int error; - XFS_STATS_INC(xs_iflush_count); + XFS_STATS_INC(mp, xs_iflush_count); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(xfs_isiflocked(ip)); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ea7d85af5310..b67a130134fb 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1028,7 +1028,7 @@ xfs_ioctl_setattr_xflags( xfs_diflags_to_linux(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - XFS_STATS_INC(xs_ig_attrchg); + XFS_STATS_INC(mp, xs_ig_attrchg); return 0; } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1f86033171c8..dca69c6711b2 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -670,7 +670,7 @@ xfs_iomap_write_allocate( count_fsb = imap->br_blockcount; map_start_fsb = imap->br_startoff; - XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); + XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb)); while (count_fsb != 0) { /* @@ -777,7 +777,7 @@ xfs_iomap_write_allocate( if ((offset_fsb >= imap->br_startoff) && (offset_fsb < (imap->br_startoff + imap->br_blockcount))) { - XFS_STATS_INC(xs_xstrat_quick); + XFS_STATS_INC(mp, xs_xstrat_quick); return 0; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 8294132e6a3c..245268a0cdf0 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -695,7 +695,7 @@ xfs_setattr_nonsize( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - XFS_STATS_INC(xs_ig_attrchg); + XFS_STATS_INC(mp, xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); @@ -922,7 +922,7 @@ xfs_setattr_size( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - XFS_STATS_INC(xs_ig_attrchg); + XFS_STATS_INC(mp, xs_ig_attrchg); if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index aaadee0969c9..401252360a62 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -268,7 +268,7 @@ xlog_grant_head_wait( __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&head->lock); - XFS_STATS_INC(xs_sleep_logspace); + XFS_STATS_INC(log->l_mp, xs_sleep_logspace); trace_xfs_log_grant_sleep(log, tic); schedule(); @@ -379,7 +379,7 @@ xfs_log_regrant( if (XLOG_FORCED_SHUTDOWN(log)) return -EIO; - XFS_STATS_INC(xs_try_logspace); + XFS_STATS_INC(mp, xs_try_logspace); /* * This is a new transaction on the ticket, so we need to change the @@ -448,7 +448,7 @@ xfs_log_reserve( if (XLOG_FORCED_SHUTDOWN(log)) return -EIO; - XFS_STATS_INC(xs_try_logspace); + XFS_STATS_INC(mp, xs_try_logspace); ASSERT(*ticp == NULL); tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent, @@ -1768,7 +1768,7 @@ xlog_sync( int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); int size; - XFS_STATS_INC(xs_log_writes); + XFS_STATS_INC(log->l_mp, xs_log_writes); ASSERT(atomic_read(&iclog->ic_refcnt) == 0); /* Add for LR header */ @@ -1805,7 +1805,7 @@ xlog_sync( bp = iclog->ic_bp; XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); - XFS_STATS_ADD(xs_log_blocks, BTOBB(count)); + XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count)); /* Do we need to split this write into 2 parts? */ if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { @@ -2913,7 +2913,7 @@ restart: iclog = log->l_iclog; if (iclog->ic_state != XLOG_STATE_ACTIVE) { - XFS_STATS_INC(xs_log_noiclogs); + XFS_STATS_INC(log->l_mp, xs_log_noiclogs); /* Wait for log writes to have flushed */ xlog_wait(&log->l_flush_wait, &log->l_icloglock); @@ -3212,7 +3212,7 @@ _xfs_log_force( struct xlog_in_core *iclog; xfs_lsn_t lsn; - XFS_STATS_INC(xs_log_force); + XFS_STATS_INC(mp, xs_log_force); xlog_cil_force(log); @@ -3297,7 +3297,7 @@ maybe_sleep: spin_unlock(&log->l_icloglock); return -EIO; } - XFS_STATS_INC(xs_log_force_sleep); + XFS_STATS_INC(mp, xs_log_force_sleep); xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); /* * No need to grab the log lock here since we're @@ -3362,7 +3362,7 @@ _xfs_log_force_lsn( ASSERT(lsn != 0); - XFS_STATS_INC(xs_log_force); + XFS_STATS_INC(mp, xs_log_force); lsn = xlog_cil_force_lsn(log, lsn); if (lsn == NULLCOMMITLSN) @@ -3411,7 +3411,7 @@ try_again: (XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) { ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR)); - XFS_STATS_INC(xs_log_force_sleep); + XFS_STATS_INC(mp, xs_log_force_sleep); xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); @@ -3441,7 +3441,7 @@ try_again: spin_unlock(&log->l_icloglock); return -EIO; } - XFS_STATS_INC(xs_log_force_sleep); + XFS_STATS_INC(mp, xs_log_force_sleep); xlog_wait(&iclog->ic_force_wait, &log->l_icloglock); /* * No need to grab the log lock here since we're diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index eac9549efd52..7af7648c06c6 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -184,7 +184,7 @@ xfs_qm_dqpurge( */ ASSERT(!list_empty(&dqp->q_lru)); list_lru_del(&qi->qi_lru, &dqp->q_lru); - XFS_STATS_DEC(xs_qm_dquot_unused); + XFS_STATS_DEC(mp, xs_qm_dquot_unused); xfs_qm_dqdestroy(dqp); return 0; @@ -448,11 +448,11 @@ xfs_qm_dquot_isolate( */ if (dqp->q_nrefs) { xfs_dqunlock(dqp); - XFS_STATS_INC(xs_qm_dqwants); + XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants); trace_xfs_dqreclaim_want(dqp); list_lru_isolate(lru, &dqp->q_lru); - XFS_STATS_DEC(xs_qm_dquot_unused); + XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); return LRU_REMOVED; } @@ -496,19 +496,19 @@ xfs_qm_dquot_isolate( ASSERT(dqp->q_nrefs == 0); list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); - XFS_STATS_DEC(xs_qm_dquot_unused); + XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused); trace_xfs_dqreclaim_done(dqp); - XFS_STATS_INC(xs_qm_dqreclaims); + XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims); return LRU_REMOVED; out_miss_busy: trace_xfs_dqreclaim_busy(dqp); - XFS_STATS_INC(xs_qm_dqreclaim_misses); + XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); return LRU_SKIP; out_unlock_dirty: trace_xfs_dqreclaim_busy(dqp); - XFS_STATS_INC(xs_qm_dqreclaim_misses); + XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); xfs_dqunlock(dqp); spin_lock(lru_lock); return LRU_RETRY; diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 54f2260299d1..483b0eff1988 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -18,7 +18,6 @@ #ifndef __XFS_STATS_H__ #define __XFS_STATS_H__ -#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF) #include @@ -218,14 +217,25 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf); void xfs_stats_clearall(struct xfsstats __percpu *stats); extern struct xstats xfsstats; -#define XFS_STATS_INC(v) \ - (per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++) +#define XFS_STATS_INC(mp, v) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++; \ +} while (0) -#define XFS_STATS_DEC(v) \ - (per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--) +#define XFS_STATS_DEC(mp, v) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--; \ +} while (0) -#define XFS_STATS_ADD(v, inc) \ - (per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc)) +#define XFS_STATS_ADD(mp, v, inc) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc); \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc); \ +} while (0) + +#if defined(CONFIG_PROC_FS) extern int xfs_init_procfs(void); extern void xfs_cleanup_procfs(void); @@ -233,10 +243,6 @@ extern void xfs_cleanup_procfs(void); #else /* !CONFIG_PROC_FS */ -# define XFS_STATS_INC(count) -# define XFS_STATS_DEC(count) -# define XFS_STATS_ADD(count, inc) - static inline int xfs_init_procfs(void) { return 0; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 9d11d3ea8030..368c55adee9d 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -922,7 +922,7 @@ xfs_fs_destroy_inode( trace_xfs_destroy_inode(ip); - XFS_STATS_INC(vn_reclaim); + XFS_STATS_INC(ip->i_mount, vn_reclaim); ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0); @@ -983,8 +983,8 @@ xfs_fs_evict_inode( truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - XFS_STATS_INC(vn_rele); - XFS_STATS_INC(vn_remove); + XFS_STATS_INC(ip->i_mount, vn_rele); + XFS_STATS_INC(ip->i_mount, vn_remove); xfs_inactive(ip); } diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index a0ab1dae9c31..748b16aff45a 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -930,9 +930,9 @@ __xfs_trans_commit( */ if (sync) { error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL); - XFS_STATS_INC(xs_trans_sync); + XFS_STATS_INC(mp, xs_trans_sync); } else { - XFS_STATS_INC(xs_trans_async); + XFS_STATS_INC(mp, xs_trans_async); } return error; @@ -955,7 +955,7 @@ out_unreserve: xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); xfs_trans_free(tp); - XFS_STATS_INC(xs_trans_empty); + XFS_STATS_INC(mp, xs_trans_empty); return error; } diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 1098cf490189..4f18fd92ca13 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -349,7 +349,7 @@ xfsaild_push( xfs_ail_min_lsn(ailp))) { ailp->xa_log_flush = 0; - XFS_STATS_INC(xs_push_ail_flush); + XFS_STATS_INC(mp, xs_push_ail_flush); xfs_log_force(mp, XFS_LOG_SYNC); } @@ -371,7 +371,7 @@ xfsaild_push( goto out_done; } - XFS_STATS_INC(xs_push_ail); + XFS_STATS_INC(mp, xs_push_ail); lsn = lip->li_lsn; while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { @@ -385,7 +385,7 @@ xfsaild_push( lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list); switch (lock_result) { case XFS_ITEM_SUCCESS: - XFS_STATS_INC(xs_push_ail_success); + XFS_STATS_INC(mp, xs_push_ail_success); trace_xfs_ail_push(lip); ailp->xa_last_pushed_lsn = lsn; @@ -403,7 +403,7 @@ xfsaild_push( * re-try the flushing relatively soon if most of the * AIL is beeing flushed. */ - XFS_STATS_INC(xs_push_ail_flushing); + XFS_STATS_INC(mp, xs_push_ail_flushing); trace_xfs_ail_flushing(lip); flushing++; @@ -411,14 +411,14 @@ xfsaild_push( break; case XFS_ITEM_PINNED: - XFS_STATS_INC(xs_push_ail_pinned); + XFS_STATS_INC(mp, xs_push_ail_pinned); trace_xfs_ail_pinned(lip); stuck++; ailp->xa_log_flush++; break; case XFS_ITEM_LOCKED: - XFS_STATS_INC(xs_push_ail_locked); + XFS_STATS_INC(mp, xs_push_ail_locked); trace_xfs_ail_locked(lip); stuck++; -- cgit v1.2.3-59-g8ed1b From 9e92054e8e049f8f4c64d2c6961b2a7e3e13977f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 12 Oct 2015 18:21:22 +1100 Subject: xfs: simplify /proc teardown & error handling remove_proc_subtree() was added in 3.9, and can be used to simplify our procfile creation error handling and cleanup, removing the nested gotos. It simply removes fs/xfs and everything created under it. Signed-off-by: Eric Sandeen Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_stats.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 3348b0b6177d..bd50619fa8c9 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -165,41 +165,29 @@ int xfs_init_procfs(void) { if (!proc_mkdir("fs/xfs", NULL)) - goto out; + return -ENOMEM; if (!proc_symlink("fs/xfs/stat", NULL, "/sys/fs/xfs/stats/stats")) - goto out_remove_xfs_dir; + goto out; #ifdef CONFIG_XFS_QUOTA if (!proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops)) - goto out_remove_stat_file; + goto out; if (!proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops)) - goto out_remove_xqmstat_file; + goto out; #endif return 0; -#ifdef CONFIG_XFS_QUOTA - out_remove_xqmstat_file: - remove_proc_entry("fs/xfs/xqmstat", NULL); - out_remove_stat_file: - remove_proc_entry("fs/xfs/stat", NULL); -#endif - out_remove_xfs_dir: - remove_proc_entry("fs/xfs", NULL); - out: +out: + remove_proc_subtree("fs/xfs", NULL); return -ENOMEM; } void xfs_cleanup_procfs(void) { -#ifdef CONFIG_XFS_QUOTA - remove_proc_entry("fs/xfs/xqm", NULL); - remove_proc_entry("fs/xfs/xqmstat", NULL); -#endif - remove_proc_entry("fs/xfs/stat", NULL); - remove_proc_entry("fs/xfs", NULL); + remove_proc_subtree("fs/xfs", NULL); } -- cgit v1.2.3-59-g8ed1b From 985ef4dcf94ac0113e2b32ef86bdd5bc47c016b3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 19 Oct 2015 08:42:46 +1100 Subject: xfs: stats are no longer dependent on CONFIG_PROC_FS So we need to fix the makefile to understand this, otherwise build errors with CONFIG_PROC_FS=n occur. Reported-and-tested-by: Jim Davis Signed-off-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/Makefile | 2 +- fs/xfs/xfs_stats.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index a096841bd06c..f64639176670 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -84,6 +84,7 @@ xfs-y += xfs_aops.o \ xfs_message.o \ xfs_mount.o \ xfs_mru_cache.o \ + xfs_stats.o \ xfs_super.o \ xfs_symlink.o \ xfs_sysfs.o \ @@ -118,7 +119,6 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o -xfs-$(CONFIG_PROC_FS) += xfs_stats.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index bd50619fa8c9..8686df6c7609 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -161,6 +161,7 @@ static const struct file_operations xqmstat_proc_fops = { }; #endif /* CONFIG_XFS_QUOTA */ +#ifdef CONFIG_PROC_FS int xfs_init_procfs(void) { @@ -191,3 +192,4 @@ xfs_cleanup_procfs(void) { remove_proc_subtree("fs/xfs", NULL); } +#endif /* CONFIG_PROC_FS */ -- cgit v1.2.3-59-g8ed1b From f9d460b341f23a9bb7df8868975fdfcc2e71aa9b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 19 Oct 2015 08:42:47 +1100 Subject: xfs: fix an error code in xfs_fs_fill_super() If alloc_percpu() fails, we accidentally return PTR_ERR(NULL), which means success, but we intended to return -ENOMEM. Fixes: 225e4635580c ('xfs: per-filesystem stats in sysfs') Signed-off-by: Dan Carpenter Reviewed-by: Bill O'Donnell Signed-off-by: Dave Chinner --- fs/xfs/xfs_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 368c55adee9d..b2c252cd7172 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1477,7 +1477,7 @@ xfs_fs_fill_super( /* Allocate stats memory before we do operations that might use it */ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats); if (!mp->m_stats.xs_stats) { - error = PTR_ERR(mp->m_stats.xs_stats); + error = -ENOMEM; goto out_destroy_counters; } -- cgit v1.2.3-59-g8ed1b From 24ba16bb3d499c49974669cd8429c3e4138ab102 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Mon, 2 Nov 2015 13:46:58 +1100 Subject: xfs: clear PF_NOFREEZE for xfsaild kthread Since xfsaild has been converted to kthread in 0030807c, it calls try_to_freeze() during every AIL push iteration. It however doesn't set itself as freezable, and therefore this try_to_freeze() will never do anything. Before (hopefully eventually) kthread freezing gets converted to fileystem freezing, we'd rather mark xfsaild freezable (as it can generate I/O during suspend). Signed-off-by: Jiri Kosina Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_trans_ail.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 1098cf490189..06d1a29a5cf9 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -497,6 +497,7 @@ xfsaild( long tout = 0; /* milliseconds */ current->flags |= PF_MEMALLOC; + set_freezable(); while (!kthread_should_stop()) { if (tout && tout <= 20) -- cgit v1.2.3-59-g8ed1b From 3e12dbbdbd8809f0455920e42fdbf9eddc002651 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 3 Nov 2015 12:27:22 +1100 Subject: xfs: fix inode size update overflow in xfs_map_direct() Both direct IO and DAX pass an offset and count into get_blocks that will overflow a s64 variable when an IO goes into the last supported block in a file (i.e. at offset 2^63 - 1FSB bytes). This can be seen from the tracing: xfs_get_blocks_alloc: [...] offset 0x7ffffffffffff000 count 4096 xfs_gbmap_direct: [...] offset 0x7ffffffffffff000 count 4096 xfs_gbmap_direct_none:[...] offset 0x7ffffffffffff000 count 4096 0x7ffffffffffff000 + 4096 = 0x8000000000000000, and hence that overflows the s64 offset and we fail to detect the need for a filesize update and an ioend is not allocated. This is *mostly* avoided for direct IO because such extending IOs occur with full block allocation, and so the "IS_UNWRITTEN()" check still evaluates as true and we get an ioend that way. However, doing single sector extending IOs to this last block will expose the fact that file size updates will not occur after the first allocating direct IO as the overflow will then be exposed. There is one further complexity: the DAX page fault path also exposes the same issue in block allocation. However, page faults cannot extend the file size, so in this case we want to allocate the block but do not want to allocate an ioend to enable file size update at IO completion. Hence we now need to distinguish between the direct IO patch allocation and dax fault path allocation to avoid leaking ioend structures. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 50 ++++++++++++++++++++++++++++++++++++++++++++------ fs/xfs/xfs_aops.h | 2 ++ fs/xfs/xfs_file.c | 6 +++--- 3 files changed, 49 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 50ab2879b9da..e747d6ad5d18 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1250,13 +1250,28 @@ xfs_vm_releasepage( * the DIO. There is only going to be one reference to the ioend and its life * cycle is constrained by the DIO completion code. hence we don't need * reference counting here. + * + * Note that for DIO, an IO to the highest supported file block offset (i.e. + * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 + * bit variable. Hence if we see this overflow, we have to assume that the IO is + * extending the file size. We won't know for sure until IO completion is run + * and the actual max write offset is communicated to the IO completion + * routine. + * + * For DAX page faults, we are preparing to never see unwritten extents here, + * nor should we ever extend the inode size. Hence we will soon have nothing to + * do here for this case, ensuring we don't have to provide an IO completion + * callback to free an ioend that we don't actually need for a fault into the + * page at offset (2^63 - 1FSB) bytes. */ + static void xfs_map_direct( struct inode *inode, struct buffer_head *bh_result, struct xfs_bmbt_irec *imap, - xfs_off_t offset) + xfs_off_t offset, + bool dax_fault) { struct xfs_ioend *ioend; xfs_off_t size = bh_result->b_size; @@ -1269,6 +1284,16 @@ xfs_map_direct( trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); + /* XXX: preparation for removing unwritten extents in DAX */ +#if 0 + if (dax_fault) { + ASSERT(type == XFS_IO_OVERWRITE); + trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, + imap); + return; + } +#endif + if (bh_result->b_private) { ioend = bh_result->b_private; ASSERT(ioend->io_size > 0); @@ -1283,7 +1308,8 @@ xfs_map_direct( ioend->io_size, ioend->io_type, imap); } else if (type == XFS_IO_UNWRITTEN || - offset + size > i_size_read(inode)) { + offset + size > i_size_read(inode) || + offset + size < 0) { ioend = xfs_alloc_ioend(inode, type); ioend->io_offset = offset; ioend->io_size = size; @@ -1345,7 +1371,8 @@ __xfs_get_blocks( sector_t iblock, struct buffer_head *bh_result, int create, - bool direct) + bool direct, + bool dax_fault) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1458,7 +1485,8 @@ __xfs_get_blocks( set_buffer_unwritten(bh_result); /* direct IO needs special help */ if (create && direct) - xfs_map_direct(inode, bh_result, &imap, offset); + xfs_map_direct(inode, bh_result, &imap, offset, + dax_fault); } /* @@ -1505,7 +1533,7 @@ xfs_get_blocks( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, false); + return __xfs_get_blocks(inode, iblock, bh_result, create, false, false); } int @@ -1515,7 +1543,17 @@ xfs_get_blocks_direct( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, true); + return __xfs_get_blocks(inode, iblock, bh_result, create, true, false); +} + +int +xfs_get_blocks_dax_fault( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + int create) +{ + return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); } static void diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index 86afd1ac7895..d39ba25ccc98 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -58,6 +58,8 @@ int xfs_get_blocks(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); int xfs_get_blocks_direct(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); +int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e78feb400e22..27abe1c92184 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1503,7 +1503,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct, + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, xfs_end_io_dax_write); } else { ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); @@ -1538,7 +1538,7 @@ xfs_filemap_fault( * changes to xfs_get_blocks_direct() to map unwritten extent * ioend for conversion on read-only mappings. */ - ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL); + ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL); } else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1565,7 +1565,7 @@ xfs_filemap_pmd_fault( sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct, + ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault, xfs_end_io_dax_write); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); -- cgit v1.2.3-59-g8ed1b From 3fbbbea34bac049c0b5938dc065f7d8ee1ef7e67 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 3 Nov 2015 12:27:22 +1100 Subject: xfs: introduce BMAPI_ZERO for allocating zeroed extents To enable DAX to do atomic allocation of zeroed extents, we need to drive the block zeroing deep into the allocator. Because xfs_bmapi_write() can return merged extents on allocation that were only partially allocated (i.e. requested range spans allocated and hole regions, allocation into the hole was contiguous), we cannot zero the extent returned from xfs_bmapi_write() as that can overwrite existing data with zeros. Hence we have to drive the extent zeroing into the allocation code, prior to where we merge the extents into the BMBT and return the resultant map. This means we need to propagate this need down to the xfs_alloc_vextent() and issue the block zeroing at this point. While this functionality is being introduced for DAX, there is no reason why it is specific to DAX - we can per-zero blocks during the allocation transaction on any type of device. It's just slow (and usually slower than unwritten allocation and conversion) on traditional block devices so doesn't tend to get used. We can, however, hook hardware zeroing optimisations via sb_issue_zeroout() to this operation, so it may be useful in future and hence the "allocate zeroed blocks" API needs to be implementation neutral. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_alloc.c | 10 +++++++++- fs/xfs/libxfs/xfs_alloc.h | 8 +++++--- fs/xfs/libxfs/xfs_bmap.c | 35 +++++++++++++++++++++++++++++++++-- fs/xfs/libxfs/xfs_bmap.h | 13 +++++++++++-- fs/xfs/xfs_bmap_util.c | 36 ++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_mount.h | 3 +++ 6 files changed, 97 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index ffad7f20342f..4cffc1729bf8 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2503,7 +2503,7 @@ xfs_alloc_vextent( * Try near allocation first, then anywhere-in-ag after * the first a.g. fails. */ - if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) && + if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) && (mp->m_flags & XFS_MOUNT_32BITINODES)) { args->fsbno = XFS_AGB_TO_FSB(mp, ((mp->m_agfrotor / rotorstep) % @@ -2634,6 +2634,14 @@ xfs_alloc_vextent( XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len); #endif + + /* Zero the extent if we were asked to do so */ + if (args->userdata & XFS_ALLOC_USERDATA_ZERO) { + error = xfs_zero_extent(args->ip, args->fsbno, args->len); + if (error) + goto error0; + } + } xfs_perag_put(args->pag); return 0; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index ca1c8168373a..0ecde4d5cac8 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -101,6 +101,7 @@ typedef struct xfs_alloc_arg { struct xfs_mount *mp; /* file system mount point */ struct xfs_buf *agbp; /* buffer for a.g. freelist header */ struct xfs_perag *pag; /* per-ag struct for this agno */ + struct xfs_inode *ip; /* for userdata zeroing method */ xfs_fsblock_t fsbno; /* file system block number */ xfs_agnumber_t agno; /* allocation group number */ xfs_agblock_t agbno; /* allocation group-relative block # */ @@ -120,15 +121,16 @@ typedef struct xfs_alloc_arg { char wasdel; /* set if allocation was prev delayed */ char wasfromfl; /* set if allocation is from freelist */ char isfl; /* set if is freelist blocks - !acctg */ - char userdata; /* set if this is user data */ + char userdata; /* mask defining userdata treatment */ xfs_fsblock_t firstblock; /* io first block allocated */ } xfs_alloc_arg_t; /* * Defines for userdata */ -#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ -#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ +#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ +#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ +#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, struct xfs_perag *pag, xfs_extlen_t need); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 8e2010d53b07..9390b7c8b5ef 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -3800,8 +3800,13 @@ xfs_bmap_btalloc( args.wasdel = ap->wasdel; args.isfl = 0; args.userdata = ap->userdata; - if ((error = xfs_alloc_vextent(&args))) + if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) + args.ip = ap->ip; + + error = xfs_alloc_vextent(&args); + if (error) return error; + if (tryagain && args.fsbno == NULLFSBLOCK) { /* * Exact allocation failed. Now try with alignment @@ -4300,11 +4305,14 @@ xfs_bmapi_allocate( /* * Indicate if this is the first user data in the file, or just any - * user data. + * user data. And if it is userdata, indicate whether it needs to + * be initialised to zero during allocation. */ if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->userdata = (bma->offset == 0) ? XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA; + if (bma->flags & XFS_BMAPI_ZERO) + bma->userdata |= XFS_ALLOC_USERDATA_ZERO; } bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1; @@ -4419,6 +4427,17 @@ xfs_bmapi_convert_unwritten( mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN) ? XFS_EXT_NORM : XFS_EXT_UNWRITTEN; + /* + * Before insertion into the bmbt, zero the range being converted + * if required. + */ + if (flags & XFS_BMAPI_ZERO) { + error = xfs_zero_extent(bma->ip, mval->br_startblock, + mval->br_blockcount); + if (error) + return error; + } + error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, &bma->cur, mval, bma->firstblock, bma->flist, &tmp_logflags); @@ -4512,6 +4531,18 @@ xfs_bmapi_write( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + /* zeroing is for currently only for data extents, not metadata */ + ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != + (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)); + /* + * we can allocate unwritten extents or pre-zero allocated blocks, + * but it makes no sense to do both at once. This would result in + * zeroing the unwritten extent twice, but it still being an + * unwritten extent.... + */ + ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) != + (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)); + if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 6aaa0c1c7200..a160f8a5a3fc 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -52,9 +52,9 @@ struct xfs_bmalloca { xfs_extlen_t minleft; /* amount must be left after alloc */ bool eof; /* set if allocating past last extent */ bool wasdel; /* replacing a delayed allocation */ - bool userdata;/* set if is user data */ bool aeof; /* allocated space at eof */ bool conv; /* overwriting unwritten extents */ + char userdata;/* userdata mask */ int flags; }; @@ -109,6 +109,14 @@ typedef struct xfs_bmap_free */ #define XFS_BMAPI_CONVERT 0x040 +/* + * allocate zeroed extents - this requires all newly allocated user data extents + * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set. + * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found + * during the allocation range to zeroed written extents. + */ +#define XFS_BMAPI_ZERO 0x080 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -116,7 +124,8 @@ typedef struct xfs_bmap_free { XFS_BMAPI_PREALLOC, "PREALLOC" }, \ { XFS_BMAPI_IGSTATE, "IGSTATE" }, \ { XFS_BMAPI_CONTIG, "CONTIG" }, \ - { XFS_BMAPI_CONVERT, "CONVERT" } + { XFS_BMAPI_CONVERT, "CONVERT" }, \ + { XFS_BMAPI_ZERO, "ZERO" } static inline int xfs_bmapi_aflag(int w) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 3bf4ad0d19e4..3f59698ecfd8 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -56,6 +56,35 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb) XFS_FSB_TO_DADDR((ip)->i_mount, (fsb))); } +/* + * Routine to zero an extent on disk allocated to the specific inode. + * + * The VFS functions take a linearised filesystem block offset, so we have to + * convert the sparse xfs fsb to the right format first. + * VFS types are real funky, too. + */ +int +xfs_zero_extent( + struct xfs_inode *ip, + xfs_fsblock_t start_fsb, + xfs_off_t count_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb); + sector_t block = XFS_BB_TO_FSBT(mp, sector); + ssize_t size = XFS_FSB_TO_B(mp, count_fsb); + + if (IS_DAX(VFS_I(ip))) + return dax_clear_blocks(VFS_I(ip), block, size); + + /* + * let the block layer decide on the fastest method of + * implementing the zeroing. + */ + return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS); + +} + /* * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi * caller. Frees all the extents that need freeing, which must be done @@ -229,6 +258,13 @@ xfs_bmap_rtalloc( xfs_trans_mod_dquot_byino(ap->tp, ap->ip, ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_RTBCOUNT, (long) ralen); + + /* Zero the extent if we were asked to do so */ + if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) { + error = xfs_zero_extent(ap->ip, ap->blkno, ap->length); + if (error) + return error; + } } else { ap->length = 0; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7999e91cd49a..404bfa50468f 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -336,4 +336,7 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *); extern void xfs_set_low_space_thresholds(struct xfs_mount *); +int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb, + xfs_off_t count_fsb); + #endif /* __XFS_MOUNT_H__ */ -- cgit v1.2.3-59-g8ed1b From 1ca191576fc862b4766f58e41aa362b28a7c1866 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 3 Nov 2015 12:37:00 +1100 Subject: xfs: Don't use unwritten extents for DAX DAX has a page fault serialisation problem with block allocation. Because it allows concurrent page faults and does not have a page lock to serialise faults to the same page, it can get two concurrent faults to the page that race. When two read faults race, this isn't a huge problem as the data underlying the page is not changing and so "detect and drop" works just fine. The issues are to do with write faults. When two write faults occur, we serialise block allocation in get_blocks() so only one faul will allocate the extent. It will, however, be marked as an unwritten extent, and that is where the problem lies - the DAX fault code cannot differentiate between a block that was just allocated and a block that was preallocated and needs zeroing. The result is that both write faults end up zeroing the block and attempting to convert it back to written. The problem is that the first fault can zero and convert before the second fault starts zeroing, resulting in the zeroing for the second fault overwriting the data that the first fault wrote with zeros. The second fault then attempts to convert the unwritten extent, which is then a no-op because it's already written. Data loss occurs as a result of this race. Because there is no sane locking construct in the page fault code that we can use for serialisation across the page faults, we need to ensure block allocation and zeroing occurs atomically in the filesystem. This means we can still take concurrent page faults and the only time they will serialise is in the filesystem mapping/allocation callback. The page fault code will always see written, initialised extents, so we will be able to remove the unwritten extent handling from the DAX code when all filesystems are converted. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/dax.c | 5 +++++ fs/xfs/xfs_aops.c | 13 +++++++++---- fs/xfs/xfs_iomap.c | 23 +++++++++++++++++++++-- 3 files changed, 35 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 7ae6df7ea1d2..74033ad1bc92 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -29,6 +29,11 @@ #include #include +/* + * dax_clear_blocks() is called from within transaction context from XFS, + * and hence this means the stack from this point must follow GFP_NOFS + * semantics for all operations. + */ int dax_clear_blocks(struct inode *inode, sector_t block, long size) { struct block_device *bdev = inode->i_sb->s_bdev; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e747d6ad5d18..df3dabd469b9 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1284,15 +1284,12 @@ xfs_map_direct( trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); - /* XXX: preparation for removing unwritten extents in DAX */ -#if 0 if (dax_fault) { ASSERT(type == XFS_IO_OVERWRITE); trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, imap); return; } -#endif if (bh_result->b_private) { ioend = bh_result->b_private; @@ -1420,10 +1417,12 @@ __xfs_get_blocks( if (error) goto out_unlock; + /* for DAX, we convert unwritten extents directly */ if (create && (!nimaps || (imap.br_startblock == HOLESTARTBLOCK || - imap.br_startblock == DELAYSTARTBLOCK))) { + imap.br_startblock == DELAYSTARTBLOCK) || + (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { if (direct || xfs_get_extsz_hint(ip)) { /* * Drop the ilock in preparation for starting the block @@ -1468,6 +1467,12 @@ __xfs_get_blocks( goto out_unlock; } + if (IS_DAX(inode) && create) { + ASSERT(!ISUNWRITTEN(&imap)); + /* zeroing is not needed at a higher layer */ + new = 0; + } + /* trim mapping down to size requested */ if (direct || size > (1 << inode->i_blkbits)) xfs_map_trim_size(inode, iblock, bh_result, diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 1f86033171c8..b48c6b525e77 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -131,6 +131,7 @@ xfs_iomap_write_direct( uint qblocks, resblks, resrtextents; int committed; int error; + int bmapi_flags = XFS_BMAPI_PREALLOC; error = xfs_qm_dqattach(ip, 0); if (error) @@ -177,6 +178,23 @@ xfs_iomap_write_direct( * Allocate and setup the transaction */ tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + + /* + * For DAX, we do not allocate unwritten extents, but instead we zero + * the block before we commit the transaction. Ideally we'd like to do + * this outside the transaction context, but if we commit and then crash + * we may not have zeroed the blocks and this will be exposed on + * recovery of the allocation. Hence we must zero before commit. + * Further, if we are mapping unwritten extents here, we need to zero + * and convert them to written so that we don't need an unwritten extent + * callback for DAX. This also means that we need to be able to dip into + * the reserve block pool if there is no space left but we need to do + * unwritten extent conversion. + */ + if (IS_DAX(VFS_I(ip))) { + bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; + tp->t_flags |= XFS_TRANS_RESERVE; + } error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, resrtextents); /* @@ -202,8 +220,8 @@ xfs_iomap_write_direct( xfs_bmap_init(&free_list, &firstfsb); nimaps = 1; error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, - XFS_BMAPI_PREALLOC, &firstfsb, 0, - imap, &nimaps, &free_list); + bmapi_flags, &firstfsb, 0, imap, + &nimaps, &free_list); if (error) goto out_bmap_cancel; @@ -213,6 +231,7 @@ xfs_iomap_write_direct( error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto out_bmap_cancel; + error = xfs_trans_commit(tp); if (error) goto out_unlock; -- cgit v1.2.3-59-g8ed1b From 01a155e6cf7db1a8ff2aa73162d7d9ec05ad298f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 3 Nov 2015 12:37:02 +1100 Subject: xfs: DAX does not use IO completion callbacks For DAX, we are now doing block zeroing during allocation. This means we no longer need a special DAX fault IO completion callback to do unwritten extent conversion. Because mmap never extends the file size (it SEGVs the process) we don't need a callback to update the file size, either. Hence we can remove the completion callbacks from the __dax_fault and __dax_mkwrite calls. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_aops.c | 39 --------------------------------------- fs/xfs/xfs_aops.h | 1 - fs/xfs/xfs_file.c | 5 ++--- 3 files changed, 2 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index df3dabd469b9..69c2dbc20836 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1657,45 +1657,6 @@ xfs_end_io_direct_write( __xfs_end_io_direct_write(inode, ioend, offset, size); } -/* - * For DAX we need a mapping buffer callback for unwritten extent conversion - * when page faults allocate blocks and then zero them. Note that in this - * case the mapping indicated by the ioend may extend beyond EOF. We most - * definitely do not want to extend EOF here, so we trim back the ioend size to - * EOF. - */ -#ifdef CONFIG_FS_DAX -void -xfs_end_io_dax_write( - struct buffer_head *bh, - int uptodate) -{ - struct xfs_ioend *ioend = bh->b_private; - struct inode *inode = ioend->io_inode; - ssize_t size = ioend->io_size; - - ASSERT(IS_DAX(ioend->io_inode)); - - /* if there was an error zeroing, then don't convert it */ - if (!uptodate) - ioend->io_error = -EIO; - - /* - * Trim update to EOF, so we don't extend EOF during unwritten extent - * conversion of partial EOF blocks. - */ - spin_lock(&XFS_I(inode)->i_flags_lock); - if (ioend->io_offset + size > i_size_read(inode)) - size = i_size_read(inode) - ioend->io_offset; - spin_unlock(&XFS_I(inode)->i_flags_lock); - - __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size); - -} -#else -void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } -#endif - static inline ssize_t xfs_vm_do_dio( struct inode *inode, diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index d39ba25ccc98..f6ffc9ae5ceb 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -60,7 +60,6 @@ int xfs_get_blocks_direct(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); -void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 27abe1c92184..9c8eef7c57b4 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1503,8 +1503,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, - xfs_end_io_dax_write); + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL); } else { ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); ret = block_page_mkwrite_return(ret); @@ -1566,7 +1565,7 @@ xfs_filemap_pmd_fault( file_update_time(vma->vm_file); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault, - xfs_end_io_dax_write); + NULL); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); -- cgit v1.2.3-59-g8ed1b From 3af49285854df66260a263198cc15abb07b95287 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 3 Nov 2015 12:37:02 +1100 Subject: xfs: add ->pfn_mkwrite support for DAX ->pfn_mkwrite support is needed so that when a page with allocated backing store takes a write fault we can check that the fault has not raced with a truncate and is pointing to a region beyond the current end of file. This also allows us to update the timestamp on the inode, too, which fixes a generic/080 failure. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 35 +++++++++++++++++++++++++++++++++++ fs/xfs/xfs_trace.h | 1 + 2 files changed, 36 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 9c8eef7c57b4..f429662d7d42 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1572,11 +1572,46 @@ xfs_filemap_pmd_fault( return ret; } +/* + * pfn_mkwrite was originally inteneded to ensure we capture time stamp + * updates on write faults. In reality, it's need to serialise against + * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite() + * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault + * barrier in place. + */ +static int +xfs_filemap_pfn_mkwrite( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + + struct inode *inode = file_inode(vma->vm_file); + struct xfs_inode *ip = XFS_I(inode); + int ret = VM_FAULT_NOPAGE; + loff_t size; + + trace_xfs_filemap_pfn_mkwrite(ip); + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + + /* check if the faulting page hasn't raced with truncate */ + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + ret = VM_FAULT_SIGBUS; + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + sb_end_pagefault(inode->i_sb); + return ret; + +} + static const struct vm_operations_struct xfs_file_vm_ops = { .fault = xfs_filemap_fault, .pmd_fault = xfs_filemap_pmd_fault, .map_pages = filemap_map_pages, .page_mkwrite = xfs_filemap_page_mkwrite, + .pfn_mkwrite = xfs_filemap_pfn_mkwrite, }; STATIC int diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 5ed36b1e04c1..c53beda675d6 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -689,6 +689,7 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); DEFINE_INODE_EVENT(xfs_filemap_fault); DEFINE_INODE_EVENT(xfs_filemap_pmd_fault); DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); +DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), -- cgit v1.2.3-59-g8ed1b From 13ad4fe3e087ab66a140f1e00d98f28aa4e3bb28 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 3 Nov 2015 12:37:02 +1100 Subject: xfs: xfs_filemap_pmd_fault treats read faults as write faults The code initially committed didn't have the same checks for write faults as the dax_pmd_fault code and hence treats all faults as write faults. We can get read faults through this path because they is no pmd_mkwrite path for write faults similar to the normal page fault path. Hence we need to ensure that we only do c/mtime updates on write faults, and freeze protection is unnecessary for read faults. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f429662d7d42..ce208e3896aa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1477,7 +1477,7 @@ xfs_file_llseek( * * mmap_sem (MM) * sb_start_pagefault(vfs, freeze) - * i_mmap_lock (XFS - truncate serialisation) + * i_mmaplock (XFS - truncate serialisation) * page_lock (MM) * i_lock (XFS - extent map serialisation) */ @@ -1545,6 +1545,13 @@ xfs_filemap_fault( return ret; } +/* + * Similar to xfs_filemap_fault(), the DAX fault path can call into here on + * both read and write faults. Hence we need to handle both cases. There is no + * ->pmd_mkwrite callout for huge pages, so we have a single function here to + * handle both cases here. @flags carries the information on the type of fault + * occuring. + */ STATIC int xfs_filemap_pmd_fault( struct vm_area_struct *vma, @@ -1561,13 +1568,18 @@ xfs_filemap_pmd_fault( trace_xfs_filemap_pmd_fault(ip); - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); + if (flags & FAULT_FLAG_WRITE) { + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + } + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault, NULL); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - sb_end_pagefault(inode->i_sb); + + if (flags & FAULT_FLAG_WRITE) + sb_end_pagefault(inode->i_sb); return ret; } -- cgit v1.2.3-59-g8ed1b From 67d8e04e345eafcb2940066f435815032eec467d Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 3 Nov 2015 12:40:59 +1100 Subject: xfs: invalidate cached acl if set directly via xattr ACLs are stored as extended attributes of the inode to which they apply. XFS converts the standard "system.posix_acl_[access|default]" attribute names used to control ACLs to "trusted.SGI_ACL_[FILE|DEFAULT]" as stored on-disk. These xattrs are directly exposed in on-disk format via getxattr/setxattr, without any ACL aware code in the path to perform validation, etc. This is partly historical and supports backup/restore applications such as xfsdump to back up and restore the binary blob that represents ACLs as-is. Andreas reports that the ACLs observed via the getfacl interface is not consistent when ACLs are set directly via the setxattr path. This occurs because the ACLs are cached in-core against the inode and the xattr path has no knowledge that the operation relates to ACLs. Update the xattr set codepath to trap writes of the special XFS ACL attributes and invalidate the associated cached ACL when this occurs. This ensures that the correct ACLs are used on a subsequent operation through the actual ACL interface. Note that this does not update or add support for setting the ACL xattrs directly beyond the restore use case that requires a correctly formatted binary blob and to restore a consistent i_mode at the same time. It is still possible for a root user to set an invalid or inconsistent (with i_mode) ACL blob on-disk and potentially cause corruption. [ With fixes from Andreas Gruenbacher. ] Reported-by: Andreas Gruenbacher Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_xattr.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index c036815183cb..1e08d3e85cd0 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -57,7 +57,8 @@ static int xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int xflags) { - struct xfs_inode *ip = XFS_I(d_inode(dentry)); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); + int error; if (strcmp(name, "") == 0) return -EINVAL; @@ -70,8 +71,24 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, if (!value) return xfs_attr_remove(ip, (unsigned char *)name, xflags); - return xfs_attr_set(ip, (unsigned char *)name, + error = xfs_attr_set(ip, (unsigned char *)name, (void *)value, size, xflags); + /* + * Invalidate any cached ACLs if the user has bypassed the ACL + * interface. We don't validate the content whatsoever so it is caller + * responsibility to provide data in valid format and ensure i_mode is + * consistent. + */ +#ifdef CONFIG_XFS_POSIX_ACL + if (!error && (xflags & ATTR_ROOT)) { + if (!strcmp(name, SGI_ACL_FILE)) + forget_cached_acl(VFS_I(ip), ACL_TYPE_ACCESS); + else if (!strcmp(name, SGI_ACL_DEFAULT)) + forget_cached_acl(VFS_I(ip), ACL_TYPE_DEFAULT); + } +#endif + + return error; } static const struct xattr_handler xfs_xattr_user_handler = { -- cgit v1.2.3-59-g8ed1b From 86a21c79745ca97676cbd47f8608839382cc0448 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 3 Nov 2015 12:41:59 +1100 Subject: xfs: Validate the length of on-disk ACLs In xfs_acl_from_disk, instead of trusting that xfs_acl.acl_cnt is correct, make sure that the length of the attributes is correct as well. Also, turn the aclp parameter into a const pointer. Signed-off-by: Andreas Gruenbacher Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/libxfs/xfs_format.h | 8 ++++++-- fs/xfs/xfs_acl.c | 13 ++++++++----- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 9590a069e556..0e62682e7019 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -1487,9 +1487,13 @@ struct xfs_acl { sizeof(struct xfs_acl_entry) \ : 25) -#define XFS_ACL_MAX_SIZE(mp) \ +#define XFS_ACL_SIZE(cnt) \ (sizeof(struct xfs_acl) + \ - sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) + sizeof(struct xfs_acl_entry) * cnt) + +#define XFS_ACL_MAX_SIZE(mp) \ + XFS_ACL_SIZE(XFS_ACL_MAX_ENTRIES((mp))) + /* On-disk XFS extended attribute names */ #define SGI_ACL_FILE "SGI_ACL_FILE" diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 4b641676f258..763e36560681 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -37,16 +37,19 @@ STATIC struct posix_acl * xfs_acl_from_disk( - struct xfs_acl *aclp, - int max_entries) + const struct xfs_acl *aclp, + int len, + int max_entries) { struct posix_acl_entry *acl_e; struct posix_acl *acl; - struct xfs_acl_entry *ace; + const struct xfs_acl_entry *ace; unsigned int count, i; + if (len < sizeof(*aclp)) + return ERR_PTR(-EFSCORRUPTED); count = be32_to_cpu(aclp->acl_cnt); - if (count > max_entries) + if (count > max_entries || XFS_ACL_SIZE(count) != len) return ERR_PTR(-EFSCORRUPTED); acl = posix_acl_alloc(count, GFP_KERNEL); @@ -163,7 +166,7 @@ xfs_get_acl(struct inode *inode, int type) goto out; } - acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount)); + acl = xfs_acl_from_disk(xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount)); if (IS_ERR(acl)) goto out; -- cgit v1.2.3-59-g8ed1b From 09cb22d2a57b51d7d052dfe508f260abc67b69b6 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 3 Nov 2015 12:53:54 +1100 Subject: xfs: Plug memory leak in xfs_attrmulti_attr_set When setting attributes via XFS_IOC_ATTRMULTI_BY_HANDLE, the user-space buffer is copied into a new kernel-space buffer via memdup_user; that buffer then isn't freed. Signed-off-by: Andreas Gruenbacher Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_ioctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ea7d85af5310..e939c20cb4de 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -482,6 +482,7 @@ xfs_attrmulti_attr_set( __uint32_t flags) { unsigned char *kbuf; + int error; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; @@ -492,7 +493,9 @@ xfs_attrmulti_attr_set( if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); + error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); + kfree(kbuf); + return error; } int -- cgit v1.2.3-59-g8ed1b From 47e1bf640558237b79d3009fb7dfe157f12f4f7a Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 3 Nov 2015 12:56:17 +1100 Subject: xfs: invalidate cached acl if set via ioctl Setting or removing the "SGI_ACL_[FILE|DEFAULT]" attributes via the XFS_IOC_ATTRMULTI_BY_HANDLE ioctl completely bypasses the POSIX ACL infrastructure, like setting the "trusted.SGI_ACL_[FILE|DEFAULT]" xattrs did until commit 6caa1056. Similar to that commit, invalidate cached acls when setting/removing them via the ioctl as well. Signed-off-by: Andreas Gruenbacher Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_acl.h | 3 +++ fs/xfs/xfs_ioctl.c | 10 +++++++++- fs/xfs/xfs_xattr.c | 38 ++++++++++++++++++++++++-------------- 3 files changed, 36 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 3841b07f27bf..75af0a4d9028 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -36,4 +36,7 @@ static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) # define posix_acl_access_exists(inode) 0 # define posix_acl_default_exists(inode) 0 #endif /* CONFIG_XFS_POSIX_ACL */ + +extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags); + #endif /* __XFS_ACL_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index e939c20cb4de..67bb2983c036 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -40,6 +40,7 @@ #include "xfs_symlink.h" #include "xfs_trans.h" #include "xfs_pnfs.h" +#include "xfs_acl.h" #include #include @@ -494,6 +495,8 @@ xfs_attrmulti_attr_set( return PTR_ERR(kbuf); error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); + if (!error) + xfs_forget_acl(inode, name, flags); kfree(kbuf); return error; } @@ -504,9 +507,14 @@ xfs_attrmulti_attr_remove( unsigned char *name, __uint32_t flags) { + int error; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; - return xfs_attr_remove(XFS_I(inode), name, flags); + error = xfs_attr_remove(XFS_I(inode), name, flags); + if (!error) + xfs_forget_acl(inode, name, flags); + return error; } STATIC int diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 1e08d3e85cd0..8294f86441bf 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -53,6 +53,28 @@ xfs_xattr_get(struct dentry *dentry, const char *name, return asize; } +void +xfs_forget_acl( + struct inode *inode, + const char *name, + int xflags) +{ + /* + * Invalidate any cached ACLs if the user has bypassed the ACL + * interface. We don't validate the content whatsoever so it is caller + * responsibility to provide data in valid format and ensure i_mode is + * consistent. + */ + if (xflags & ATTR_ROOT) { +#ifdef CONFIG_XFS_POSIX_ACL + if (!strcmp(name, SGI_ACL_FILE)) + forget_cached_acl(inode, ACL_TYPE_ACCESS); + else if (!strcmp(name, SGI_ACL_DEFAULT)) + forget_cached_acl(inode, ACL_TYPE_DEFAULT); +#endif + } +} + static int xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int xflags) @@ -73,20 +95,8 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, return xfs_attr_remove(ip, (unsigned char *)name, xflags); error = xfs_attr_set(ip, (unsigned char *)name, (void *)value, size, xflags); - /* - * Invalidate any cached ACLs if the user has bypassed the ACL - * interface. We don't validate the content whatsoever so it is caller - * responsibility to provide data in valid format and ensure i_mode is - * consistent. - */ -#ifdef CONFIG_XFS_POSIX_ACL - if (!error && (xflags & ATTR_ROOT)) { - if (!strcmp(name, SGI_ACL_FILE)) - forget_cached_acl(VFS_I(ip), ACL_TYPE_ACCESS); - else if (!strcmp(name, SGI_ACL_DEFAULT)) - forget_cached_acl(VFS_I(ip), ACL_TYPE_DEFAULT); - } -#endif + if (!error) + xfs_forget_acl(d_inode(dentry), name, xflags); return error; } -- cgit v1.2.3-59-g8ed1b From af3b63822e73b66f3ca9927b46df8b873ab8c6ec Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 3 Nov 2015 13:06:34 +1100 Subject: xfs: don't leak uuid table on rmmod Don't leak the UUID table when the module is unloaded. (Found with kmemleak.) Signed-off-by: Darrick J. Wong Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_mount.c | 10 ++++++++++ fs/xfs/xfs_mount.h | 1 + fs/xfs/xfs_super.c | 1 + 3 files changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bf92e0c037c7..fe6fe8a889d5 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -47,6 +47,16 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; static uuid_t *xfs_uuid_table; +void +xfs_uuid_table_free(void) +{ + if (xfs_uuid_table_size == 0) + return; + kmem_free(xfs_uuid_table); + xfs_uuid_table = NULL; + xfs_uuid_table_size = 0; +} + /* * See if the UUID is unique among mounted XFS filesystems. * Mount fails if UUID is nil or a FS with the same UUID is already mounted. diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 7999e91cd49a..d6cbdcb168dd 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -312,6 +312,7 @@ typedef struct xfs_perag { int pagb_count; /* pagb slots in use */ } xfs_perag_t; +extern void xfs_uuid_table_free(void); extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 904f637cfa5f..29531ec19ba6 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1896,6 +1896,7 @@ exit_xfs_fs(void) xfs_mru_cache_uninit(); xfs_destroy_workqueues(); xfs_destroy_zones(); + xfs_uuid_table_free(); } module_init(init_xfs_fs); -- cgit v1.2.3-59-g8ed1b From fc0561cefc04e7803c0f6501ca4f310a502f65b8 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 3 Nov 2015 13:14:59 +1100 Subject: xfs: optimise away log forces on timestamp updates for fdatasync xfs: timestamp updates cause excessive fdatasync log traffic Sage Weil reported that a ceph test workload was writing to the log on every fdatasync during an overwrite workload. Event tracing showed that the only metadata modification being made was the timestamp updates during the write(2) syscall, but fdatasync(2) is supposed to ignore them. The key observation was that the transactions in the log all looked like this: INODE: #regs: 4 ino: 0x8b flags: 0x45 dsize: 32 And contained a flags field of 0x45 or 0x85, and had data and attribute forks following the inode core. This means that the timestamp updates were triggering dirty relogging of previously logged parts of the inode that hadn't yet been flushed back to disk. There are two parts to this problem. The first is that XFS relogs dirty regions in subsequent transactions, so it carries around the fields that have been dirtied since the last time the inode was written back to disk, not since the last time the inode was forced into the log. The second part is that on v5 filesystems, the inode change count update during inode dirtying also sets the XFS_ILOG_CORE flag, so on v5 filesystems this makes a timestamp update dirty the entire inode. As a result when fdatasync is run, it looks at the dirty fields in the inode, and sees more than just the timestamp flag, even though the only metadata change since the last fdatasync was just the timestamps. Hence we force the log on every subsequent fdatasync even though it is not needed. To fix this, add a new field to the inode log item that tracks changes since the last time fsync/fdatasync forced the log to flush the changes to the journal. This flag is updated when we dirty the inode, but we do it before updating the change count so it does not carry the "core dirty" flag from timestamp updates. The fields are zeroed when the inode is marked clean (due to writeback/freeing) or when an fsync/datasync forces the log. Hence if we only dirty the timestamps on the inode between fsync/fdatasync calls, the fdatasync will not trigger another log force. Over 100 runs of the test program: Ext4 baseline: runtime: 1.63s +/- 0.24s avg lat: 1.59ms +/- 0.24ms iops: ~2000 XFS, vanilla kernel: runtime: 2.45s +/- 0.18s avg lat: 2.39ms +/- 0.18ms log forces: ~400/s iops: ~1000 XFS, patched kernel: runtime: 1.49s +/- 0.26s avg lat: 1.46ms +/- 0.25ms log forces: ~30/s iops: ~1500 Reported-by: Sage Weil Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 21 ++++++++++++++++----- fs/xfs/xfs_inode.c | 2 ++ fs/xfs/xfs_inode_item.c | 1 + fs/xfs/xfs_inode_item.h | 1 + fs/xfs/xfs_trans_inode.c | 9 +++++++++ 5 files changed, 29 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e78feb400e22..c94699cbc667 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -242,19 +242,30 @@ xfs_file_fsync( } /* - * All metadata updates are logged, which means that we just have - * to flush the log up to the latest LSN that touched the inode. + * All metadata updates are logged, which means that we just have to + * flush the log up to the latest LSN that touched the inode. If we have + * concurrent fsync/fdatasync() calls, we need them to all block on the + * log force before we clear the ili_fsync_fields field. This ensures + * that we don't get a racing sync operation that does not wait for the + * metadata to hit the journal before returning. If we race with + * clearing the ili_fsync_fields, then all that will happen is the log + * force will do nothing as the lsn will already be on disk. We can't + * race with setting ili_fsync_fields because that is done under + * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared + * until after the ili_fsync_fields is cleared. */ xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_ipincount(ip)) { if (!datasync || - (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP)) + (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) lsn = ip->i_itemp->ili_last_lsn; } - xfs_iunlock(ip, XFS_ILOCK_SHARED); - if (lsn) + if (lsn) { error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed); + ip->i_itemp->ili_fsync_fields = 0; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); /* * If we only have a single device, and the log force about was diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index dc40a6d5ae0d..ff629d544706 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2365,6 +2365,7 @@ retry: iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; + iip->ili_fsync_fields = 0; iip->ili_logged = 1; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, &iip->ili_item.li_lsn); @@ -3560,6 +3561,7 @@ xfs_iflush_int( */ iip->ili_last_fields = iip->ili_fields; iip->ili_fields = 0; + iip->ili_fsync_fields = 0; iip->ili_logged = 1; xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 62bd80f4edd9..d14b12b8cfef 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -719,6 +719,7 @@ xfs_iflush_abort( * attempted. */ iip->ili_fields = 0; + iip->ili_fsync_fields = 0; } /* * Release the inode's flush lock since we're done with it. diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 488d81254e28..4c7722e325b3 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -34,6 +34,7 @@ typedef struct xfs_inode_log_item { unsigned short ili_logged; /* flushed logged data */ unsigned int ili_last_fields; /* fields when flushed */ unsigned int ili_fields; /* fields to be logged */ + unsigned int ili_fsync_fields; /* logged since last fsync */ } xfs_inode_log_item_t; static inline int xfs_inode_clean(xfs_inode_t *ip) diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 17280cd71934..b97f1df910ab 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -107,6 +107,15 @@ xfs_trans_log_inode( ASSERT(ip->i_itemp != NULL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + /* + * Record the specific change for fdatasync optimisation. This + * allows fdatasync to skip log forces for inodes that are only + * timestamp dirty. We do this before the change count so that + * the core being logged in this case does not impact on fdatasync + * behaviour. + */ + ip->i_itemp->ili_fsync_fields |= flags; + /* * First time we log the inode in a transaction, bump the inode change * counter if it is configured for this to occur. We don't use -- cgit v1.2.3-59-g8ed1b From edfb8ebce225a0638cf62591d4ccb502f052ffd4 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 10 Nov 2015 10:09:45 +1100 Subject: xfs: Fix error path in xfs_get_acl Error codes from xfs_attr_get other than -ENOATTR were not properly reported. Fix that. In addition, the declaration of struct xfs_inode in xfs_acl.h isn't needed. Signed-off-by: Andreas Gruenbacher Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_acl.c | 1 + fs/xfs/xfs_acl.h | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 763e36560681..6bb470fbb8e8 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -163,6 +163,7 @@ xfs_get_acl(struct inode *inode, int type) */ if (error == -ENOATTR) goto out_update_cache; + acl = ERR_PTR(error); goto out; } diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 75af0a4d9028..52f8255d6bdf 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -20,7 +20,6 @@ struct inode; struct posix_acl; -struct xfs_inode; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); -- cgit v1.2.3-59-g8ed1b From 848ccfc8fe0e8ae572ed0d8a9a2c3a0cda3bce3b Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 10 Nov 2015 10:10:33 +1100 Subject: xfs: fix log recovery op header validation assert Commit 89cebc84 ("xfs: validate transaction header length on log recovery") added additional validation of the on-disk op header length to protect from buffer overflow during log recovery. It accounts for the fact that the transaction header can be split across multiple op headers. It added an assert for when this occurs that verifies the length of the second part of a split transaction header is less than a full transaction header. In other words, it expects that the first op header of a split transaction header includes at least some portion of the transaction header. This expectation is not always valid as a zero-length op header can exist for the first op header of a split transaction header (see xlog_recover_add_to_trans() for details). This means that the second op header can have a valid, full length transaction header and thus the full header is copied in xlog_recover_add_to_cont_trans(). Fix the assert in xlog_recover_add_to_cont_trans() to handle this case correctly and require that the op header length is less than or equal to a full transaction header. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_log_recover.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 512a0945d52a..0d92b6f2355e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3431,7 +3431,7 @@ xlog_recover_add_to_cont_trans( * previous record. Copy the rest of the header. */ if (list_empty(&trans->r_itemq)) { - ASSERT(len < sizeof(struct xfs_trans_header)); + ASSERT(len <= sizeof(struct xfs_trans_header)); if (len > sizeof(struct xfs_trans_header)) { xfs_warn(log->l_mp, "%s: bad header length", __func__); return -EIO; -- cgit v1.2.3-59-g8ed1b From 7a29ac474a47eb8cf212b45917683ae89d6fa13b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 10 Nov 2015 10:10:34 +1100 Subject: xfs: give all workqueues rescuer threads We're consistently hitting deadlocks here with XFS on recent kernels. After some digging through the crash files, it looks like everyone in the system is waiting for XFS to reclaim memory. Something like this: PID: 2733434 TASK: ffff8808cd242800 CPU: 19 COMMAND: "java" #0 [ffff880019c53588] __schedule at ffffffff818c4df2 #1 [ffff880019c535d8] schedule at ffffffff818c5517 #2 [ffff880019c535f8] _xfs_log_force_lsn at ffffffff81316348 #3 [ffff880019c53688] xfs_log_force_lsn at ffffffff813164fb #4 [ffff880019c536b8] xfs_iunpin_wait at ffffffff8130835e #5 [ffff880019c53728] xfs_reclaim_inode at ffffffff812fd453 #6 [ffff880019c53778] xfs_reclaim_inodes_ag at ffffffff812fd8c7 #7 [ffff880019c53928] xfs_reclaim_inodes_nr at ffffffff812fe433 #8 [ffff880019c53958] xfs_fs_free_cached_objects at ffffffff8130d3b9 #9 [ffff880019c53968] super_cache_scan at ffffffff811a6f73 #10 [ffff880019c539c8] shrink_slab at ffffffff811460e6 #11 [ffff880019c53aa8] shrink_zone at ffffffff8114a53f #12 [ffff880019c53b48] do_try_to_free_pages at ffffffff8114a8ba #13 [ffff880019c53be8] try_to_free_pages at ffffffff8114ad5a #14 [ffff880019c53c78] __alloc_pages_nodemask at ffffffff8113e1b8 #15 [ffff880019c53d88] alloc_kmem_pages_node at ffffffff8113e671 #16 [ffff880019c53dd8] copy_process at ffffffff8104f781 #17 [ffff880019c53ec8] do_fork at ffffffff8105129c #18 [ffff880019c53f38] sys_clone at ffffffff810515b6 #19 [ffff880019c53f48] stub_clone at ffffffff818c8e4d xfs_log_force_lsn is waiting for logs to get cleaned, which is waiting for IO, which is waiting for workers to complete the IO which is waiting for worker threads that don't exist yet: PID: 2752451 TASK: ffff880bd6bdda00 CPU: 37 COMMAND: "kworker/37:1" #0 [ffff8808d20abbb0] __schedule at ffffffff818c4df2 #1 [ffff8808d20abc00] schedule at ffffffff818c5517 #2 [ffff8808d20abc20] schedule_timeout at ffffffff818c7c6c #3 [ffff8808d20abcc0] wait_for_completion_killable at ffffffff818c6495 #4 [ffff8808d20abd30] kthread_create_on_node at ffffffff8106ec82 #5 [ffff8808d20abdf0] create_worker at ffffffff8106752f #6 [ffff8808d20abe40] worker_thread at ffffffff810699be #7 [ffff8808d20abec0] kthread at ffffffff8106ef59 #8 [ffff8808d20abf50] ret_from_fork at ffffffff818c8ac8 I think we should be using WQ_MEM_RECLAIM to make sure this thread pool makes progress when we're not able to allocate new workers. [dchinner: make all workqueues WQ_MEM_RECLAIM] Signed-off-by: Chris Mason Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 29531ec19ba6..65fbfb7a029a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -838,17 +838,18 @@ xfs_init_mount_workqueues( goto out_destroy_unwritten; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - WQ_FREEZABLE, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_reclaim_workqueue) goto out_destroy_cil; mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", - WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0, + mp->m_fsname); if (!mp->m_log_workqueue) goto out_destroy_reclaim; mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", - WQ_FREEZABLE, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_eofblocks_workqueue) goto out_destroy_log; -- cgit v1.2.3-59-g8ed1b