From a245769f254bbbea868e2cf8dc42daa061cd276f Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 20 Jan 2012 10:38:36 +0000 Subject: GFS2: glock statistics gathering The stats are divided into two sets: those relating to the super block and those relating to an individual glock. The super block stats are done on a per cpu basis in order to try and reduce the overhead of gathering them. They are also further divided by glock type. In the case of both the super block and glock statistics, the same information is gathered in each case. The super block statistics are used to provide default values for most of the glock statistics, so that newly created glocks should have, as far as possible, a sensible starting point. The statistics are divided into three pairs of mean and variance, plus two counters. The mean/variance pairs are smoothed exponential estimates and the algorithm used is one which will be very familiar to those used to calculation of round trip times in network code. The three pairs of mean/variance measure the following things: 1. DLM lock time (non-blocking requests) 2. DLM lock time (blocking requests) 3. Inter-request time (again to the DLM) A non-blocking request is one which will complete right away, whatever the state of the DLM lock in question. That currently means any requests when (a) the current state of the lock is exclusive (b) the requested state is either null or unlocked or (c) the "try lock" flag is set. A blocking request covers all the other lock requests. There are two counters. The first is there primarily to show how many lock requests have been made, and thus how much data has gone into the mean/variance calculations. The other counter is counting queueing of holders at the top layer of the glock code. Hopefully that number will be a lot larger than the number of dlm lock requests issued. So why gather these statistics? There are several reasons we'd like to get a better idea of these timings: 1. To be able to better set the glock "min hold time" 2. To spot performance issues more easily 3. To improve the algorithm for selecting resource groups for allocation (to base it on lock wait time, rather than blindly using a "try lock") Due to the smoothing action of the updates, a step change in some input quantity being sampled will only fully be taken into account after 8 samples (or 4 for the variance) and this needs to be carefully considered when interpreting the results. Knowing both the time it takes a lock request to complete and the average time between lock requests for a glock means we can compute the total percentage of the time for which the node is able to use a glock vs. time that the rest of the cluster has its share. That will be very useful when setting the lock min hold time. The other point to remember is that all times are in nanoseconds. Great care has been taken to ensure that we measure exactly the quantities that we want, as accurately as possible. There are always inaccuracies in any measuring system, but I hope this is as accurate as we can reasonably make it. Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 210 +++++++++++++++++++++++++++++++++++++++++++++++++-- fs/gfs2/incore.h | 49 ++++++++++-- fs/gfs2/lock_dlm.c | 123 ++++++++++++++++++++++++++++-- fs/gfs2/ops_fstype.c | 8 ++ fs/gfs2/trace_gfs2.h | 60 ++++++++++++++- 5 files changed, 431 insertions(+), 19 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 351a3e797789..dab2526071cc 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -29,6 +29,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -543,6 +544,11 @@ __acquires(&gl->gl_spin) do_error(gl, 0); /* Fail queued try locks */ } gl->gl_req = target; + set_bit(GLF_BLOCKING, &gl->gl_flags); + if ((gl->gl_req == LM_ST_UNLOCKED) || + (gl->gl_state == LM_ST_EXCLUSIVE) || + (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) + clear_bit(GLF_BLOCKING, &gl->gl_flags); spin_unlock(&gl->gl_spin); if (glops->go_xmote_th) glops->go_xmote_th(gl); @@ -744,6 +750,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, return -ENOMEM; atomic_inc(&sdp->sd_glock_disposal); + gl->gl_sbd = sdp; gl->gl_flags = 0; gl->gl_name = name; atomic_set(&gl->gl_ref, 1); @@ -752,12 +759,17 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_demote_state = LM_ST_EXCLUSIVE; gl->gl_hash = hash; gl->gl_ops = glops; - snprintf(gl->gl_strname, GDLM_STRNAME_BYTES, "%8x%16llx", name.ln_type, (unsigned long long)number); + gl->gl_dstamp = ktime_set(0, 0); + preempt_disable(); + /* We use the global stats to estimate the initial per-glock stats */ + gl->gl_stats = this_cpu_ptr(sdp->sd_lkstats)->lkstats[glops->go_type]; + preempt_enable(); + gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; + gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0; memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); gl->gl_lksb.sb_lvbptr = gl->gl_lvb; gl->gl_tchange = jiffies; gl->gl_object = NULL; - gl->gl_sbd = sdp; gl->gl_hold_time = GL_GLOCK_DFT_HOLD; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); INIT_WORK(&gl->gl_delete, delete_work_func); @@ -999,6 +1011,8 @@ fail: } set_bit(GLF_QUEUED, &gl->gl_flags); trace_gfs2_glock_queue(gh, 1); + gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); + gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); if (likely(insert_pt == NULL)) { list_add_tail(&gh->gh_list, &gl->gl_holders); if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) @@ -1658,6 +1672,8 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'L'; if (gl->gl_object) *p++ = 'o'; + if (test_bit(GLF_BLOCKING, gflags)) + *p++ = 'b'; *p = 0; return buf; } @@ -1714,8 +1730,78 @@ out: return error; } +static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glock *gl = iter_ptr; + + seq_printf(seq, "G: n:%u/%llx rtt:%lld/%lld rttb:%lld/%lld irt:%lld/%lld dcnt: %lld qcnt: %lld\n", + gl->gl_name.ln_type, + (unsigned long long)gl->gl_name.ln_number, + (long long)gl->gl_stats.stats[GFS2_LKS_SRTT], + (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVAR], + (long long)gl->gl_stats.stats[GFS2_LKS_SRTTB], + (long long)gl->gl_stats.stats[GFS2_LKS_SRTTVARB], + (long long)gl->gl_stats.stats[GFS2_LKS_SIRT], + (long long)gl->gl_stats.stats[GFS2_LKS_SIRTVAR], + (long long)gl->gl_stats.stats[GFS2_LKS_DCOUNT], + (long long)gl->gl_stats.stats[GFS2_LKS_QCOUNT]); + return 0; +} + +static const char *gfs2_gltype[] = { + "type", + "reserved", + "nondisk", + "inode", + "rgrp", + "meta", + "iopen", + "flock", + "plock", + "quota", + "journal", +}; + +static const char *gfs2_stype[] = { + [GFS2_LKS_SRTT] = "srtt", + [GFS2_LKS_SRTTVAR] = "srttvar", + [GFS2_LKS_SRTTB] = "srttb", + [GFS2_LKS_SRTTVARB] = "srttvarb", + [GFS2_LKS_SIRT] = "sirt", + [GFS2_LKS_SIRTVAR] = "sirtvar", + [GFS2_LKS_DCOUNT] = "dlm", + [GFS2_LKS_QCOUNT] = "queue", +}; + +#define GFS2_NR_SBSTATS (ARRAY_SIZE(gfs2_gltype) * ARRAY_SIZE(gfs2_stype)) + +static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) +{ + struct gfs2_glock_iter *gi = seq->private; + struct gfs2_sbd *sdp = gi->sdp; + unsigned index = gi->hash >> 3; + unsigned subindex = gi->hash & 0x07; + s64 value; + int i; + + if (index == 0 && subindex != 0) + return 0; + seq_printf(seq, "%-10s %8s:", gfs2_gltype[index], + (index == 0) ? "cpu": gfs2_stype[subindex]); + for_each_possible_cpu(i) { + const struct gfs2_pcpu_lkstats *lkstats = per_cpu_ptr(sdp->sd_lkstats, i); + if (index == 0) { + value = i; + } else { + value = lkstats->lkstats[index - 1].stats[subindex]; + } + seq_printf(seq, " %15lld", (long long)value); + } + seq_putc(seq, '\n'); + return 0; +} int __init gfs2_glock_init(void) { @@ -1828,6 +1914,35 @@ static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) return dump_glock(seq, iter_ptr); } +static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct gfs2_glock_iter *gi = seq->private; + + gi->hash = *pos; + if (*pos >= GFS2_NR_SBSTATS) + return NULL; + preempt_disable(); + return SEQ_START_TOKEN; +} + +static void *gfs2_sbstats_seq_next(struct seq_file *seq, void *iter_ptr, + loff_t *pos) +{ + struct gfs2_glock_iter *gi = seq->private; + (*pos)++; + gi->hash++; + if (gi->hash >= GFS2_NR_SBSTATS) { + preempt_enable(); + return NULL; + } + return SEQ_START_TOKEN; +} + +static void gfs2_sbstats_seq_stop(struct seq_file *seq, void *iter_ptr) +{ + preempt_enable(); +} + static const struct seq_operations gfs2_glock_seq_ops = { .start = gfs2_glock_seq_start, .next = gfs2_glock_seq_next, @@ -1835,7 +1950,21 @@ static const struct seq_operations gfs2_glock_seq_ops = { .show = gfs2_glock_seq_show, }; -static int gfs2_debugfs_open(struct inode *inode, struct file *file) +static const struct seq_operations gfs2_glstats_seq_ops = { + .start = gfs2_glock_seq_start, + .next = gfs2_glock_seq_next, + .stop = gfs2_glock_seq_stop, + .show = gfs2_glstats_seq_show, +}; + +static const struct seq_operations gfs2_sbstats_seq_ops = { + .start = gfs2_sbstats_seq_start, + .next = gfs2_sbstats_seq_next, + .stop = gfs2_sbstats_seq_stop, + .show = gfs2_sbstats_seq_show, +}; + +static int gfs2_glocks_open(struct inode *inode, struct file *file) { int ret = seq_open_private(file, &gfs2_glock_seq_ops, sizeof(struct gfs2_glock_iter)); @@ -1847,9 +1976,49 @@ static int gfs2_debugfs_open(struct inode *inode, struct file *file) return ret; } -static const struct file_operations gfs2_debug_fops = { +static int gfs2_glstats_open(struct inode *inode, struct file *file) +{ + int ret = seq_open_private(file, &gfs2_glstats_seq_ops, + sizeof(struct gfs2_glock_iter)); + if (ret == 0) { + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + gi->sdp = inode->i_private; + } + return ret; +} + +static int gfs2_sbstats_open(struct inode *inode, struct file *file) +{ + int ret = seq_open_private(file, &gfs2_sbstats_seq_ops, + sizeof(struct gfs2_glock_iter)); + if (ret == 0) { + struct seq_file *seq = file->private_data; + struct gfs2_glock_iter *gi = seq->private; + gi->sdp = inode->i_private; + } + return ret; +} + +static const struct file_operations gfs2_glocks_fops = { + .owner = THIS_MODULE, + .open = gfs2_glocks_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static const struct file_operations gfs2_glstats_fops = { .owner = THIS_MODULE, - .open = gfs2_debugfs_open, + .open = gfs2_glstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static const struct file_operations gfs2_sbstats_fops = { + .owner = THIS_MODULE, + .open = gfs2_sbstats_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_private, @@ -1863,20 +2032,45 @@ int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) sdp->debugfs_dentry_glocks = debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp, - &gfs2_debug_fops); + &gfs2_glocks_fops); if (!sdp->debugfs_dentry_glocks) - return -ENOMEM; + goto fail; + + sdp->debugfs_dentry_glstats = debugfs_create_file("glstats", + S_IFREG | S_IRUGO, + sdp->debugfs_dir, sdp, + &gfs2_glstats_fops); + if (!sdp->debugfs_dentry_glstats) + goto fail; + + sdp->debugfs_dentry_sbstats = debugfs_create_file("sbstats", + S_IFREG | S_IRUGO, + sdp->debugfs_dir, sdp, + &gfs2_sbstats_fops); + if (!sdp->debugfs_dentry_sbstats) + goto fail; return 0; +fail: + gfs2_delete_debugfs_file(sdp); + return -ENOMEM; } void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp) { - if (sdp && sdp->debugfs_dir) { + if (sdp->debugfs_dir) { if (sdp->debugfs_dentry_glocks) { debugfs_remove(sdp->debugfs_dentry_glocks); sdp->debugfs_dentry_glocks = NULL; } + if (sdp->debugfs_dentry_glstats) { + debugfs_remove(sdp->debugfs_dentry_glstats); + sdp->debugfs_dentry_glstats = NULL; + } + if (sdp->debugfs_dentry_sbstats) { + debugfs_remove(sdp->debugfs_dentry_sbstats); + sdp->debugfs_dentry_sbstats = NULL; + } debugfs_remove(sdp->debugfs_dir); sdp->debugfs_dir = NULL; } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 97742a7ea9cc..4d546df58ac9 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -19,6 +19,8 @@ #include #include #include +#include +#include #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -204,6 +206,22 @@ struct gfs2_glock_operations { #define GLOF_ASPACE 1 }; +enum { + GFS2_LKS_SRTT = 0, /* Non blocking smoothed round trip time */ + GFS2_LKS_SRTTVAR = 1, /* Non blocking smoothed variance */ + GFS2_LKS_SRTTB = 2, /* Blocking smoothed round trip time */ + GFS2_LKS_SRTTVARB = 3, /* Blocking smoothed variance */ + GFS2_LKS_SIRT = 4, /* Smoothed Inter-request time */ + GFS2_LKS_SIRTVAR = 5, /* Smoothed Inter-request variance */ + GFS2_LKS_DCOUNT = 6, /* Count of dlm requests */ + GFS2_LKS_QCOUNT = 7, /* Count of gfs2_holder queues */ + GFS2_NR_LKSTATS +}; + +struct gfs2_lkstats { + s64 stats[GFS2_NR_LKSTATS]; +}; + enum { /* States */ HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ @@ -238,10 +256,12 @@ enum { GLF_QUEUED = 12, GLF_LRU = 13, GLF_OBJECT = 14, /* Used only for tracing */ + GLF_BLOCKING = 15, }; struct gfs2_glock { struct hlist_bl_node gl_list; + struct gfs2_sbd *gl_sbd; unsigned long gl_flags; /* GLF_... */ struct lm_lockname gl_name; atomic_t gl_ref; @@ -261,16 +281,14 @@ struct gfs2_glock { struct list_head gl_holders; const struct gfs2_glock_operations *gl_ops; - char gl_strname[GDLM_STRNAME_BYTES]; + ktime_t gl_dstamp; + struct gfs2_lkstats gl_stats; struct dlm_lksb gl_lksb; char gl_lvb[32]; unsigned long gl_tchange; void *gl_object; struct list_head gl_lru; - - struct gfs2_sbd *gl_sbd; - struct list_head gl_ail_list; atomic_t gl_ail_count; atomic_t gl_revokes; @@ -560,8 +578,14 @@ struct lm_lockstruct { uint32_t *ls_recover_result; /* result of last jid recovery */ }; +struct gfs2_pcpu_lkstats { + /* One struct for each glock type */ + struct gfs2_lkstats lkstats[10]; +}; + struct gfs2_sbd { struct super_block *sd_vfs; + struct gfs2_pcpu_lkstats __percpu *sd_lkstats; struct kobject sd_kobj; unsigned long sd_flags; /* SDF_... */ struct gfs2_sb_host sd_sb; @@ -725,8 +749,23 @@ struct gfs2_sbd { unsigned long sd_last_warning; struct dentry *debugfs_dir; /* debugfs directory */ - struct dentry *debugfs_dentry_glocks; /* for debugfs */ + struct dentry *debugfs_dentry_glocks; + struct dentry *debugfs_dentry_glstats; + struct dentry *debugfs_dentry_sbstats; }; +static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which) +{ + gl->gl_stats.stats[which]++; +} + +static inline void gfs2_sbstats_inc(const struct gfs2_glock *gl, int which) +{ + const struct gfs2_sbd *sdp = gl->gl_sbd; + preempt_disable(); + this_cpu_ptr(sdp->sd_lkstats)->lkstats[gl->gl_name.ln_type].stats[which]++; + preempt_enable(); +} + #endif /* __INCORE_DOT_H__ */ diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 8944d1e32ab5..f8411bd1b805 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -18,14 +18,106 @@ #include "glock.h" #include "util.h" #include "sys.h" +#include "trace_gfs2.h" extern struct workqueue_struct *gfs2_control_wq; +/** + * gfs2_update_stats - Update time based stats + * @mv: Pointer to mean/variance structure to update + * @sample: New data to include + * + * @delta is the difference between the current rtt sample and the + * running average srtt. We add 1/8 of that to the srtt in order to + * update the current srtt estimate. The varience estimate is a bit + * more complicated. We subtract the abs value of the @delta from + * the current variance estimate and add 1/4 of that to the running + * total. + * + * Note that the index points at the array entry containing the smoothed + * mean value, and the variance is always in the following entry + * + * Reference: TCP/IP Illustrated, vol 2, p. 831,832 + * All times are in units of integer nanoseconds. Unlike the TCP/IP case, + * they are not scaled fixed point. + */ + +static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index, + s64 sample) +{ + s64 delta = sample - s->stats[index]; + s->stats[index] += (delta >> 3); + index++; + s->stats[index] += ((abs64(delta) - s->stats[index]) >> 2); +} + +/** + * gfs2_update_reply_times - Update locking statistics + * @gl: The glock to update + * + * This assumes that gl->gl_dstamp has been set earlier. + * + * The rtt (lock round trip time) is an estimate of the time + * taken to perform a dlm lock request. We update it on each + * reply from the dlm. + * + * The blocking flag is set on the glock for all dlm requests + * which may potentially block due to lock requests from other nodes. + * DLM requests where the current lock state is exclusive, the + * requested state is null (or unlocked) or where the TRY or + * TRY_1CB flags are set are classified as non-blocking. All + * other DLM requests are counted as (potentially) blocking. + */ +static inline void gfs2_update_reply_times(struct gfs2_glock *gl) +{ + struct gfs2_pcpu_lkstats *lks; + const unsigned gltype = gl->gl_name.ln_type; + unsigned index = test_bit(GLF_BLOCKING, &gl->gl_flags) ? + GFS2_LKS_SRTTB : GFS2_LKS_SRTT; + s64 rtt; + + preempt_disable(); + rtt = ktime_to_ns(ktime_sub(ktime_get_real(), gl->gl_dstamp)); + lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats); + gfs2_update_stats(&gl->gl_stats, index, rtt); /* Local */ + gfs2_update_stats(&lks->lkstats[gltype], index, rtt); /* Global */ + preempt_enable(); + + trace_gfs2_glock_lock_time(gl, rtt); +} + +/** + * gfs2_update_request_times - Update locking statistics + * @gl: The glock to update + * + * The irt (lock inter-request times) measures the average time + * between requests to the dlm. It is updated immediately before + * each dlm call. + */ + +static inline void gfs2_update_request_times(struct gfs2_glock *gl) +{ + struct gfs2_pcpu_lkstats *lks; + const unsigned gltype = gl->gl_name.ln_type; + ktime_t dstamp; + s64 irt; + + preempt_disable(); + dstamp = gl->gl_dstamp; + gl->gl_dstamp = ktime_get_real(); + irt = ktime_to_ns(ktime_sub(gl->gl_dstamp, dstamp)); + lks = this_cpu_ptr(gl->gl_sbd->sd_lkstats); + gfs2_update_stats(&gl->gl_stats, GFS2_LKS_SIRT, irt); /* Local */ + gfs2_update_stats(&lks->lkstats[gltype], GFS2_LKS_SIRT, irt); /* Global */ + preempt_enable(); +} + static void gdlm_ast(void *arg) { struct gfs2_glock *gl = arg; unsigned ret = gl->gl_state; + gfs2_update_reply_times(gl); BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) @@ -111,7 +203,7 @@ static int make_mode(const unsigned int lmstate) static u32 make_flags(const u32 lkid, const unsigned int gfs_flags, const int req) { - u32 lkf = 0; + u32 lkf = DLM_LKF_VALBLK; if (gfs_flags & LM_FLAG_TRY) lkf |= DLM_LKF_NOQUEUE; @@ -138,26 +230,43 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags, if (lkid != 0) lkf |= DLM_LKF_CONVERT; - lkf |= DLM_LKF_VALBLK; - return lkf; } +static void gfs2_reverse_hex(char *c, u64 value) +{ + while (value) { + *c-- = hex_asc[value & 0x0f]; + value >>= 4; + } +} + static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, unsigned int flags) { struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; int req; u32 lkf; + char strname[GDLM_STRNAME_BYTES] = ""; req = make_mode(req_state); lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); - + gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); + gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); + if (gl->gl_lksb.sb_lkid) { + gfs2_update_request_times(gl); + } else { + memset(strname, ' ', GDLM_STRNAME_BYTES - 1); + strname[GDLM_STRNAME_BYTES - 1] = '\0'; + gfs2_reverse_hex(strname + 7, gl->gl_name.ln_type); + gfs2_reverse_hex(strname + 23, gl->gl_name.ln_number); + gl->gl_dstamp = ktime_get_real(); + } /* * Submit the actual lock request. */ - return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, gl->gl_strname, + return dlm_lock(ls->ls_dlm, req, &gl->gl_lksb, lkf, strname, GDLM_STRNAME_BYTES - 1, 0, gdlm_ast, gl, gdlm_bast); } @@ -172,6 +281,10 @@ static void gdlm_put_lock(struct gfs2_glock *gl) return; } + clear_bit(GLF_BLOCKING, &gl->gl_flags); + gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); + gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); + gfs2_update_request_times(gl); error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, NULL, gl); if (error) { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 24f609c9ef91..a55baa7f3239 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -68,6 +68,12 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) sb->s_fs_info = sdp; sdp->sd_vfs = sb; + sdp->sd_lkstats = alloc_percpu(struct gfs2_pcpu_lkstats); + if (!sdp->sd_lkstats) { + kfree(sdp); + return NULL; + } + set_bit(SDF_NOJOURNALID, &sdp->sd_flags); gfs2_tune_init(&sdp->sd_tune); @@ -1221,6 +1227,7 @@ fail_sys: gfs2_sys_fs_del(sdp); fail: gfs2_delete_debugfs_file(sdp); + free_percpu(sdp->sd_lkstats); kfree(sdp); sb->s_fs_info = NULL; return error; @@ -1393,6 +1400,7 @@ static void gfs2_kill_sb(struct super_block *sb) shrink_dcache_sb(sb); kill_block_super(sb); gfs2_delete_debugfs_file(sdp); + free_percpu(sdp->sd_lkstats); kfree(sdp); } diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 5d07609ec57d..dfa89cd75534 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -11,6 +11,7 @@ #include #include #include +#include #include "incore.h" #include "glock.h" @@ -43,7 +44,8 @@ {(1UL << GLF_FROZEN), "F" }, \ {(1UL << GLF_QUEUED), "q" }, \ {(1UL << GLF_LRU), "L" }, \ - {(1UL << GLF_OBJECT), "o" }) + {(1UL << GLF_OBJECT), "o" }, \ + {(1UL << GLF_BLOCKING), "b" }) #ifndef NUMPTY #define NUMPTY @@ -236,6 +238,62 @@ TRACE_EVENT(gfs2_glock_queue, glock_trace_name(__entry->state)) ); +/* DLM sends a reply to GFS2 */ +TRACE_EVENT(gfs2_glock_lock_time, + + TP_PROTO(const struct gfs2_glock *gl, s64 tdiff), + + TP_ARGS(gl, tdiff), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( u64, glnum ) + __field( u32, gltype ) + __field( int, status ) + __field( char, flags ) + __field( s64, tdiff ) + __field( s64, srtt ) + __field( s64, srttvar ) + __field( s64, srttb ) + __field( s64, srttvarb ) + __field( s64, sirt ) + __field( s64, sirtvar ) + __field( s64, dcount ) + __field( s64, qcount ) + ), + + TP_fast_assign( + __entry->dev = gl->gl_sbd->sd_vfs->s_dev; + __entry->glnum = gl->gl_name.ln_number; + __entry->gltype = gl->gl_name.ln_type; + __entry->status = gl->gl_lksb.sb_status; + __entry->flags = gl->gl_lksb.sb_flags; + __entry->tdiff = tdiff; + __entry->srtt = gl->gl_stats.stats[GFS2_LKS_SRTT]; + __entry->srttvar = gl->gl_stats.stats[GFS2_LKS_SRTTVAR]; + __entry->srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB]; + __entry->srttvarb = gl->gl_stats.stats[GFS2_LKS_SRTTVARB]; + __entry->sirt = gl->gl_stats.stats[GFS2_LKS_SIRT]; + __entry->sirtvar = gl->gl_stats.stats[GFS2_LKS_SIRTVAR]; + __entry->dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT]; + __entry->qcount = gl->gl_stats.stats[GFS2_LKS_QCOUNT]; + ), + + TP_printk("%u,%u glock %d:%lld status:%d flags:%02x tdiff:%lld srtt:%lld/%lld srttb:%lld/%lld sirt:%lld/%lld dcnt:%lld qcnt:%lld", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, + (unsigned long long)__entry->glnum, + __entry->status, __entry->flags, + (long long)__entry->tdiff, + (long long)__entry->srtt, + (long long)__entry->srttvar, + (long long)__entry->srttb, + (long long)__entry->srttvarb, + (long long)__entry->sirt, + (long long)__entry->sirtvar, + (long long)__entry->dcount, + (long long)__entry->qcount) +); + /* Section 2 - Log/journal * * Objectives: -- cgit v1.2.3-59-g8ed1b From 47ac5537a794fc71f89d51af492a945bd233f70c Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 3 Feb 2012 15:21:59 +0000 Subject: GFS2: Move two functions from log.c to lops.c gfs2_log_get_buf() and gfs2_log_fake_buf() are both used only in lops.c, so move them next to their callers and they can then become static. Signed-off-by: Steven Whitehouse --- fs/gfs2/log.c | 101 +++------------------------------------------------------ fs/gfs2/log.h | 5 +-- fs/gfs2/lops.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 101 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 756fae9eaf8f..4d31379265cb 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -358,7 +358,7 @@ retry: return 0; } -static u64 log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) +u64 gfs2_log_bmap(struct gfs2_sbd *sdp, unsigned int lbn) { struct gfs2_journal_extent *je; @@ -467,8 +467,8 @@ static unsigned int current_tail(struct gfs2_sbd *sdp) void gfs2_log_incr_head(struct gfs2_sbd *sdp) { - if (sdp->sd_log_flush_head == sdp->sd_log_tail) - BUG_ON(sdp->sd_log_flush_head != sdp->sd_log_head); + BUG_ON((sdp->sd_log_flush_head == sdp->sd_log_tail) && + (sdp->sd_log_flush_head != sdp->sd_log_head)); if (++sdp->sd_log_flush_head == sdp->sd_jdesc->jd_blocks) { sdp->sd_log_flush_head = 0; @@ -476,99 +476,6 @@ void gfs2_log_incr_head(struct gfs2_sbd *sdp) } } -/** - * gfs2_log_write_endio - End of I/O for a log buffer - * @bh: The buffer head - * @uptodate: I/O Status - * - */ - -static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate) -{ - struct gfs2_sbd *sdp = bh->b_private; - bh->b_private = NULL; - - end_buffer_write_sync(bh, uptodate); - if (atomic_dec_and_test(&sdp->sd_log_in_flight)) - wake_up(&sdp->sd_log_flush_wait); -} - -/** - * gfs2_log_get_buf - Get and initialize a buffer to use for log control data - * @sdp: The GFS2 superblock - * - * Returns: the buffer_head - */ - -struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp) -{ - u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); - struct buffer_head *bh; - - bh = sb_getblk(sdp->sd_vfs, blkno); - lock_buffer(bh); - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); - gfs2_log_incr_head(sdp); - atomic_inc(&sdp->sd_log_in_flight); - bh->b_private = sdp; - bh->b_end_io = gfs2_log_write_endio; - - return bh; -} - -/** - * gfs2_fake_write_endio - - * @bh: The buffer head - * @uptodate: The I/O Status - * - */ - -static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate) -{ - struct buffer_head *real_bh = bh->b_private; - struct gfs2_bufdata *bd = real_bh->b_private; - struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd; - - end_buffer_write_sync(bh, uptodate); - free_buffer_head(bh); - unlock_buffer(real_bh); - brelse(real_bh); - if (atomic_dec_and_test(&sdp->sd_log_in_flight)) - wake_up(&sdp->sd_log_flush_wait); -} - -/** - * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log - * @sdp: the filesystem - * @data: the data the buffer_head should point to - * - * Returns: the log buffer descriptor - */ - -struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, - struct buffer_head *real) -{ - u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); - struct buffer_head *bh; - - bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL); - atomic_set(&bh->b_count, 1); - bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock); - set_bh_page(bh, real->b_page, bh_offset(real)); - bh->b_blocknr = blkno; - bh->b_size = sdp->sd_sb.sb_bsize; - bh->b_bdev = sdp->sd_vfs->s_bdev; - bh->b_private = real; - bh->b_end_io = gfs2_fake_write_endio; - - gfs2_log_incr_head(sdp); - atomic_inc(&sdp->sd_log_in_flight); - - return bh; -} - static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) { unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail); @@ -592,7 +499,7 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) { - u64 blkno = log_bmap(sdp, sdp->sd_log_flush_head); + u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head); struct buffer_head *bh; struct gfs2_log_header *lh; unsigned int tail; diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index ab0621698b73..ff07454b582c 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -53,10 +53,7 @@ extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); - -extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp); -extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, - struct buffer_head *real); +extern u64 gfs2_log_bmap(struct gfs2_sbd *sdp, unsigned int lbn); extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl); extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans); extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 0301be655b12..8e323c4b7983 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -143,6 +143,98 @@ static inline __be64 *bh_ptr_end(struct buffer_head *bh) return (__force __be64 *)(bh->b_data + bh->b_size); } +/** + * gfs2_log_write_endio - End of I/O for a log buffer + * @bh: The buffer head + * @uptodate: I/O Status + * + */ + +static void gfs2_log_write_endio(struct buffer_head *bh, int uptodate) +{ + struct gfs2_sbd *sdp = bh->b_private; + bh->b_private = NULL; + + end_buffer_write_sync(bh, uptodate); + if (atomic_dec_and_test(&sdp->sd_log_in_flight)) + wake_up(&sdp->sd_log_flush_wait); +} + +/** + * gfs2_log_get_buf - Get and initialize a buffer to use for log control data + * @sdp: The GFS2 superblock + * + * tReturns: the buffer_head + */ + +static struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp) +{ + u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head); + struct buffer_head *bh; + + bh = sb_getblk(sdp->sd_vfs, blkno); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + gfs2_log_incr_head(sdp); + atomic_inc(&sdp->sd_log_in_flight); + bh->b_private = sdp; + bh->b_end_io = gfs2_log_write_endio; + + return bh; +} + +/** + * gfs2_fake_write_endio - + * @bh: The buffer head + * @uptodate: The I/O Status + * + */ + +static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate) +{ + struct buffer_head *real_bh = bh->b_private; + struct gfs2_bufdata *bd = real_bh->b_private; + struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd; + + end_buffer_write_sync(bh, uptodate); + free_buffer_head(bh); + unlock_buffer(real_bh); + brelse(real_bh); + if (atomic_dec_and_test(&sdp->sd_log_in_flight)) + wake_up(&sdp->sd_log_flush_wait); +} + +/** + * gfs2_log_fake_buf - Build a fake buffer head to write metadata buffer to log + * @sdp: the filesystem + * @data: the data the buffer_head should point to + * + * Returns: the log buffer descriptor + */ + +static struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, + struct buffer_head *real) +{ + u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head); + struct buffer_head *bh; + + bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL); + atomic_set(&bh->b_count, 1); + bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock); + set_bh_page(bh, real->b_page, bh_offset(real)); + bh->b_blocknr = blkno; + bh->b_size = sdp->sd_sb.sb_bsize; + bh->b_bdev = sdp->sd_vfs->s_bdev; + bh->b_private = real; + bh->b_end_io = gfs2_fake_write_endio; + + gfs2_log_incr_head(sdp); + atomic_inc(&sdp->sd_log_in_flight); + + return bh; +} static struct buffer_head *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type) { -- cgit v1.2.3-59-g8ed1b From 66fc061bda3526650328b73f69985da3518c4256 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 8 Feb 2012 12:58:32 +0000 Subject: GFS2: FITRIM ioctl support The FITRIM ioctl provides an alternative way to send discard requests to the underlying device. Using the discard mount option results in every freed block generating a discard request to the block device. This can be slow, since many block devices can only process discard requests of larger sizes, and also such operations can be time consuming. Rather than using the discard mount option, FITRIM allows a sweep of the filesystem on an occasional basis, and also to optionally avoid sending down discard requests for smaller regions. In GFS2 FITRIM will work at resource group granularity. There is a flag for each resource group which keeps track of which resource groups have been trimmed. This flag is reset whenever a deallocation occurs in the resource group, and set whenever a successful FITRIM of that resource group has taken place. This helps to reduce repeated discard requests for the same block ranges, again improving performance. Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 2 + fs/gfs2/inode.c | 4 +- fs/gfs2/lops.c | 2 +- fs/gfs2/rgrp.c | 164 +++++++++++++++++++++++++++++++++++++------- fs/gfs2/rgrp.h | 10 +-- fs/gfs2/super.c | 2 +- fs/gfs2/xattr.c | 4 +- include/linux/gfs2_ondisk.h | 1 + 8 files changed, 153 insertions(+), 36 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c5fb3597f696..310f2fb6f7ea 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -313,6 +313,8 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return gfs2_get_flags(filp, (u32 __user *)arg); case FS_IOC_SETFLAGS: return gfs2_set_flags(filp, (u32 __user *)arg); + case FITRIM: + return gfs2_fitrim(filp, (void __user *)arg); } return -ENOTTY; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 56987460cdae..c98a60ee6dfd 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1036,7 +1036,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1); if (!rgd) goto out_inodes; @@ -1255,7 +1255,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, * this is the case of the target file already existing * so we unlink before doing the rename */ - nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr); + nrgd = gfs2_blk2rgrpd(sdp, nip->i_no_addr, 1); if (nrgd) gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); } diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 8e323c4b7983..fe369bd9e10c 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -76,7 +76,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd) if (bi->bi_clone == 0) return; if (sdp->sd_args.ar_discard) - gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi); + gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL); memcpy(bi->bi_clone + bi->bi_offset, bd->bd_bh->b_data + bi->bi_offset, bi->bi_len); clear_bit(GBF_FULL, &bi->bi_flags); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 49ada95209d0..1446b4e0ac73 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -327,23 +327,31 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) * Returns: The resource group, or NULL if not found */ -struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk) +struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact) { - struct rb_node **newn; + struct rb_node *n, *next; struct gfs2_rgrpd *cur; spin_lock(&sdp->sd_rindex_spin); - newn = &sdp->sd_rindex_tree.rb_node; - while (*newn) { - cur = rb_entry(*newn, struct gfs2_rgrpd, rd_node); + n = sdp->sd_rindex_tree.rb_node; + while (n) { + cur = rb_entry(n, struct gfs2_rgrpd, rd_node); + next = NULL; if (blk < cur->rd_addr) - newn = &((*newn)->rb_left); + next = n->rb_left; else if (blk >= cur->rd_data0 + cur->rd_data) - newn = &((*newn)->rb_right); - else { + next = n->rb_right; + if (next == NULL) { spin_unlock(&sdp->sd_rindex_spin); + if (exact) { + if (blk < cur->rd_addr) + return NULL; + if (blk >= cur->rd_data0 + cur->rd_data) + return NULL; + } return cur; } + n = next; } spin_unlock(&sdp->sd_rindex_spin); @@ -810,9 +818,9 @@ void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) } -void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, +int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct buffer_head *bh, - const struct gfs2_bitmap *bi) + const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed) { struct super_block *sb = sdp->sd_vfs; struct block_device *bdev = sb->s_bdev; @@ -823,11 +831,19 @@ void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, sector_t nr_sects = 0; int rv; unsigned int x; + u32 trimmed = 0; + u8 diff; for (x = 0; x < bi->bi_len; x++) { - const u8 *orig = bh->b_data + bi->bi_offset + x; - const u8 *clone = bi->bi_clone + bi->bi_offset + x; - u8 diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); + const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data; + clone += bi->bi_offset; + clone += x; + if (bh) { + const u8 *orig = bh->b_data + bi->bi_offset + x; + diff = ~(*orig | (*orig >> 1)) & (*clone | (*clone >> 1)); + } else { + diff = ~(*clone | (*clone >> 1)); + } diff &= 0x55; if (diff == 0) continue; @@ -838,11 +854,14 @@ void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, if (nr_sects == 0) goto start_new_extent; if ((start + nr_sects) != blk) { - rv = blkdev_issue_discard(bdev, start, - nr_sects, GFP_NOFS, - 0); - if (rv) - goto fail; + if (nr_sects >= minlen) { + rv = blkdev_issue_discard(bdev, + start, nr_sects, + GFP_NOFS, 0); + if (rv) + goto fail; + trimmed += nr_sects; + } nr_sects = 0; start_new_extent: start = blk; @@ -853,15 +872,108 @@ start_new_extent: blk += sects_per_blk; } } - if (nr_sects) { + if (nr_sects >= minlen) { rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, 0); if (rv) goto fail; + trimmed += nr_sects; } - return; + if (ptrimmed) + *ptrimmed = trimmed; + return 0; + fail: - fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv); + if (sdp->sd_args.ar_discard) + fs_warn(sdp, "error %d on discard request, turning discards off for this filesystem", rv); sdp->sd_args.ar_discard = 0; + return -EIO; +} + +/** + * gfs2_fitrim - Generate discard requests for unused bits of the filesystem + * @filp: Any file on the filesystem + * @argp: Pointer to the arguments (also used to pass result) + * + * Returns: 0 on success, otherwise error code + */ + +int gfs2_fitrim(struct file *filp, void __user *argp) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct request_queue *q = bdev_get_queue(sdp->sd_vfs->s_bdev); + struct buffer_head *bh; + struct gfs2_rgrpd *rgd; + struct gfs2_rgrpd *rgd_end; + struct gfs2_holder gh; + struct fstrim_range r; + int ret = 0; + u64 amt; + u64 trimmed = 0; + unsigned int x; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + ret = gfs2_rindex_update(sdp); + if (ret) + return ret; + + if (argp == NULL) { + r.start = 0; + r.len = ULLONG_MAX; + r.minlen = 0; + } else if (copy_from_user(&r, argp, sizeof(r))) + return -EFAULT; + + rgd = gfs2_blk2rgrpd(sdp, r.start, 0); + rgd_end = gfs2_blk2rgrpd(sdp, r.start + r.len, 0); + + while (1) { + + ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + goto out; + + if (!(rgd->rd_flags & GFS2_RGF_TRIMMED)) { + /* Trim each bitmap in the rgrp */ + for (x = 0; x < rgd->rd_length; x++) { + struct gfs2_bitmap *bi = rgd->rd_bits + x; + ret = gfs2_rgrp_send_discards(sdp, rgd->rd_data0, NULL, bi, r.minlen, &amt); + if (ret) { + gfs2_glock_dq_uninit(&gh); + goto out; + } + trimmed += amt; + } + + /* Mark rgrp as having been trimmed */ + ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0); + if (ret == 0) { + bh = rgd->rd_bits[0].bi_bh; + rgd->rd_flags |= GFS2_RGF_TRIMMED; + gfs2_trans_add_bh(rgd->rd_gl, bh, 1); + gfs2_rgrp_out(rgd, bh->b_data); + gfs2_trans_end(sdp); + } + } + gfs2_glock_dq_uninit(&gh); + + if (rgd == rgd_end) + break; + + rgd = gfs2_rgrpd_get_next(rgd); + } + +out: + r.len = trimmed << 9; + if (argp && copy_to_user(argp, &r, sizeof(r))) + return -EFAULT; + + return ret; } /** @@ -1008,7 +1120,7 @@ static int get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) rgd = begin = ip->i_rgd; else - rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal); + rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); if (rgd == NULL) return -EBADSLT; @@ -1293,7 +1405,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, u32 length, rgrp_blk, buf_blk; unsigned int buf; - rgd = gfs2_blk2rgrpd(sdp, bstart); + rgd = gfs2_blk2rgrpd(sdp, bstart, 1); if (!rgd) { if (gfs2_consist(sdp)) fs_err(sdp, "block = %llu\n", (unsigned long long)bstart); @@ -1474,7 +1586,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) return; trace_gfs2_block_alloc(ip, bstart, blen, GFS2_BLKST_FREE); rgd->rd_free += blen; - + rgd->rd_flags &= ~GFS2_RGF_TRIMMED; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); @@ -1567,7 +1679,7 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) return error; error = -EINVAL; - rgd = gfs2_blk2rgrpd(sdp, no_addr); + rgd = gfs2_blk2rgrpd(sdp, no_addr, 1); if (!rgd) goto fail; @@ -1610,7 +1722,7 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, block)) rgd = ip->i_rgd; else - rgd = gfs2_blk2rgrpd(sdp, block); + rgd = gfs2_blk2rgrpd(sdp, block, 1); if (!rgd) { fs_err(sdp, "rlist_add: no rgrp for block %llu\n", (unsigned long long)block); return; diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index ceec9106cdf4..b4b10f4de25f 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -11,6 +11,7 @@ #define __RGRP_DOT_H__ #include +#include struct gfs2_rgrpd; struct gfs2_sbd; @@ -18,7 +19,7 @@ struct gfs2_holder; extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); -extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); +extern struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact); extern struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); @@ -62,8 +63,9 @@ extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); -extern void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, - struct buffer_head *bh, - const struct gfs2_bitmap *bi); +extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, + struct buffer_head *bh, + const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); +extern int gfs2_fitrim(struct file *filp, void __user *argp); #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 4553ce515f62..f3faf72fa7ae 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1417,7 +1417,7 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) if (error) goto out; - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1); if (!rgd) { gfs2_consist_inode(ip); error = -EIO; diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index e9636591b5d5..2e5ba425cae7 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -251,7 +251,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, if (!blks) return 0; - rgd = gfs2_blk2rgrpd(sdp, bn); + rgd = gfs2_blk2rgrpd(sdp, bn, 1); if (!rgd) { gfs2_consist_inode(ip); return -EIO; @@ -1439,7 +1439,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) struct gfs2_holder gh; int error; - rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr); + rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr, 1); if (!rgd) { gfs2_consist_inode(ip); return -EIO; diff --git a/include/linux/gfs2_ondisk.h b/include/linux/gfs2_ondisk.h index b148087f49a6..fa98bdb073b9 100644 --- a/include/linux/gfs2_ondisk.h +++ b/include/linux/gfs2_ondisk.h @@ -168,6 +168,7 @@ struct gfs2_rindex { #define GFS2_RGF_METAONLY 0x00000002 #define GFS2_RGF_DATAONLY 0x00000004 #define GFS2_RGF_NOALLOC 0x00000008 +#define GFS2_RGF_TRIMMED 0x00000010 struct gfs2_rgrp { struct gfs2_meta_header rg_header; -- cgit v1.2.3-59-g8ed1b From 4a36d08d0d1cba0581d1656739102ce936f26557 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 14 Feb 2012 14:49:57 -0500 Subject: GFS2: Sort the ordered write list This patch sorts the ordered write list for GFS2 writes. This increases the throughput for simultaneous writes. For example, if you have ten processes, all doing: dd if=/dev/zero of=/mnt/gfs2/fileX on different files, the throughput will be much better. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/log.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 4d31379265cb..b8fe7b739c27 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -566,6 +567,20 @@ static void log_flush_commit(struct gfs2_sbd *sdp) log_write_header(sdp, 0, 0); } +int bd_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct gfs2_bufdata *bda, *bdb; + + bda = list_entry(a, struct gfs2_bufdata, bd_le.le_list); + bdb = list_entry(b, struct gfs2_bufdata, bd_le.le_list); + + if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr) + return -1; + if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr) + return 1; + return 0; +} + static void gfs2_ordered_write(struct gfs2_sbd *sdp) { struct gfs2_bufdata *bd; @@ -573,6 +588,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) LIST_HEAD(written); gfs2_log_lock(sdp); + list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp); while (!list_empty(&sdp->sd_log_le_ordered)) { bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_le.le_list); list_move(&bd->bd_le.le_list, &written); -- cgit v1.2.3-59-g8ed1b From 08728f2d8b0ebf01618d3d63e69966f7d43859b9 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 21 Feb 2012 11:14:00 +0000 Subject: GFS2: Make bd_cmp() static Add missing static to bd_cmp() Signed-off-by: Steven Whitehouse --- fs/gfs2/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index b8fe7b739c27..2b9f0d9b1b28 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -567,7 +567,7 @@ static void log_flush_commit(struct gfs2_sbd *sdp) log_write_header(sdp, 0, 0); } -int bd_cmp(void *priv, struct list_head *a, struct list_head *b) +static int bd_cmp(void *priv, struct list_head *a, struct list_head *b) { struct gfs2_bufdata *bda, *bdb; -- cgit v1.2.3-59-g8ed1b From a08fd280b58836c910a4af10eee2066e358d16db Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 29 Feb 2012 15:15:14 -0500 Subject: GFS2: Unlock rindex mutex on glock error This patch fixes an error path in function gfs2_rindex_update that leaves the rindex mutex held. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 1446b4e0ac73..e09370eec590 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -699,13 +699,14 @@ int gfs2_rindex_update(struct gfs2_sbd *sdp) if (!gfs2_glock_is_locked_by_me(gl)) { error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); if (error) - return error; + goto out_unlock; unlock_required = 1; } if (!sdp->sd_rindex_uptodate) error = gfs2_ri_update(ip); if (unlock_required) gfs2_glock_dq_uninit(&ri_gh); +out_unlock: mutex_unlock(&sdp->sd_rindex_mutex); } -- cgit v1.2.3-59-g8ed1b From 6aad1c3d3eba3db38b3a1200e2b02ff3af501c5a Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 5 Mar 2012 09:20:59 -0500 Subject: GFS2: Eliminate sd_rindex_mutex Over time, we've slowly eliminated the use of sd_rindex_mutex. Up to this point, it was only used in two places: function gfs2_ri_total (which totals the file system size by reading and parsing the rindex file) and function gfs2_rindex_update which updates the rgrps in memory. Both of these functions have the rindex glock to protect them, so the rindex is unnecessary. Since gfs2_grow writes to the rindex via the meta_fs, the mutex is in the wrong order according to the normal rules. This patch eliminates the mutex entirely to avoid the problem. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 - fs/gfs2/ops_fstype.c | 1 - fs/gfs2/rgrp.c | 22 ++++++++++------------ 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 4d546df58ac9..47d0bda5ac2b 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -644,7 +644,6 @@ struct gfs2_sbd { int sd_rindex_uptodate; spinlock_t sd_rindex_spin; - struct mutex sd_rindex_mutex; struct rb_root sd_rindex_tree; unsigned int sd_rgrps; unsigned int sd_max_rg_data; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index a55baa7f3239..ae5e0a40c9b3 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -83,7 +83,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) spin_lock_init(&sdp->sd_statfs_spin); spin_lock_init(&sdp->sd_rindex_spin); - mutex_init(&sdp->sd_rindex_mutex); sdp->sd_rindex_tree.rb_node = NULL; INIT_LIST_HEAD(&sdp->sd_jindex_list); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index e09370eec590..6ff9f17f9ac2 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -540,7 +540,6 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) struct file_ra_state ra_state; int error, rgrps; - mutex_lock(&sdp->sd_rindex_mutex); file_ra_state_init(&ra_state, inode->i_mapping); for (rgrps = 0;; rgrps++) { loff_t pos = rgrps * sizeof(struct gfs2_rindex); @@ -553,11 +552,10 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) break; total_data += be32_to_cpu(((struct gfs2_rindex *)buf)->ri_data); } - mutex_unlock(&sdp->sd_rindex_mutex); return total_data; } -static void rgd_insert(struct gfs2_rgrpd *rgd) +static int rgd_insert(struct gfs2_rgrpd *rgd) { struct gfs2_sbd *sdp = rgd->rd_sbd; struct rb_node **newn = &sdp->sd_rindex_tree.rb_node, *parent = NULL; @@ -573,11 +571,13 @@ static void rgd_insert(struct gfs2_rgrpd *rgd) else if (rgd->rd_addr > cur->rd_addr) newn = &((*newn)->rb_right); else - return; + return -EEXIST; } rb_link_node(&rgd->rd_node, parent, newn); rb_insert_color(&rgd->rd_node, &sdp->sd_rindex_tree); + sdp->sd_rgrps++; + return 0; } /** @@ -631,10 +631,12 @@ static int read_rindex_entry(struct gfs2_inode *ip, if (rgd->rd_data > sdp->sd_max_rg_data) sdp->sd_max_rg_data = rgd->rd_data; spin_lock(&sdp->sd_rindex_spin); - rgd_insert(rgd); - sdp->sd_rgrps++; + error = rgd_insert(rgd); spin_unlock(&sdp->sd_rindex_spin); - return error; + if (!error) + return 0; + + error = 0; /* someone else read in the rgrp; free it and ignore it */ fail: kfree(rgd->rd_bits); @@ -695,22 +697,18 @@ int gfs2_rindex_update(struct gfs2_sbd *sdp) /* Read new copy from disk if we don't have the latest */ if (!sdp->sd_rindex_uptodate) { - mutex_lock(&sdp->sd_rindex_mutex); if (!gfs2_glock_is_locked_by_me(gl)) { error = gfs2_glock_nq_init(gl, LM_ST_SHARED, 0, &ri_gh); if (error) - goto out_unlock; + return error; unlock_required = 1; } if (!sdp->sd_rindex_uptodate) error = gfs2_ri_update(ip); if (unlock_required) gfs2_glock_dq_uninit(&ri_gh); -out_unlock: - mutex_unlock(&sdp->sd_rindex_mutex); } - return error; } -- cgit v1.2.3-59-g8ed1b From 58884c4df005ee5ee854cfcd0385d5a6bf25aa30 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 5 Mar 2012 10:19:35 -0500 Subject: GFS2: make sure rgrps are up to date in func gfs2_blk2rgrpd This patch adds a call to gfs2_rindex_update from function gfs2_blk2rgrpd and removes calls to it that are made redundant by it. The problem is that a gfs2_grow can add rgrps to the rindex, then put those rgrps into use, thus rendering the rindex we read in at mount time incomplete. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 6ff9f17f9ac2..19bde40b4864 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -332,6 +332,9 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact) struct rb_node *n, *next; struct gfs2_rgrpd *cur; + if (gfs2_rindex_update(sdp)) + return NULL; + spin_lock(&sdp->sd_rindex_spin); n = sdp->sd_rindex_tree.rb_node; while (n) { @@ -917,10 +920,6 @@ int gfs2_fitrim(struct file *filp, void __user *argp) if (!blk_queue_discard(q)) return -EOPNOTSUPP; - ret = gfs2_rindex_update(sdp); - if (ret) - return ret; - if (argp == NULL) { r.start = 0; r.len = ULLONG_MAX; @@ -1671,13 +1670,8 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) { struct gfs2_rgrpd *rgd; struct gfs2_holder rgd_gh; - int error; - - error = gfs2_rindex_update(sdp); - if (error) - return error; + int error = -EINVAL; - error = -EINVAL; rgd = gfs2_blk2rgrpd(sdp, no_addr, 1); if (!rgd) goto fail; -- cgit v1.2.3-59-g8ed1b From 35e478f42271673f79066a1ed008c6604621c6fe Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 7 Mar 2012 10:43:02 +0000 Subject: GFS2: Flush pending glock work when evicting an inode This ensures that we will not try to access the inode thats being flushed via the glock after it has been freed. Signed-off-by: Steven Whitehouse --- fs/gfs2/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index f3faf72fa7ae..6172fa77ad59 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1557,6 +1557,7 @@ out: end_writeback(inode); gfs2_dir_hash_inval(ip); ip->i_gl->gl_object = NULL; + flush_delayed_work_sync(&ip->i_gl->gl_work); gfs2_glock_add_to_lru(ip->i_gl); gfs2_glock_put(ip->i_gl); ip->i_gl = NULL; -- cgit v1.2.3-59-g8ed1b From 75ca61c101601a7071d93571920be9697b3fda9b Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 8 Mar 2012 12:10:23 +0000 Subject: GFS2: Remove a __GFP_NOFAIL allocation In order to ensure that we've got enough buffer heads for flushing the journal, the orignal code used __GFP_NOFAIL when performing this allocation. Here we dispense with that in favour of using a mempool. This should improve efficiency in low memory conditions since flushing the journal is a good way to get memory back, we don't want to be spinning, waiting on memory allocations. The buffers which are allocated via this mempool are fairly short lived, so that we'll recycle them pretty quickly. Although there are other memory allocations which occur during the journal flush process, this is the one which can potentially require the most memory, so the most important one to fix. The amount of memory reserved is a fixed amount, and we should not need to scale it when there are a greater number of filesystems in use. Signed-off-by: Steven Whitehouse --- fs/gfs2/lops.c | 5 +++-- fs/gfs2/main.c | 18 ++++++++++++++++++ fs/gfs2/util.c | 1 + fs/gfs2/util.h | 3 +++ 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index fe369bd9e10c..87e6e0d66bb7 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -199,7 +200,7 @@ static void gfs2_fake_write_endio(struct buffer_head *bh, int uptodate) struct gfs2_sbd *sdp = bd->bd_gl->gl_sbd; end_buffer_write_sync(bh, uptodate); - free_buffer_head(bh); + mempool_free(bh, gfs2_bh_pool); unlock_buffer(real_bh); brelse(real_bh); if (atomic_dec_and_test(&sdp->sd_log_in_flight)) @@ -220,7 +221,7 @@ static struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp, u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head); struct buffer_head *bh; - bh = alloc_buffer_head(GFP_NOFS | __GFP_NOFAIL); + bh = mempool_alloc(gfs2_bh_pool, GFP_NOFS); atomic_set(&bh->b_count, 1); bh->b_state = (1 << BH_Mapped) | (1 << BH_Uptodate) | (1 << BH_Lock); set_bh_page(bh, real->b_page, bh_offset(real)); diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index a8d9bcd0e19c..754426b1e52c 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -69,6 +70,16 @@ static void gfs2_init_gl_aspace_once(void *foo) address_space_init_once(mapping); } +static void *gfs2_bh_alloc(gfp_t mask, void *data) +{ + return alloc_buffer_head(mask); +} + +static void gfs2_bh_free(void *ptr, void *data) +{ + return free_buffer_head(ptr); +} + /** * init_gfs2_fs - Register GFS2 as a filesystem * @@ -151,6 +162,10 @@ static int __init init_gfs2_fs(void) gfs2_control_wq = alloc_workqueue("gfs2_control", WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0); if (!gfs2_control_wq) + goto fail_recovery; + + gfs2_bh_pool = mempool_create(1024, gfs2_bh_alloc, gfs2_bh_free, NULL); + if (!gfs2_bh_pool) goto fail_control; gfs2_register_debugfs(); @@ -160,6 +175,8 @@ static int __init init_gfs2_fs(void) return 0; fail_control: + destroy_workqueue(gfs2_control_wq); +fail_recovery: destroy_workqueue(gfs_recovery_wq); fail_wq: unregister_filesystem(&gfs2meta_fs_type); @@ -208,6 +225,7 @@ static void __exit exit_gfs2_fs(void) rcu_barrier(); + mempool_destroy(gfs2_bh_pool); kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); kmem_cache_destroy(gfs2_bufdata_cachep); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 53511291fe36..9e7765e8e7b0 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -25,6 +25,7 @@ struct kmem_cache *gfs2_inode_cachep __read_mostly; struct kmem_cache *gfs2_bufdata_cachep __read_mostly; struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; struct kmem_cache *gfs2_quotad_cachep __read_mostly; +mempool_t *gfs2_bh_pool __read_mostly; void gfs2_assert_i(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index b432e04600de..a4ce76c67dbb 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -10,6 +10,8 @@ #ifndef __UTIL_DOT_H__ #define __UTIL_DOT_H__ +#include + #include "incore.h" #define fs_printk(level, fs, fmt, arg...) \ @@ -150,6 +152,7 @@ extern struct kmem_cache *gfs2_inode_cachep; extern struct kmem_cache *gfs2_bufdata_cachep; extern struct kmem_cache *gfs2_rgrpd_cachep; extern struct kmem_cache *gfs2_quotad_cachep; +extern mempool_t *gfs2_bh_pool; static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, unsigned int *p) -- cgit v1.2.3-59-g8ed1b From 34cc1781c2ae921107e89f6633cfab7436e355ba Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 9 Mar 2012 10:45:56 +0000 Subject: GFS2: Clean up log flush header writing We already send both a pre and post flush to the block device when writing a journal header. There is no need to wait for the previous I/O specifically when we do this, unless we've turned "barriers" off. As a side effect, this also cleans up the code path for flushing the journal and makes it more readable. Signed-off-by: Steven Whitehouse --- fs/gfs2/log.c | 131 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 66 insertions(+), 65 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 2b9f0d9b1b28..4752eadc7f6e 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -491,66 +491,8 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) sdp->sd_log_tail = new_tail; } -/** - * log_write_header - Get and initialize a journal header buffer - * @sdp: The GFS2 superblock - * - * Returns: the initialized log buffer descriptor - */ -static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) -{ - u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head); - struct buffer_head *bh; - struct gfs2_log_header *lh; - unsigned int tail; - u32 hash; - - bh = sb_getblk(sdp->sd_vfs, blkno); - lock_buffer(bh); - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); - - gfs2_ail1_empty(sdp); - tail = current_tail(sdp); - - lh = (struct gfs2_log_header *)bh->b_data; - memset(lh, 0, sizeof(struct gfs2_log_header)); - lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); - lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); - lh->lh_header.__pad0 = cpu_to_be64(0); - lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); - lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); - lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++); - lh->lh_flags = cpu_to_be32(flags); - lh->lh_tail = cpu_to_be32(tail); - lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head); - hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); - lh->lh_hash = cpu_to_be32(hash); - - bh->b_end_io = end_buffer_write_sync; - get_bh(bh); - if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) - submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); - else - submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); - wait_on_buffer(bh); - - if (!buffer_uptodate(bh)) - gfs2_io_error_bh(sdp, bh); - brelse(bh); - - if (sdp->sd_log_tail != tail) - log_pull_tail(sdp, tail); - else - gfs2_assert_withdraw(sdp, !pull); - - sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); - gfs2_log_incr_head(sdp); -} - -static void log_flush_commit(struct gfs2_sbd *sdp) +static void log_flush_wait(struct gfs2_sbd *sdp) { DEFINE_WAIT(wait); @@ -563,8 +505,6 @@ static void log_flush_commit(struct gfs2_sbd *sdp) } while(atomic_read(&sdp->sd_log_in_flight)); finish_wait(&sdp->sd_log_flush_wait, &wait); } - - log_write_header(sdp, 0, 0); } static int bd_cmp(void *priv, struct list_head *a, struct list_head *b) @@ -633,6 +573,68 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp) gfs2_log_unlock(sdp); } +/** + * log_write_header - Get and initialize a journal header buffer + * @sdp: The GFS2 superblock + * + * Returns: the initialized log buffer descriptor + */ + +static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) +{ + u64 blkno = gfs2_log_bmap(sdp, sdp->sd_log_flush_head); + struct buffer_head *bh; + struct gfs2_log_header *lh; + unsigned int tail; + u32 hash; + + bh = sb_getblk(sdp->sd_vfs, blkno); + lock_buffer(bh); + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + + gfs2_ail1_empty(sdp); + tail = current_tail(sdp); + + lh = (struct gfs2_log_header *)bh->b_data; + memset(lh, 0, sizeof(struct gfs2_log_header)); + lh->lh_header.mh_magic = cpu_to_be32(GFS2_MAGIC); + lh->lh_header.mh_type = cpu_to_be32(GFS2_METATYPE_LH); + lh->lh_header.__pad0 = cpu_to_be64(0); + lh->lh_header.mh_format = cpu_to_be32(GFS2_FORMAT_LH); + lh->lh_header.mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); + lh->lh_sequence = cpu_to_be64(sdp->sd_log_sequence++); + lh->lh_flags = cpu_to_be32(flags); + lh->lh_tail = cpu_to_be32(tail); + lh->lh_blkno = cpu_to_be32(sdp->sd_log_flush_head); + hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); + lh->lh_hash = cpu_to_be32(hash); + + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { + gfs2_ordered_wait(sdp); + log_flush_wait(sdp); + submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh); + } else { + submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); + } + wait_on_buffer(bh); + + if (!buffer_uptodate(bh)) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + + if (sdp->sd_log_tail != tail) + log_pull_tail(sdp, tail); + else + gfs2_assert_withdraw(sdp, !pull); + + sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); + gfs2_log_incr_head(sdp); +} + /** * gfs2_log_flush - flush incore transaction(s) * @sdp: the filesystem @@ -676,11 +678,10 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl) gfs2_ordered_write(sdp); lops_before_commit(sdp); - gfs2_ordered_wait(sdp); - if (sdp->sd_log_head != sdp->sd_log_flush_head) - log_flush_commit(sdp); - else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ + if (sdp->sd_log_head != sdp->sd_log_flush_head) { + log_write_header(sdp, 0, 0); + } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ gfs2_log_lock(sdp); atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ trace_gfs2_log_blocks(sdp, -1); -- cgit v1.2.3-59-g8ed1b From 58a7d5fb8e31279b992db4027e44b053a84b7344 Mon Sep 17 00:00:00 2001 From: Benjamin Marzinski Date: Thu, 8 Mar 2012 13:16:32 -0600 Subject: GFS2: call gfs2_write_alloc_required for each chunk gfs2_fallocate was calling gfs2_write_alloc_required() once at the start of the function. This caused problems since gfs2_write_alloc_required used a long unsigned int for the len, but gfs2_fallocate could allocate a much larger amount. This patch will move the call into the loop where the chunks are actually allocated and zeroed out. This will keep the allocation size under the limit, and also allow gfs2_fallocate to quickly skip over sections of the file that are already completely allocated. fallcate_chunk was also not correctly setting the file size. It was using the len veriable to find the last block written to, but by the time it was setting the size, the len variable had already been decremented to 0. Signed-off-by: Benjamin Marzinski Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 310f2fb6f7ea..76834587a8a4 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -676,6 +676,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, struct gfs2_inode *ip = GFS2_I(inode); struct buffer_head *dibh; int error; + loff_t size = len; unsigned int nr_blks; sector_t lblock = offset >> inode->i_blkbits; @@ -709,8 +710,8 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, goto out; } } - if (offset + len > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) - i_size_write(inode, offset + len); + if (offset + size > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) + i_size_write(inode, offset + size); mark_inode_dirty(inode); @@ -779,12 +780,14 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, if (unlikely(error)) goto out_uninit; - if (!gfs2_write_alloc_required(ip, offset, len)) - goto out_unlock; - while (len > 0) { if (len < bytes) bytes = len; + if (!gfs2_write_alloc_required(ip, offset, bytes)) { + len -= bytes; + offset += bytes; + continue; + } qa = gfs2_qadata_get(ip); if (!qa) { error = -ENOMEM; -- cgit v1.2.3-59-g8ed1b From 220cca2a4f5867db595135e0450381032eb54902 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Mon, 19 Mar 2012 15:25:50 -0400 Subject: GFS2: Change truncate page allocation to be GFP_NOFS This patch changes the page allocation in gfs2_block_truncate_page and two others to GFP_NOFS to avoid deadlock in low-memory conditions. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 4 ++-- fs/gfs2/quota.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 14a704015970..197c5c47e577 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -60,7 +60,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, int release = 0; if (!page || page->index) { - page = grab_cache_page(inode->i_mapping, 0); + page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); if (!page) return -ENOMEM; release = 1; @@ -930,7 +930,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) struct page *page; int err; - page = grab_cache_page(mapping, index); + page = find_or_create_page(mapping, index, GFP_NOFS); if (!page) return 0; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index a45b21b03915..4856c66640bf 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -681,7 +681,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, ptr = qp; nbytes = sizeof(struct gfs2_quota); get_a_page: - page = grab_cache_page(mapping, index); + page = find_or_create_page(mapping, index, GFP_NOFS); if (!page) return -ENOMEM; -- cgit v1.2.3-59-g8ed1b