From dfe5b9ad83a63180f358b27d1018649a27b394a9 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 6 Dec 2013 11:52:34 +0000 Subject: GFS2: don't hold s_umount over blkdev_put This is a GFS2 version of Tejun's patch: 4f331f01b9c43bf001d3ffee578a97a1e0633eac vfs: don't hold s_umount over close_bdev_exclusive() call In this case its blkdev_put itself that is the issue and this patch uses the same solution of dropping and retaking s_umount. Reported-by: Tejun Heo Reported-by: Al Viro Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_fstype.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 82303b474958..52fa88314f5c 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1366,8 +1366,18 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (IS_ERR(s)) goto error_bdev; - if (s->s_root) + if (s->s_root) { + /* + * s_umount nests inside bd_mutex during + * __invalidate_device(). blkdev_put() acquires + * bd_mutex and can't be called under s_umount. Drop + * s_umount temporarily. This is safe as we're + * holding an active reference. + */ + up_write(&s->s_umount); blkdev_put(bdev, mode); + down_write(&s->s_umount); + } memset(&args, 0, sizeof(args)); args.ar_quota = GFS2_QUOTA_DEFAULT; -- cgit v1.2.3-59-g8ed1b From 9290a9a7c0bcf5400e8dbfbf9707fa68ea3fb338 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 10 Dec 2013 12:06:35 -0500 Subject: GFS2: Fix use-after-free race when calling gfs2_remove_from_ail Function gfs2_remove_from_ail drops the reference on the bh via brelse. This patch fixes a race condition whereby bh is deferenced after the brelse when setting bd->bd_blkno = bh->b_blocknr; Under certain rare circumstances, bh might be gone or reused, and bd->bd_blkno is set to whatever that memory happens to be, which is often 0. Later, in gfs2_trans_add_unrevoke, that bd fails the test "bd->bd_blkno >= blkno" which causes it to never be freed. The end result is that the bd is never freed from the bufdata cache, which results in this error: slab error in kmem_cache_destroy(): cache `gfs2_bufdata': Can't free all objects Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/log.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 610613fb65b5..9dcb9777a5f8 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -551,10 +551,10 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) struct buffer_head *bh = bd->bd_bh; struct gfs2_glock *gl = bd->bd_gl; - gfs2_remove_from_ail(bd); - bd->bd_bh = NULL; bh->b_private = NULL; bd->bd_blkno = bh->b_blocknr; + gfs2_remove_from_ail(bd); /* drops ref on bh */ + bd->bd_bh = NULL; bd->bd_ops = &gfs2_revoke_lops; sdp->sd_log_num_revoke++; atomic_inc(&gl->gl_revokes); -- cgit v1.2.3-59-g8ed1b From 502be2a32f09f388e4ff34ef2e3ebcabbbb261da Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Fri, 13 Dec 2013 08:31:06 -0500 Subject: GFS2: Fix slab memory leak in gfs2_bufdata This patch fixes a slab memory leak that sometimes can occur for files with a very short lifespan. The problem occurs when a dinode is deleted before it has gotten to the journal properly. In the leak scenario, the bd object is pinned for journal committment (queued to the metadata buffers queue: sd_log_le_buf) but is subsequently unpinned and dequeued before it finds its way to the ail or the revoke queue. In this rare circumstance, the bd object needs to be freed from slab memory, or it is forgotten. We have to be very careful how we do it, though, because multiple processes can call gfs2_remove_from_journal. In order to avoid double-frees, only the process that does the unpinning is allowed to free the bd. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/meta_io.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 932415050540..52f177be3bf8 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -258,6 +258,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int struct address_space *mapping = bh->b_page->mapping; struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); struct gfs2_bufdata *bd = bh->b_private; + int was_pinned = 0; if (test_clear_buffer_pinned(bh)) { trace_gfs2_pin(bd, 0); @@ -273,12 +274,16 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int tr->tr_num_databuf_rm++; } tr->tr_touched = 1; + was_pinned = 1; brelse(bh); } if (bd) { spin_lock(&sdp->sd_ail_lock); if (bd->bd_tr) { gfs2_trans_add_revoke(sdp, bd); + } else if (was_pinned) { + bh->b_private = NULL; + kmem_cache_free(gfs2_bufdata_cachep, bd); } spin_unlock(&sdp->sd_ail_lock); } -- cgit v1.2.3-59-g8ed1b From dfd11184d894cd0a92397b25cac18831a1a6a5bc Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 18 Dec 2013 14:14:52 +0000 Subject: GFS2: Fix incorrect invalidation for DIO/buffered I/O In patch 209806aba9d540dde3db0a5ce72307f85f33468f we allowed local deferred locks to be granted against a cached exclusive lock. That opened up a corner case which this patch now fixes. The solution to the problem is to check whether we have cached pages each time we do direct I/O and if so to unmap, flush and invalidate those pages. Since the glock state machine normally does that for us, mostly the code will be a no-op. Signed-off-by: Steven Whitehouse --- fs/gfs2/aops.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index b7fc035a6943..73f3e4ee4037 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -986,6 +986,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct address_space *mapping = inode->i_mapping; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int rv; @@ -1006,6 +1007,35 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, if (rv != 1) goto out; /* dio not valid, fall back to buffered i/o */ + /* + * Now since we are holding a deferred (CW) lock at this point, you + * might be wondering why this is ever needed. There is a case however + * where we've granted a deferred local lock against a cached exclusive + * glock. That is ok provided all granted local locks are deferred, but + * it also means that it is possible to encounter pages which are + * cached and possibly also mapped. So here we check for that and sort + * them out ahead of the dio. The glock state machine will take care of + * everything else. + * + * If in fact the cached glock state (gl->gl_state) is deferred (CW) in + * the first place, mapping->nr_pages will always be zero. + */ + if (mapping->nrpages) { + loff_t lstart = offset & (PAGE_CACHE_SIZE - 1); + loff_t len = iov_length(iov, nr_segs); + loff_t end = PAGE_ALIGN(offset + len) - 1; + + rv = 0; + if (len == 0) + goto out; + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len); + rv = filemap_write_and_wait_range(mapping, lstart, end); + if (rv) + return rv; + truncate_inode_pages_range(mapping, lstart, end); + } + rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, gfs2_get_block_direct, NULL, NULL, 0); -- cgit v1.2.3-59-g8ed1b From 582d2f7aedfde9e1a01170deb003df510aa778d3 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 19 Dec 2013 11:04:14 +0000 Subject: GFS2: Wait for async DIO in glock state changes We need to wait for any outstanding DIO to complete in a couple of situations. Firstly, in case we are changing out of deferred mode (in inode_go_sync) where GLF_DIRTY will not be set. That call could be prefixed with a test for gl_state == LM_ST_DEFERRED but it doesn't seem worth it bearing in mind that the test for outstanding DIO is very quick anyway, in the usual case that there is none. The second case is in inode_go_lock which will catch the cases where we have a cached EX lock, but where we grant deferred locks against it so that there is no glock state transistion. We only need to wait if the state is not deferred, since DIO is valid anyway in that state. Signed-off-by: Steven Whitehouse --- fs/gfs2/glops.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index db908f697139..f88dcd925010 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -192,8 +192,11 @@ static void inode_go_sync(struct gfs2_glock *gl) if (ip && !S_ISREG(ip->i_inode.i_mode)) ip = NULL; - if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) - unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + if (ip) { + if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags)) + unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0); + inode_dio_wait(&ip->i_inode); + } if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return; @@ -410,6 +413,9 @@ static int inode_go_lock(struct gfs2_holder *gh) return error; } + if (gh->gh_state != LM_ST_DEFERRED) + inode_dio_wait(&ip->i_inode); + if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && (gl->gl_state == LM_ST_EXCLUSIVE) && (gh->gh_state == LM_ST_EXCLUSIVE)) { -- cgit v1.2.3-59-g8ed1b From 0b3a2c9968d453d5827e635a6f3d69129f70af66 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 2 Jan 2014 19:52:20 +0900 Subject: GFS2: Fix unsafe dereference in dump_holder() GLOCK_BUG_ON() might call this function without RCU read lock. Make sure that RCU read lock is held when using task_struct returned from pid_task(). Signed-off-by: Tetsuo Handa Signed-off-by: Steven Whitehouse --- fs/gfs2/glock.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c8420f7e4db6..6f7a47c05259 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1655,6 +1655,7 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) struct task_struct *gh_owner = NULL; char flags_buf[32]; + rcu_read_lock(); if (gh->gh_owner_pid) gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID); gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n", @@ -1664,6 +1665,7 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh) gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1, gh_owner ? gh_owner->comm : "(ended)", (void *)gh->gh_ip); + rcu_read_unlock(); return 0; } -- cgit v1.2.3-59-g8ed1b