From dfe5b9ad83a63180f358b27d1018649a27b394a9 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 6 Dec 2013 11:52:34 +0000
Subject: GFS2: don't hold s_umount over blkdev_put

This is a GFS2 version of Tejun's patch:
4f331f01b9c43bf001d3ffee578a97a1e0633eac
vfs: don't hold s_umount over close_bdev_exclusive() call

In this case its blkdev_put itself that is the issue and this
patch uses the same solution of dropping and retaking s_umount.

Reported-by: Tejun Heo <tj@kernel.org>
Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 82303b474958..52fa88314f5c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1366,8 +1366,18 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 	if (IS_ERR(s))
 		goto error_bdev;
 
-	if (s->s_root)
+	if (s->s_root) {
+		/*
+		 * s_umount nests inside bd_mutex during
+		 * __invalidate_device().  blkdev_put() acquires
+		 * bd_mutex and can't be called under s_umount.  Drop
+		 * s_umount temporarily.  This is safe as we're
+		 * holding an active reference.
+		 */
+		up_write(&s->s_umount);
 		blkdev_put(bdev, mode);
+		down_write(&s->s_umount);
+	}
 
 	memset(&args, 0, sizeof(args));
 	args.ar_quota = GFS2_QUOTA_DEFAULT;
-- 
cgit v1.2.3-59-g8ed1b


From 9290a9a7c0bcf5400e8dbfbf9707fa68ea3fb338 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 10 Dec 2013 12:06:35 -0500
Subject: GFS2: Fix use-after-free race when calling gfs2_remove_from_ail

Function gfs2_remove_from_ail drops the reference on the bh via
brelse. This patch fixes a race condition whereby bh is deferenced
after the brelse when setting bd->bd_blkno = bh->b_blocknr;
Under certain rare circumstances, bh might be gone or reused,
and bd->bd_blkno is set to whatever that memory happens to be,
which is often 0. Later, in gfs2_trans_add_unrevoke, that bd fails
the test "bd->bd_blkno >= blkno" which causes it to never be freed.
The end result is that the bd is never freed from the bufdata cache,
which results in this error:
slab error in kmem_cache_destroy(): cache `gfs2_bufdata': Can't free all objects

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/log.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 610613fb65b5..9dcb9777a5f8 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -551,10 +551,10 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 	struct buffer_head *bh = bd->bd_bh;
 	struct gfs2_glock *gl = bd->bd_gl;
 
-	gfs2_remove_from_ail(bd);
-	bd->bd_bh = NULL;
 	bh->b_private = NULL;
 	bd->bd_blkno = bh->b_blocknr;
+	gfs2_remove_from_ail(bd); /* drops ref on bh */
+	bd->bd_bh = NULL;
 	bd->bd_ops = &gfs2_revoke_lops;
 	sdp->sd_log_num_revoke++;
 	atomic_inc(&gl->gl_revokes);
-- 
cgit v1.2.3-59-g8ed1b


From 502be2a32f09f388e4ff34ef2e3ebcabbbb261da Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Fri, 13 Dec 2013 08:31:06 -0500
Subject: GFS2: Fix slab memory leak in gfs2_bufdata

This patch fixes a slab memory leak that sometimes can occur
for files with a very short lifespan. The problem occurs when
a dinode is deleted before it has gotten to the journal properly.
In the leak scenario, the bd object is pinned for journal
committment (queued to the metadata buffers queue: sd_log_le_buf)
but is subsequently unpinned and dequeued before it finds its way
to the ail or the revoke queue. In this rare circumstance, the bd
object needs to be freed from slab memory, or it is forgotten.
We have to be very careful how we do it, though, because
multiple processes can call gfs2_remove_from_journal. In order to
avoid double-frees, only the process that does the unpinning is
allowed to free the bd.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/meta_io.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 932415050540..52f177be3bf8 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -258,6 +258,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
 	struct address_space *mapping = bh->b_page->mapping;
 	struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
 	struct gfs2_bufdata *bd = bh->b_private;
+	int was_pinned = 0;
 
 	if (test_clear_buffer_pinned(bh)) {
 		trace_gfs2_pin(bd, 0);
@@ -273,12 +274,16 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
 			tr->tr_num_databuf_rm++;
 		}
 		tr->tr_touched = 1;
+		was_pinned = 1;
 		brelse(bh);
 	}
 	if (bd) {
 		spin_lock(&sdp->sd_ail_lock);
 		if (bd->bd_tr) {
 			gfs2_trans_add_revoke(sdp, bd);
+		} else if (was_pinned) {
+			bh->b_private = NULL;
+			kmem_cache_free(gfs2_bufdata_cachep, bd);
 		}
 		spin_unlock(&sdp->sd_ail_lock);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From dfd11184d894cd0a92397b25cac18831a1a6a5bc Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 18 Dec 2013 14:14:52 +0000
Subject: GFS2: Fix incorrect invalidation for DIO/buffered I/O

In patch 209806aba9d540dde3db0a5ce72307f85f33468f we allowed
local deferred locks to be granted against a cached exclusive
lock. That opened up a corner case which this patch now
fixes.

The solution to the problem is to check whether we have cached
pages each time we do direct I/O and if so to unmap, flush
and invalidate those pages. Since the glock state machine
normally does that for us, mostly the code will be a no-op.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/aops.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index b7fc035a6943..73f3e4ee4037 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -986,6 +986,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	struct address_space *mapping = inode->i_mapping;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
 	int rv;
@@ -1006,6 +1007,35 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 	if (rv != 1)
 		goto out; /* dio not valid, fall back to buffered i/o */
 
+	/*
+	 * Now since we are holding a deferred (CW) lock at this point, you
+	 * might be wondering why this is ever needed. There is a case however
+	 * where we've granted a deferred local lock against a cached exclusive
+	 * glock. That is ok provided all granted local locks are deferred, but
+	 * it also means that it is possible to encounter pages which are
+	 * cached and possibly also mapped. So here we check for that and sort
+	 * them out ahead of the dio. The glock state machine will take care of
+	 * everything else.
+	 *
+	 * If in fact the cached glock state (gl->gl_state) is deferred (CW) in
+	 * the first place, mapping->nr_pages will always be zero.
+	 */
+	if (mapping->nrpages) {
+		loff_t lstart = offset & (PAGE_CACHE_SIZE - 1);
+		loff_t len = iov_length(iov, nr_segs);
+		loff_t end = PAGE_ALIGN(offset + len) - 1;
+
+		rv = 0;
+		if (len == 0)
+			goto out;
+		if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
+			unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len);
+		rv = filemap_write_and_wait_range(mapping, lstart, end);
+		if (rv)
+			return rv;
+		truncate_inode_pages_range(mapping, lstart, end);
+	}
+
 	rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, gfs2_get_block_direct,
 				  NULL, NULL, 0);
-- 
cgit v1.2.3-59-g8ed1b


From 582d2f7aedfde9e1a01170deb003df510aa778d3 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 19 Dec 2013 11:04:14 +0000
Subject: GFS2: Wait for async DIO in glock state changes

We need to wait for any outstanding DIO to complete in a couple
of situations. Firstly, in case we are changing out of deferred
mode (in inode_go_sync) where GLF_DIRTY will not be set. That
call could be prefixed with a test for gl_state == LM_ST_DEFERRED
but it doesn't seem worth it bearing in mind that the test for
outstanding DIO is very quick anyway, in the usual case that there
is none.

The second case is in inode_go_lock which will catch the cases
where we have a cached EX lock, but where we grant deferred locks
against it so that there is no glock state transistion. We only
need to wait if the state is not deferred, since DIO is valid
anyway in that state.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glops.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index db908f697139..f88dcd925010 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -192,8 +192,11 @@ static void inode_go_sync(struct gfs2_glock *gl)
 
 	if (ip && !S_ISREG(ip->i_inode.i_mode))
 		ip = NULL;
-	if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
-		unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0);
+	if (ip) {
+		if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
+			unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0);
+		inode_dio_wait(&ip->i_inode);
+	}
 	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
 		return;
 
@@ -410,6 +413,9 @@ static int inode_go_lock(struct gfs2_holder *gh)
 			return error;
 	}
 
+	if (gh->gh_state != LM_ST_DEFERRED)
+		inode_dio_wait(&ip->i_inode);
+
 	if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
 	    (gl->gl_state == LM_ST_EXCLUSIVE) &&
 	    (gh->gh_state == LM_ST_EXCLUSIVE)) {
-- 
cgit v1.2.3-59-g8ed1b


From 0b3a2c9968d453d5827e635a6f3d69129f70af66 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Thu, 2 Jan 2014 19:52:20 +0900
Subject: GFS2: Fix unsafe dereference in dump_holder()

GLOCK_BUG_ON() might call this function without RCU read lock. Make sure that
RCU read lock is held when using task_struct returned from pid_task().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/glock.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c8420f7e4db6..6f7a47c05259 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1655,6 +1655,7 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
 	struct task_struct *gh_owner = NULL;
 	char flags_buf[32];
 
+	rcu_read_lock();
 	if (gh->gh_owner_pid)
 		gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
 	gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
@@ -1664,6 +1665,7 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
 		       gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
 		       gh_owner ? gh_owner->comm : "(ended)",
 		       (void *)gh->gh_ip);
+	rcu_read_unlock();
 	return 0;
 }
 
-- 
cgit v1.2.3-59-g8ed1b